In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold
import category_encoders as ce
from sklearn import preprocessing

## 3. Encode cat cols

In [2]:
dt_all_encoded = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_cleaned.csv")
IDs_train = pickle.load(open("../../data/Mercedes_Benz_Greener_Manufacturing/data/IDs_train.pkl", "rb"))
IDs_test = pickle.load(open("../../data/Mercedes_Benz_Greener_Manufacturing/data/IDs_test.pkl", "rb"))

In [3]:
# cols_cat
cols_cat = dt_all_encoded.select_dtypes(include = ['object']).columns.values

In [4]:
dt_all_encoded.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X376,X377,X378,X379,X380,X383,X384,IsDupRow_All,IsDupRow_Cat,IsDupRow_Int
0,0,130.81,k,v,at,a,d,u,j,o,...,0,1,0,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,0,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,0,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


### 3.1 One-Hot Encoding

In [5]:
# dt_cat_onehot = pd.get_dummies(dt_all_encoded[cols_cat])
# dict_ohe = {x: "Encode_ohe_" + x for x in dt_cat_onehot.columns.values}
# dt_cat_onehot = dt_cat_onehot.rename(columns = dict_ohe)

### 3.2 TargetMean Encoding

In [6]:
def getTargetMean(dt, IDs_train, IDs_test, col, k = 3, random_state = 888):
    dt_col_y_train = dt[[col, "y", "ID"]].loc[dt["ID"].isin(IDs_train)]
    dt_col_y_test = dt[[col, "y", "ID"]].loc[dt["ID"].isin(IDs_test)]
    
    if k == 1:
        # targetMean
        dt_targetMean = pd.DataFrame({col: dt_col_y_train.groupby([col])["y"].mean().index
                                      , "Encode_TargetMean_" + col: dt_col_y_train.groupby([col])["y"].mean()})
        # merge with test
        dt_col_y_train = pd.merge(dt_col_y_train, dt_targetMean, on = col, how = "left")
        dt_col_y_test = pd.merge(dt_col_y_test, dt_targetMean, on = col, how = "left")
        dt_col_y_test = dt_col_y_test.fillna(np.mean(dt_col_y_train.y))
        
        # remove col and y
        dt_col_y_train = dt_col_y_train.drop([col, "y"], axis = 1)
        dt_col_y_test = dt_col_y_test.drop([col, "y"], axis = 1)
        
    else:
        X_train_fold = pd.DataFrame()
        X_test_fold = pd.DataFrame()

        skf = StratifiedKFold(n_splits = k, random_state = random_state)
    
        for i, (ind_in, ind_out) in enumerate(skf.split(dt_col_y_train, dt_col_y_train[col].values)):
            X_in, X_out = dt_col_y_train.iloc[ind_in], dt_col_y_train.iloc[ind_out]
            # targetMean in
            dt_targetMean_fold = pd.DataFrame({col: X_in.groupby([col])["y"].mean().index
                                              , "Encode_TargetMean_" + col: X_in.groupby([col])["y"].mean()})
            # merge targetMean out
            X_out_fold = pd.merge(X_out, dt_targetMean_fold, on = col, how = "left")
            X_out_fold = X_out_fold.fillna(np.mean(X_in.y))

            # concat X_out_fold
            X_train_fold = pd.concat([X_train_fold, X_out_fold])

            # merge with test
            dt_targetMean_fold = dt_targetMean_fold.rename(columns = {"Encode_TargetMean_" + col: "Encode_TargetMean_" + col + "_" + str(i)})
            if i == 0:
                X_test_fold = pd.merge(dt_col_y_test, dt_targetMean_fold, on = col, how = "left")
            else:
                X_test_fold = pd.merge(X_test_fold, dt_targetMean_fold, on = col, how = "left")

            # mean for test
            cols_encode_fold = X_test_fold.filter(regex = "Encode_TargetMean_").columns.values
            X_test_fold["Encode_TargetMean_" + col] = X_test_fold[cols_encode_fold].mean(axis = 1)
            X_test_fold = X_test_fold.drop(cols_encode_fold, axis = 1)
            X_test_fold = X_test_fold.fillna(np.mean(X_in.y))

        # remove col and y
        dt_col_y_train = X_train_fold.drop([col, "y"], axis = 1)
        dt_col_y_test = X_test_fold.drop([col, "y"], axis = 1)
        
    dt_col_y = pd.concat([dt_col_y_train, dt_col_y_test])
    
    return dt_col_y


In [7]:
for col in cols_cat:
    dt_targetMean = getTargetMean(dt_all_encoded, IDs_train, IDs_test, col, 5)
    dt_all_encoded = pd.merge(dt_all_encoded, dt_targetMean, on = ["ID"], how = "left")



In [8]:
dt_all_encoded.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,IsDupRow_Cat,IsDupRow_Int,Encode_TargetMean_X0,Encode_TargetMean_X1,Encode_TargetMean_X2,Encode_TargetMean_X3,Encode_TargetMean_X4,Encode_TargetMean_X5,Encode_TargetMean_X6,Encode_TargetMean_X8
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,98.06625,101.901196,101.4675,102.936733,100.547417,100.72217,101.442696,97.773462
1,6,88.53,k,t,av,e,d,y,l,o,...,0,0,98.06625,92.0175,97.836667,99.011462,100.547417,100.72217,98.52767,97.773462
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,78.079643,95.146829,82.823119,102.078158,100.547417,80.62,101.442696,98.000357
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,78.079643,92.0175,82.823119,95.896628,100.547417,76.26,98.52767,105.632444
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,78.079643,101.901196,82.823119,95.896628,100.547417,100.72217,101.31606,102.248031


In [9]:
dt_all_encoded.shape

(8418, 349)

### 3.3 Frequency Encoding

In [10]:
def getFrequency(dt, cols):
    # calc the cols freq
    dt_cat_freq = pd.DataFrame()
    for col in cols:
        dt_col_freq_temp = pd.DataFrame({"Col": col
                                         , "Value": dt[col].value_counts().index.values
                                         , "Freq": dt[col].value_counts().values})

        dt_cat_freq = pd.concat([dt_cat_freq, dt_col_freq_temp])

    # merge to origin
    dt_cat_cols = dt[cols_cat.tolist() + ["ID"]]
    for col in cols:
        dt_cat_cols = pd.merge(dt_cat_cols, dt_cat_freq.loc[dt_cat_freq["Col"] == col]
                           , how = "left", left_on = col, right_on = "Value")
        dt_cat_cols = dt_cat_cols.drop(["Value", "Col"], axis = 1)
        dt_cat_cols = dt_cat_cols.rename(columns = {"Freq": "Encode_Freq_" + col})

    dt_cat_cols = dt_cat_cols.filter(regex = "Encode_Freq_|ID")
    
    return dt_cat_cols

In [11]:
dt_cat_freq = getFrequency(dt_all_encoded, cols_cat)
dt_all_encoded = pd.merge(dt_all_encoded, dt_cat_freq, on = "ID", how = "left")

In [12]:
dt_all_encoded.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,Encode_TargetMean_X6,Encode_TargetMean_X8,Encode_Freq_X0,Encode_Freq_X1,Encode_Freq_X2,Encode_Freq_X3,Encode_Freq_X4,Encode_Freq_X5,Encode_Freq_X6,Encode_Freq_X8
0,0,130.81,k,v,at,a,d,u,j,o,...,101.442696,97.773462,23,844,9,916,8408,1,2041,332
1,6,88.53,k,t,av,e,d,y,l,o,...,98.52767,97.773462,23,49,5,321,8408,2,951,332
2,7,76.26,az,w,n,c,d,x,j,x,...,101.442696,98.000357,336,102,250,3842,8408,4,2041,215
3,9,80.62,az,t,n,f,d,x,l,e,...,98.52767,105.632444,336,49,250,2159,8408,4,951,499
4,13,78.02,az,v,n,f,d,h,d,n,...,101.31606,102.248031,336,844,250,2159,8408,3,1214,478


In [13]:
dt_all_encoded.shape

(8418, 357)

### 3.4 Binary

In [14]:
encode_binary = ce.BinaryEncoder(cols_cat)
dt_cat_binary = encode_binary.fit_transform(dt_all_encoded[cols_cat])
dt_cat_binary.columns = "Encode_Binary_" + dt_cat_binary.columns

  X[col] = X[col].astype(int).reshape(-1, )
  X[switch.get('col')] = X[switch.get('col')].astype(int).reshape(-1, )


In [15]:
dt_all_encoded = dt_all_encoded.join(dt_cat_binary)

In [16]:
dt_all_encoded.shape

(8418, 394)

### 3.5 Others

In [17]:
def encodeOthers(dt, cols_cat, method):
    if method == "Backward":
        encode = ce.BackwardDifferenceEncoder(cols_cat)
    elif method == "Polynomial":
        encode = ce.PolynomialEncoder(cols_cat)
    elif method == "Helmert":
        encode = ce.HelmertEncoder(cols_cat)
    elif method == "Sum":
        encode = ce.SumEncoder(cols_cat)
    
    encode.fit(dt[cols_cat].loc[dt_all_encoded["ID"].isin(IDs_train)], dt.loc[dt["ID"].isin(IDs_train)].y)
    dt_cat_other_train = encode.transform(dt[cols_cat].loc[dt["ID"].isin(IDs_train)])
    dt_cat_other_test = encode.transform(dt[cols_cat].loc[dt["ID"].isin(IDs_test)])
    dt_cat_other_train.columns = "Encode_" + method + "_" + dt_cat_other_train.columns
    dt_cat_other_test.columns = "Encode_" + method + "_" + dt_cat_other_test.columns
    
    return dt_cat_other_train, dt_cat_other_test

### 3.6 Label

In [18]:
dt_cat_train_y = dt_all_encoded[cols_cat].join(dt_all_encoded.y)

In [19]:
for c in cols_cat:
    x = list(set(list(dt_cat_train_y[c].values)))
    x.sort()
    x.sort(key = len)
    dt_labelEncode_c = pd.DataFrame({"Encode_Label_" + c: [i for i in range(1, (len(x) + 1))]
                                     , c: x})

    dt_all_encoded = pd.merge(dt_all_encoded, dt_labelEncode_c, on = c)

In [20]:
dt_all_encoded.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,Encode_Binary_X8_3,Encode_Binary_X8_4,Encode_Label_X0,Encode_Label_X1,Encode_Label_X2,Encode_Label_X3,Encode_Label_X4,Encode_Label_X5,Encode_Label_X6,Encode_Label_X8
0,0,130.81,k,v,at,a,d,u,j,o,...,1,1,11,22,45,1,4,20,10,15
1,5761,115.07,o,l,ae,f,d,p,j,o,...,1,1,15,12,30,6,4,15,10,15
2,5883,77.38,bc,v,ac,f,d,p,j,o,...,1,1,53,22,28,6,4,15,10,15
3,2932,108.06,ak,v,ak,f,d,ac,j,o,...,1,1,36,22,36,6,4,28,10,15
4,2849,0.0,s,aa,ay,g,d,ac,j,o,...,1,1,19,26,50,7,4,28,10,15


In [21]:
dt_all_encoded.shape

(8418, 402)

### 3.7 Save dt_all_encoded

In [22]:
dt_all_encoded.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv", index = False)