In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold
import category_encoders as ce
from sklearn import preprocessing

## 3. Encode cat cols

In [2]:
dt_all_encoded = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_cleaned.csv")
cols_cat = pickle.load(open("../../data/Mercedes_Benz_Greener_Manufacturing/data/cols_cat.pkl", "rb"))
IDs_train = pickle.load(open("../../data/Mercedes_Benz_Greener_Manufacturing/data/IDs_train.pkl", "rb"))
IDs_test = pickle.load(open("../../data/Mercedes_Benz_Greener_Manufacturing/data/IDs_test.pkl", "rb"))

In [3]:
dt_train = dt_all_encoded.loc[dt_all_encoded["ID"].isin(IDs_train)]
dt_test = dt_all_encoded.loc[dt_all_encoded["ID"].isin(IDs_test)]

In [4]:
dt_all_encoded.shape

(8418, 338)

### 3.1 One-Hot Encoding

In [5]:
# dt_cat_onehot = pd.get_dummies(dt_all_encoded[cols_cat])
# dict_ohe = {x: "Encode_ohe_" + x for x in dt_cat_onehot.columns.values}
# dt_cat_onehot = dt_cat_onehot.rename(columns = dict_ohe)

### 3.2 TargetMean Encoding

In [6]:
def getTargetMean(dt_train, dt_test, col, k = 3, random_state = 888):
    if k == 1:
        # targetMean
        dt_targetMean = pd.DataFrame({col: dt_train.groupby([col])["y"].mean().index
                                      , "Encode_TargetMean_" + col: dt_train.groupby([col])["y"].mean()})
        # merge with test
        dt_train = pd.merge(dt_train, dt_targetMean, on = col, how = "left")
        dt_test = pd.merge(dt_test, dt_targetMean, on = col, how = "left")
        dt_test = dt_test.fillna(np.mean(dt_train.y))
        
    else:
        X_train_fold = pd.DataFrame()
        X_test_fold = pd.DataFrame()

        skf = StratifiedKFold(n_splits = k, random_state = random_state)
    
        for i, (ind_in, ind_out) in enumerate(skf.split(dt_train, dt_train[col].values)):
            X_in, X_out = dt_train.iloc[ind_in], dt_train.iloc[ind_out]
            # targetMean in
            dt_targetMean_fold = pd.DataFrame({col: X_in.groupby([col])["y"].mean().index
                                              , "Encode_TargetMean_" + col: X_in.groupby([col])["y"].mean()})
            # merge targetMean out
            X_out_fold = pd.merge(X_out, dt_targetMean_fold, on = col, how = "left")
            X_out_fold = X_out_fold.fillna(np.mean(X_in.y))

            # concat X_out_fold
            X_train_fold = pd.concat([X_train_fold, X_out_fold])

            # merge with test
            dt_targetMean_fold = dt_targetMean_fold.rename(columns = {"Encode_TargetMean_" + col: "Encode_TargetMean_fold_" + col + "_" + str(i)})
            if i == 0:
                X_test_fold = pd.merge(dt_test, dt_targetMean_fold, on = col, how = "left")
            else:
                X_test_fold = pd.merge(X_test_fold, dt_targetMean_fold, on = col, how = "left")

            # mean for test
            cols_encode_fold = X_test_fold.filter(regex = "Encode_TargetMean_fold_").columns.values
            X_test_fold["Encode_TargetMean_" + col] = X_test_fold[cols_encode_fold].mean(axis = 1)
            X_test_fold = X_test_fold.drop(cols_encode_fold, axis = 1)
            X_test_fold = X_test_fold.fillna(np.mean(X_in.y))
    
    return X_train_fold, X_test_fold 


In [7]:
for col in cols_cat:
    dt_train, dt_test = getTargetMean(dt_train, dt_test, col, 5)



In [8]:
dt_all_encoded = pd.concat([dt_train, dt_test])

In [9]:
dt_all_encoded.shape

(8418, 346)

### 3.3 Frequency Encoding

In [10]:
def getFrequency(dt, cols):
    # calc the cols freq
    dt_cat_freq = pd.DataFrame()
    for col in cols:
        dt_col_freq_temp = pd.DataFrame({"Col": col
                                         , "Value": dt[col].value_counts().index.values
                                         , "Freq": dt[col].value_counts().values})

        dt_cat_freq = pd.concat([dt_cat_freq, dt_col_freq_temp])

    # merge to origin
    dt_cat_cols = dt[cols + ["ID"]]
    for col in cols:
        dt_cat_cols = pd.merge(dt_cat_cols, dt_cat_freq.loc[dt_cat_freq["Col"] == col]
                           , how = "left", left_on = col, right_on = "Value")
        dt_cat_cols = dt_cat_cols.drop(["Value", "Col"], axis = 1)
        dt_cat_cols = dt_cat_cols.rename(columns = {"Freq": "Encode_Freq_" + col})

    dt_cat_cols = dt_cat_cols.filter(regex = "Encode_Freq_|ID")
    
    return dt_cat_cols

In [11]:
# dt_cat_freq = getFrequency(dt_all_encoded, cols_cat)
# dt_all_encoded = pd.merge(dt_all_encoded, dt_cat_freq, on = "ID", how = "left")

In [12]:
dt_all_encoded.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X383,X384,Encode_TargetMean_X0,Encode_TargetMean_X1,Encode_TargetMean_X2,Encode_TargetMean_X3,Encode_TargetMean_X4,Encode_TargetMean_X5,Encode_TargetMean_X6,Encode_TargetMean_X8
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,98.06625,101.995276,101.4675,102.819176,100.450401,100.701997,101.179422,97.169308
1,6,88.53,k,t,av,e,d,y,l,o,...,0,0,98.06625,92.463333,97.836667,99.137846,100.450401,100.701997,98.317408,97.169308
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,78.079643,94.537317,82.823119,101.850283,100.450401,80.62,101.179422,99.050595
3,13,78.02,az,v,n,f,d,h,d,n,...,0,0,78.079643,101.995276,82.823119,96.019651,100.450401,100.701997,101.57632,102.15228
4,18,92.93,t,b,e,c,d,g,h,s,...,0,0,93.426311,99.698816,96.98375,101.850283,100.450401,100.701997,102.105658,98.99201


In [13]:
dt_all_encoded.shape

(8418, 346)

### 3.4 Binary

In [14]:
encode_binary = ce.BinaryEncoder(cols_cat)
dt_cat_binary = encode_binary.fit_transform(dt_all_encoded[cols_cat])
dt_cat_binary.columns = "Encode_Binary_" + dt_cat_binary.columns

  X[col] = X[col].astype(int).reshape(-1, )
  X[switch.get('col')] = X[switch.get('col')].astype(int).reshape(-1, )


In [15]:
# dt_all_encoded = dt_all_encoded.join(dt_cat_binary)

In [16]:
dt_all_encoded.shape

(8418, 346)

### 3.5 Others

In [17]:
def encodeOthers(dt, cols_cat, method):
    if method == "Backward":
        encode = ce.BackwardDifferenceEncoder(cols_cat)
    elif method == "Polynomial":
        encode = ce.PolynomialEncoder(cols_cat)
    elif method == "Helmert":
        encode = ce.HelmertEncoder(cols_cat)
    elif method == "Sum":
        encode = ce.SumEncoder(cols_cat)
    
    encode.fit(dt[cols_cat])
    dt_cat_other = encode.transform(dt[cols_cat])
    dt_cat_other.columns = "Encode_" + method + "_" + dt_cat_other.columns
    
    return dt_cat_other

In [18]:
# # methods = ["Backward", "Polynomial", "Helmert", "Sum"]
# methods = ["Backward"]
# for method in methods:
#     dt_all_encoded = dt_all_encoded.join(encodeOthers(dt_all_encoded, cols_cat, method))

In [19]:
dt_all_encoded.shape

(8418, 346)

### 3.6 Label

In [20]:
for c in cols_cat:
    x = list(set(dt_all_encoded[c].values)) 
    x.sort()
    x.sort(key = len)
    dt_labelEncode_c = pd.DataFrame({"Encode_Label_" + c: [i for i in range(1, (len(x) + 1))]
                                     , c: x})

    dt_all_encoded = pd.merge(dt_all_encoded, dt_labelEncode_c, on = c)

In [21]:
dt_all_encoded.shape

(8418, 354)

### 3.7 Save dt_all_encoded

In [22]:
dt_all_encoded.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv", index = False)