In [41]:
import pandas as pd
import pickle
import numpy as np
from scipy import stats
from sklearn import preprocessing, decomposition, manifold

## 4. Feature Engineering

In [2]:
# Load the cleaned dataset; later cells take the categorical/int columns and the
# ID, y, IsTrainTest columns from it.
path_dt_all_cleaned = "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_cleaned.csv"
dt_all_cleaned = pd.read_csv(path_dt_all_cleaned)
dt_all_cleaned.shape

(8418, 342)

In [3]:
# Load the encoded dataset (dt_all_encoded.csv) as the starting point of dt_all_eng;
# the engineered features built below are appended to this frame.
path_dt_all_encoded = "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv"
dt_all_eng = pd.read_csv(path_dt_all_encoded)
dt_all_eng.shape

(8418, 599)

In [4]:
# read cols_raw
# The context manager closes the file handle, which the bare open(...) call never did.
# NOTE: pickle.load can execute arbitrary code - only load pickles written by this project's own pipeline.
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/cols_raw.pkl", "rb") as file_cols_raw:
    cols_raw = pickle.load(file_cols_raw)
len(cols_raw)

328

### 4.1 Outlier marker

In [5]:
def getOutlierMarker(dt, index_outlier = 883):
    """Count, per row, how many columns share their value with a reference row.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame whose columns are compared against the reference row.
    index_outlier : int, default 883
        Index label of the reference ("outlier") row.

    Returns
    -------
    numpy.ndarray
        Float array of length ``dt.shape[0]``; entry i is the number of
        (column, reference value) pairs for which row i equals the reference.
        All zeros when no row carries the label ``index_outlier``.
    """
    # Row(s) whose index label equals index_outlier.
    reference = dt.loc[dt.index.values == index_outlier]
    # One 0/1 int64 vector per (column, reference value) pair.
    matches = (
        (dt[name].values == ref_value).astype("int64")
        for name in reference.columns.values
        for ref_value in reference[name].values
    )
    # Starting the fold from a float zero vector keeps the result dtype float64.
    return sum(matches, np.zeros(dt.shape[0]))

#### 4.1.1 OutlierMarker_Cat

In [6]:
# Object-dtype columns of dt_all_cleaned once ID, y and IsTrainTest are removed.
dt_cleaned_no_meta = dt_all_cleaned.drop(["ID", "y", "IsTrainTest"], axis = 1)
cols_cat = dt_cleaned_no_meta.select_dtypes(include = ['object']).columns.values
cols_cat

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype=object)

In [7]:
# Per-row count of cols_cat columns whose value equals that of the reference row
# (index label 883, the default of getOutlierMarker).
int_outlierMarker_cat = getOutlierMarker(dt_all_cleaned[cols_cat])

#### 4.1.2 OutlierMarker_Int

In [8]:
# int64 columns of dt_all_cleaned once ID, y, IsTrainTest and the IsDupRow_* columns are removed.
cols_excluded_int = ["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"]
cols_int = (
    dt_all_cleaned
    .drop(cols_excluded_int, axis = 1)
    .select_dtypes(include = ['int64'])
    .columns.values
)
cols_int[:10]

array(['X10', 'single_train_dup_train_X11', 'X12', 'X13', 'X14',
       'dup_test_X15', 'X16', 'dup_train_X17', 'X18', 'X19'], dtype=object)

In [9]:
# Per-row count of cols_int columns whose value equals that of the reference row
# (index label 883, the default of getOutlierMarker).
int_outlierMarker_int = getOutlierMarker(dt_all_cleaned[cols_int])

#### 4.1.3 OutlierMarker_All

In [10]:
# Every column of dt_all_cleaned except ID, y, IsTrainTest and the IsDupRow_* columns.
cols_excluded_all = ["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"]
cols_all = dt_all_cleaned.drop(columns = cols_excluded_all).columns.values
cols_all[:10]

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       'single_train_dup_train_X11'], dtype=object)

In [11]:
# Per-row count of cols_all columns whose value equals that of the reference row
# (index label 883, the default of getOutlierMarker).
int_outlierMarker_all = getOutlierMarker(dt_all_cleaned[cols_all])

#### 4.1.4 OutlierMarker_X0

In [12]:
cols_X0 = "X0"
# Double brackets select X0 as a one-column DataFrame, the shape getOutlierMarker iterates over.
int_outlierMarker_x0 = getOutlierMarker(dt_all_cleaned[[cols_X0]])

In [13]:
# Attach the four outlier-marker arrays to dt_all_eng, one new column each.
outlier_marker_features = {
    "FeatEng_OutlierMarker_Cat": int_outlierMarker_cat,
    "FeatEng_OutlierMarker_Int": int_outlierMarker_int,
    "FeatEng_OutlierMarker_All": int_outlierMarker_all,
    "FeatEng_OutlierMarker_X0": int_outlierMarker_x0,
}
for name_marker_col, marker_values in outlier_marker_features.items():
    dt_all_eng.loc[:, name_marker_col] = marker_values

In [14]:
# Preview the first rows; the FeatEng_OutlierMarker_* columns just added appear at the right edge.
dt_all_eng.head()

Unnamed: 0,ID,y,X10,single_train_dup_train_X11,X12,X13,X14,dup_test_X15,X16,dup_train_X17,...,Encode_BinaryX8_0,Encode_BinaryX8_1,Encode_BinaryX8_2,Encode_BinaryX8_3,Encode_BinaryX8_4,Encode_Ordinal_X0,FeatEng_OutlierMarker_Cat,FeatEng_OutlierMarker_Int,FeatEng_OutlierMarker_All,FeatEng_OutlierMarker_X0
0,0,130.81,0,0,0,1,0,0,0,0,...,0,0,0,0,0,47.583043,1.0,271.0,272.0,0.0
1,6,88.53,0,0,0,0,0,0,0,0,...,0,0,0,0,0,47.583043,2.0,292.0,294.0,0.0
2,7,76.26,0,0,0,0,0,0,0,1,...,0,1,1,1,1,40.638304,1.0,266.0,267.0,0.0
3,9,80.62,0,0,0,0,0,0,0,0,...,1,0,1,0,1,40.638304,3.0,271.0,274.0,0.0
4,13,78.02,0,0,0,0,0,0,0,0,...,0,0,0,0,1,40.638304,2.0,275.0,277.0,0.0


### 4.2 Sum of binary cols

#### 4.2.1 Sum of all binary cols

In [15]:
# Row-wise sum over every column listed in cols_raw (presumably the 0/1 raw columns - verify).
sum_binary_all = dt_all_eng[cols_raw].sum(axis = 1)

#### 4.2.2 Sum of correlation-important binary cols

In [16]:
def corBin_Contin(dt, cols):
    """Rank columns by absolute point-biserial correlation with ``dt["y"]``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame containing a ``y`` column and every column named in ``cols``.
    cols : iterable of column labels
        Columns to correlate against ``y``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Col`` (label), ``Cor`` (absolute correlation) and ``P``
        (p-value), sorted by ``Cor`` in descending order. An empty ``cols``
        yields an empty frame with these three columns. Rows keep a unique
        positional index from before sorting (previously every row had index 0).
    """
    y = dt["y"].values  # loop-invariant, fetched once
    records = []
    for col in cols:
        # Tuple unpacking works for every scipy result type, whatever its attribute names.
        cor, pvalue = stats.pointbiserialr(dt[col].values, y)
        records.append({"Col": col, "Cor": np.abs(cor), "P": pvalue})
    # Build the frame once instead of concatenating inside the loop (which was quadratic).
    dt_binary_pointbiserialr = pd.DataFrame(records, columns = ["Col", "Cor", "P"])
    return dt_binary_pointbiserialr.sort_values("Cor", ascending = False)

In [19]:
# Rank the cols_raw columns by absolute point-biserial correlation with y.
# NOTE(review): the correlation is computed over every row of dt_all_eng; if that frame also
# holds rows without a real y value, the ranking is affected - confirm which rows should be used.
dt_corBin_Cotin = corBin_Contin(dt_all_eng, cols_raw)

In [20]:
# dt_corBin_Cotin is sorted by descending |correlation|, so each slice is a top-k list.
cols_binary_ranked = dt_corBin_Cotin["Col"].values
cols_binary_important_3 = cols_binary_ranked[:3]
cols_binary_important_5 = cols_binary_ranked[:5]
cols_binary_important_10 = cols_binary_ranked[:10]
cols_binary_important_20 = cols_binary_ranked[:20]
cols_binary_important_50 = cols_binary_ranked[:50]

In [21]:
# Row-wise sums over each of the cols_binary_important_* column lists.
(sum_binary_important_3,
 sum_binary_important_5,
 sum_binary_important_10,
 sum_binary_important_20,
 sum_binary_important_50) = (
    dt_all_eng[cols_top].sum(axis = 1)
    for cols_top in (cols_binary_important_3,
                     cols_binary_important_5,
                     cols_binary_important_10,
                     cols_binary_important_20,
                     cols_binary_important_50)
)

In [22]:
# SumBin_All first, then SumBin_Imp_X - same column order as assigning them one by one.
sum_binary_features = {
    "FeatEng_SumBin_All": sum_binary_all,
    "FeatEng_SumBin_Imp_3": sum_binary_important_3,
    "FeatEng_SumBin_Imp_5": sum_binary_important_5,
    "FeatEng_SumBin_Imp_10": sum_binary_important_10,
    "FeatEng_SumBin_Imp_20": sum_binary_important_20,
    "FeatEng_SumBin_Imp_50": sum_binary_important_50,
}
for name_sum_col, sum_values in sum_binary_features.items():
    dt_all_eng.loc[:, name_sum_col] = sum_values

### 4.3 Dimension reduction

In [25]:
# range
# Scale every feature column to [0, 1]. ID, y and IsTrainTest are left out of the scaled frame.
# errors = "ignore" lets the cell be re-run after those columns are already gone; previously
# a second run raised KeyError.
cols_not_scaled = ["ID", "y", "IsTrainTest"]
dt_to_scale = dt_all_eng.drop(cols_not_scaled, axis = 1, errors = "ignore")
pp_range = preprocessing.MinMaxScaler()
mx_range = pp_range.fit_transform(dt_to_scale)
# Keep the original row index so later index-based joins stay aligned.
dt_all_eng = pd.DataFrame(mx_range, columns = dt_to_scale.columns.values, index = dt_to_scale.index)
dt_all_eng.shape

(8418, 606)

In [26]:
def featEng_dimRed(method, dt, n_component, cols, name_feature):
    """Append dimensionality-reduction components computed from ``dt[cols]``.

    Parameters
    ----------
    method : str
        One of "PCA", "ICA", "SVD", "FA" or "TSNE".
    dt : pandas.DataFrame
        Input frame; it is not modified.
    n_component : int
        Number of components to compute.
    cols : list-like of column labels
        Columns of ``dt`` fed to the reducer.
    name_feature : str
        Tag used in the names of the new columns.

    Returns
    -------
    pandas.DataFrame
        ``dt`` joined with ``n_component`` new columns named
        ``DR_<method>_<name_feature>_<i>`` for i = 1..n_component.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on ``dr``).
    """
    if method == "PCA":
        dr = decomposition.PCA(n_components = n_component, random_state = 888)
    elif method == "ICA":
        dr = decomposition.FastICA(n_components = n_component, random_state = 888)
    elif method == "SVD":
        dr = decomposition.TruncatedSVD(n_components = n_component, random_state = 888)
    elif method == "FA":
        dr = decomposition.FactorAnalysis(n_components = n_component, random_state = 888)
    elif method == "TSNE":
        dr = manifold.TSNE(n_components = n_component, random_state = 888)
    else:
        raise ValueError("Unknown dimension reduction method: " + repr(method)
                         + " (expected one of 'PCA', 'ICA', 'SVD', 'FA', 'TSNE')")
    mx = dr.fit_transform(dt[cols])
    cols_new = ["DR_" + method + "_" + name_feature + "_" + str(i) for i in range(1, n_component + 1)]
    # Reuse dt's index: join aligns on index labels, so a fresh RangeIndex would
    # misalign the components whenever dt is not indexed 0..n-1.
    dt = dt.join(pd.DataFrame(mx, columns = cols_new, index = dt.index))

    return dt

#### 4.3.1 PCA

##### 4.3.1.1 Raw binary cols

In [29]:
# Append 15 PCA components of the cols_raw columns as DR_PCA_Raw_Bin_1..15.
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 15, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 621)

##### 4.3.1.2 Encoded cat cols

In [30]:
# Select every column whose name contains "Encode_" and append 15 PCA components
# of them as DR_PCA_Encoded_Cat_1..15.
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 15, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 636)

##### 4.3.1.3 Feature engineered cols

In [31]:
# The pattern is anchored with "^": filter(regex = ...) uses a search, so an unanchored
# "FeatEng_" would also pick up DR_<method>_FeatEng_<i> columns once they exist.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 639)

##### 4.3.1.4 All cols

In [32]:
# The pattern keeps only labels that do not contain "DR" anywhere, i.e. everything except
# the DR_* component columns. Note that this rebinds cols_all, which section 4.1.3 defined
# from dt_all_cleaned.
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 20, cols_all, "All")
dt_all_eng.shape

(8418, 659)

#### 4.3.2 ICA

##### 4.3.2.1 Raw binary cols

In [33]:
# Append 15 FastICA components of the cols_raw columns as DR_ICA_Raw_Bin_1..15.
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 15, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 674)

##### 4.3.2.2 Encoded cat cols

In [34]:
# Re-select the "Encode_" columns; the DR_*_Encoded_Cat_* names added so far do not contain
# "Encode_", so earlier components are not picked up. Append 15 FastICA components.
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 15, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 689)

##### 4.3.2.3 Feature engineered cols

In [35]:
# Anchor the pattern: filter(regex = ...) uses a search, so the unanchored "FeatEng_" also
# matched the DR_PCA_FeatEng_* columns and fed earlier components back in as ICA inputs.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 692)

##### 4.3.2.4 All cols

In [36]:
# Keep only labels that do not contain "DR", so none of the DR_* component columns
# created so far are used as inputs.
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 20, cols_all, "All")
dt_all_eng.shape

(8418, 712)

#### 4.3.3 SVD

##### 4.3.3.1 Raw binary cols

In [37]:
# Append 15 TruncatedSVD components of the cols_raw columns as DR_SVD_Raw_Bin_1..15.
dt_all_eng = featEng_dimRed("SVD", dt_all_eng, 15, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 727)

##### 4.3.3.2 Encoded cat cols

In [38]:
# Re-select the "Encode_" columns; the DR_*_Encoded_Cat_* names added so far do not contain
# "Encode_", so earlier components are not picked up. Append 15 TruncatedSVD components.
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("SVD", dt_all_eng, 15, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 742)

##### 4.3.3.3 Feature engineered cols

In [39]:
# Anchor the pattern: filter(regex = ...) uses a search, so the unanchored "FeatEng_" also
# matched the DR_PCA_FeatEng_* / DR_ICA_FeatEng_* columns and fed them back in as SVD inputs.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("SVD", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 745)

##### 4.3.3.4 All cols

In [40]:
# Keep only labels that do not contain "DR", so none of the DR_* component columns
# created so far are used as inputs.
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("SVD", dt_all_eng, 20, cols_all, "All")
dt_all_eng.shape

(8418, 765)

#### 4.3.4 FA

##### 4.3.4.1 Raw binary cols

In [42]:
# Append 15 FactorAnalysis components of the cols_raw columns as DR_FA_Raw_Bin_1..15.
dt_all_eng = featEng_dimRed("FA", dt_all_eng, 15, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 780)

##### 4.3.4.2 Encoded cat cols

In [43]:
# Re-select the "Encode_" columns; the DR_*_Encoded_Cat_* names added so far do not contain
# "Encode_", so earlier components are not picked up. Append 15 FactorAnalysis components.
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("FA", dt_all_eng, 15, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 795)

##### 4.3.4.3 Feature engineered cols

In [44]:
# Anchor the pattern: filter(regex = ...) uses a search, so the unanchored "FeatEng_" also
# matched the DR_<method>_FeatEng_* columns of the earlier PCA/ICA/SVD steps and fed them
# back in as FA inputs.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("FA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 798)

##### 4.3.4.4 All cols

In [45]:
# Keep only labels that do not contain "DR", so none of the DR_* component columns
# created so far are used as inputs.
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("FA", dt_all_eng, 20, cols_all, "All")
dt_all_eng.shape

(8418, 818)

#### 4.3.5 t-SNE

##### 4.3.5.1 Raw binary cols

In [46]:
# Append a 2-component t-SNE embedding of the cols_raw columns as DR_TSNE_Raw_Bin_1..2.
dt_all_eng = featEng_dimRed("TSNE", dt_all_eng, 2, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 820)

##### 4.3.5.2 Encoded cat cols

In [47]:
# Re-select the "Encode_" columns; the DR_*_Encoded_Cat_* names added so far do not contain
# "Encode_", so earlier components are not picked up. Append a 2-component t-SNE embedding.
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("TSNE", dt_all_eng, 2, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 822)

##### 4.3.5.3 Feature engineered cols

In [48]:
# Anchor the pattern: filter(regex = ...) uses a search, so the unanchored "FeatEng_" also
# matched the DR_<method>_FeatEng_* columns of the earlier PCA/ICA/SVD/FA steps and fed them
# back in as t-SNE inputs.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("TSNE", dt_all_eng, 2, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 824)

##### 4.3.5.4 All cols

In [49]:
# Keep only labels that do not contain "DR", so none of the DR_* component columns
# created so far are used as inputs.
# NOTE(review): this cell asks for 3 t-SNE components while the other t-SNE cells use 2 -
# confirm that the difference is intended.
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("TSNE", dt_all_eng, 3, cols_all, "All")
dt_all_eng.shape

(8418, 827)

### 4.4 Save dt_all_eng

In [54]:
# Re-attach ID, y and IsTrainTest, which were left out of dt_all_eng when it was rescaled.
# The join aligns on the row index.
# NOTE(review): this assumes dt_all_cleaned and dt_all_eng hold the same rows in the same
# order - confirm. Re-running this cell fails because the columns would then already exist.
dt_all_eng = dt_all_eng.join(dt_all_cleaned[["ID", "y", "IsTrainTest"]])

In [None]:
# Write the engineered dataset to disk; index = False keeps the row index out of the CSV.
dt_all_eng.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_eng.csv", index = False)