In [None]:
import pandas as pd
import pickle
import numpy as np
from scipy import stats
from sklearn import preprocessing, decomposition, manifold
from sklearn import random_projection
import operator

## 4. Feature Engineering

In [None]:
# Load the cleaned table; its shape is displayed as a quick sanity check.
path_dt_all_cleaned = "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_cleaned.csv"
dt_all_cleaned = pd.read_csv(path_dt_all_cleaned)
dt_all_cleaned.shape

In [None]:
# Start dt_all_eng from the encoded table (dt_all_encoded.csv); engineered
# features are appended to it in the cells below.
path_dt_all_encoded = "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv"
dt_all_eng = pd.read_csv(path_dt_all_encoded)
dt_all_eng.shape

In [None]:
# read cols_raw
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/cols_raw.pkl", "rb") as file_cols_raw:
    cols_raw = pickle.load(file_cols_raw)
len(cols_raw)

### 4.1 Outlier marker

In [None]:
def getOutlierMarker(dt, index_outlier = 883):
    """Count, per row, how many columns share their value with the reference row.

    The reference row(s) are those whose index label equals ``index_outlier``.
    For every column, each row of ``dt`` scores one point when its value in
    that column equals the reference value.

    Parameters
    ----------
    dt : pandas.DataFrame
        Columns to compare.
    index_outlier : int, default 883
        Index label of the reference (outlier) row.

    Returns
    -------
    numpy.ndarray
        Float array of length ``dt.shape[0]`` holding the match counts; all
        zeros when no row carries the label ``index_outlier``.
    """
    reference_rows = dt.loc[dt.index.values == index_outlier]
    match_counts = np.zeros(dt.shape[0])
    for name in reference_rows.columns.values:
        for reference_value in reference_rows[name].values:
            is_match = dt[name].values == reference_value
            match_counts = match_counts + is_match.astype("int64")

    return match_counts

#### 4.1.1 OutlierMarker_Cat

In [None]:
# Object-dtype feature columns, excluding the ID, target and split-flag columns.
cols_cat = (
    dt_all_cleaned
    .drop(columns = ["ID", "y", "IsTrainTest"])
    .select_dtypes(include = ["object"])
    .columns.values
)
cols_cat

In [None]:
# Outlier-match count over the categorical columns only.
int_outlierMarker_cat = getOutlierMarker(dt = dt_all_cleaned.loc[:, cols_cat])

#### 4.1.2 OutlierMarker_Int

In [None]:
# int64 feature columns, excluding ID, target, split flag and the duplicate-row flags.
cols_int = (
    dt_all_cleaned
    .drop(columns = ["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"])
    .select_dtypes(include = ["int64"])
    .columns.values
)
cols_int[:10]

In [None]:
# Outlier-match count over the int64 columns only.
int_outlierMarker_int = getOutlierMarker(dt = dt_all_cleaned.loc[:, cols_int])

#### 4.1.3 OutlierMarker_All

In [None]:
# Every feature column: all columns except ID, target, split flag and the duplicate-row flags.
cols_all = (
    dt_all_cleaned
    .drop(columns = ["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"])
    .columns.values
)
cols_all[:10]

In [None]:
# Outlier-match count over all feature columns.
int_outlierMarker_all = getOutlierMarker(dt = dt_all_cleaned.loc[:, cols_all])

#### 4.1.4 OutlierMarker_X0

In [None]:
# Outlier-match marker on X0 alone; a one-column frame is passed because
# getOutlierMarker iterates over DataFrame columns.
cols_X0 = "X0"
int_outlierMarker_x0 = getOutlierMarker(dt_all_cleaned[[cols_X0]])

In [None]:
# Attach the four outlier-marker arrays as FeatEng_OutlierMarker_* columns.
outlier_marker_features = {
    "FeatEng_OutlierMarker_Cat": int_outlierMarker_cat,
    "FeatEng_OutlierMarker_Int": int_outlierMarker_int,
    "FeatEng_OutlierMarker_All": int_outlierMarker_all,
    "FeatEng_OutlierMarker_X0": int_outlierMarker_x0,
}
for feature_name, feature_values in outlier_marker_features.items():
    dt_all_eng[feature_name] = feature_values

In [None]:
# Preview the first rows, including the new outlier-marker columns.
dt_all_eng.head(n = 5)

### 4.2 Sum of binary cols

#### 4.2.1 Sum of all binary cols

In [None]:
# Row-wise sum over every column listed in cols_raw.
sum_binary_all = dt_all_eng.loc[:, cols_raw].sum(axis = "columns")

#### 4.2.2 Sum of correlation-important binary cols

In [None]:
def corBin_Contin(dt, cols, method = "pointbiserialr"):
    """Rank columns by the absolute correlation between each column and ``dt.y``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain a ``y`` column and every column named in ``cols``.
    cols : iterable of str
        Columns to correlate against ``y``.
    method : str, default "pointbiserialr"
        ``"spearmanr"`` selects ``scipy.stats.spearmanr``; any other value
        selects ``scipy.stats.pointbiserialr``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Col`` (column name), ``Cor`` (absolute correlation) and
        ``P`` (p-value), sorted by ``Cor`` descending, with a unique
        0..n-1 index in rank order. Empty ``cols`` yields an empty frame
        with the same three columns.
    """
    cor_func = stats.spearmanr if method == "spearmanr" else stats.pointbiserialr
    y_values = dt.y.values

    # Collect plain records and build the frame once; concatenating inside the
    # loop copies the growing frame on every iteration and leaves every row
    # labelled 0.
    records = []
    for col in cols:
        cor_pb = cor_func(dt[col].values, y_values)
        records.append({"Col": col
                        , "Cor": np.abs(cor_pb.correlation)
                        , "P": cor_pb.pvalue})

    # Explicit column list keeps sort_values("Cor") valid when cols is empty.
    dt_cor = pd.DataFrame(records, columns = ["Col", "Cor", "P"])
    return dt_cor.sort_values("Cor", ascending = False).reset_index(drop = True)

In [None]:
# Default (point-biserial) ranking of cols_raw against y, computed on the training rows only.
dt_corBin_Cotin = corBin_Contin(dt = dt_all_eng[dt_all_eng["IsTrainTest"] == "train"], cols = cols_raw)

In [None]:
# Spearman ranking of cols_raw against y, computed on the training rows only.
dt_corBin_Cotin_spear = corBin_Contin(
    dt = dt_all_eng[dt_all_eng["IsTrainTest"] == "train"],
    cols = cols_raw,
    method = "spearmanr",
)

In [None]:
# Bucket columns by absolute Spearman correlation: [0.2, inf), [0.1, 0.2), [0.05, 0.1).
cor_spear = dt_corBin_Cotin_spear["Cor"]
in_02_10 = cor_spear >= .2
in_01_02 = (cor_spear >= .1) & (cor_spear < .2)
in_005_01 = (cor_spear >= .05) & (cor_spear < .1)
cols_binary_cor_02_10 = dt_corBin_Cotin_spear.loc[in_02_10, "Col"].values
cols_binary_cor_01_02 = dt_corBin_Cotin_spear.loc[in_01_02, "Col"].values
cols_binary_cor_005_01 = dt_corBin_Cotin_spear.loc[in_005_01, "Col"].values

In [None]:
# Top-N columns of the default (point-biserial) ranking; dt_corBin_Cotin is
# already sorted by descending absolute correlation.
cols_ranked = dt_corBin_Cotin["Col"].values
cols_binary_important_3 = cols_ranked[:3]
cols_binary_important_5 = cols_ranked[:5]
cols_binary_important_10 = cols_ranked[:10]
cols_binary_important_20 = cols_ranked[:20]
cols_binary_important_50 = cols_ranked[:50]

In [None]:
# Row-wise sums over each top-N selection and each correlation bucket.
sum_binary_important_3 = dt_all_eng.loc[:, cols_binary_important_3].sum(axis = "columns")
sum_binary_important_5 = dt_all_eng.loc[:, cols_binary_important_5].sum(axis = "columns")
sum_binary_important_10 = dt_all_eng.loc[:, cols_binary_important_10].sum(axis = "columns")
sum_binary_important_20 = dt_all_eng.loc[:, cols_binary_important_20].sum(axis = "columns")
sum_binary_important_50 = dt_all_eng.loc[:, cols_binary_important_50].sum(axis = "columns")
sum_binary_important_cor_02_10 = dt_all_eng.loc[:, cols_binary_cor_02_10].sum(axis = "columns")
sum_binary_important_cor_01_02 = dt_all_eng.loc[:, cols_binary_cor_01_02].sum(axis = "columns")
sum_binary_important_cor_005_01 = dt_all_eng.loc[:, cols_binary_cor_005_01].sum(axis = "columns")

In [None]:
# Attach the sums as FeatEng_SumBin_* columns: the overall sum first, then the
# top-N sums, then the correlation-bucket sums.
sum_binary_features = {
    "FeatEng_SumBin_All": sum_binary_all,
    "FeatEng_SumBin_Imp_3": sum_binary_important_3,
    "FeatEng_SumBin_Imp_5": sum_binary_important_5,
    "FeatEng_SumBin_Imp_10": sum_binary_important_10,
    "FeatEng_SumBin_Imp_20": sum_binary_important_20,
    "FeatEng_SumBin_Imp_50": sum_binary_important_50,
    "FeatEng_SumBin_Cor_02_10": sum_binary_important_cor_02_10,
    "FeatEng_SumBin_Cor_01_02": sum_binary_important_cor_01_02,
    "FeatEng_SumBin_Cor_005_01": sum_binary_important_cor_005_01,
}
for feature_name, feature_values in sum_binary_features.items():
    dt_all_eng[feature_name] = feature_values

### 4.3 Dimension reduction

In [None]:
# range
pp_range = preprocessing.MinMaxScaler()
mx_range = pp_range.fit_transform(dt_all_eng.drop(["ID", "y", "IsTrainTest"], axis = 1))
dt_all_eng = pd.DataFrame(mx_range, columns = dt_all_eng.drop(["ID", "y", "IsTrainTest"], axis = 1).columns.values)
dt_all_eng.shape

In [None]:
def featEng_dimRed(method, dt, n_component, cols, name_feature):
    """Append dimension-reduction components computed from ``dt[cols]`` to ``dt``.

    Parameters
    ----------
    method : str
        One of "PCA", "ICA", "SVD", "FA", "TSNE", "GRP", "SRP".
    dt : pandas.DataFrame
        Frame holding the input columns; it is not modified.
    n_component : int
        Number of components to compute.
    cols : list-like of str
        Columns of ``dt`` fed to the estimator.
    name_feature : str
        Tag embedded in the generated column names.

    Returns
    -------
    pandas.DataFrame
        ``dt`` joined with the new columns
        ``DR_<method>_<name_feature>_<i>`` for ``i = 1..n_component``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Lazy factories: only the requested estimator is constructed.
    factories = {
        "PCA": lambda: decomposition.PCA(n_components = n_component, random_state = 420),
        "ICA": lambda: decomposition.FastICA(n_components = n_component, random_state = 420),
        "SVD": lambda: decomposition.TruncatedSVD(n_components = n_component, random_state = 420),
        "FA": lambda: decomposition.FactorAnalysis(n_components = n_component, random_state = 420),
        "TSNE": lambda: manifold.TSNE(n_components = n_component, random_state = 420),
        "GRP": lambda: random_projection.GaussianRandomProjection(n_components = n_component, eps = 0.1, random_state = 420),
        "SRP": lambda: random_projection.SparseRandomProjection(n_components = n_component, dense_output = True, random_state = 420),
    }
    if method not in factories:
        # Previously an unknown method fell through to an unbound `dr`.
        raise ValueError("Unknown method '" + str(method) + "'; expected one of " + ", ".join(factories))
    dr = factories[method]()

    mx = dr.fit_transform(dt[cols])
    names = ["DR_" + method + "_" + name_feature + "_" + str(i) for i in range(1, n_component + 1)]
    # Reuse dt's index so the join pairs each component row with its source row
    # even when dt does not carry a default 0..n-1 index.
    dt = dt.join(pd.DataFrame(mx, columns = names, index = dt.index))

    return dt

#### 4.3.1 PCA

##### 4.3.1.1 Raw binary cols

In [None]:
# PCA: append 10 components computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "PCA", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.1.2 Encoded cat cols

In [None]:
# PCA: append 10 components computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "PCA", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.1.3 Feature engineered cols

In [None]:
# PCA: append 3 components computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes generated "DR_<method>_FeatEng_<i>" columns,
# which an unanchored "FeatEng_" pattern also matches; this keeps the selection
# independent of which reduction cells have already run.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.1.4 All cols

In [None]:
# PCA: append 15 components computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "PCA", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

#### 4.3.2 ICA

##### 4.3.2.1 Raw binary cols

In [None]:
# ICA: append 10 components computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "ICA", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.2.2 Encoded cat cols

In [None]:
# ICA: append 10 components computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "ICA", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.2.3 Feature engineered cols

In [None]:
# ICA: append 3 components computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes the "DR_<method>_FeatEng_<i>" columns produced
# by earlier reduction cells; the unanchored "FeatEng_" pattern matched them too
# and fed previous components back in as inputs.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.2.4 All cols

In [None]:
# ICA: append 15 components computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "ICA", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

#### 4.3.3 SVD

##### 4.3.3.1 Raw binary cols

In [None]:
# Truncated SVD: append 10 components computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "SVD", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.3.2 Encoded cat cols

In [None]:
# Truncated SVD: append 10 components computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "SVD", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.3.3 Feature engineered cols

In [None]:
# Truncated SVD: append 3 components computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes the "DR_<method>_FeatEng_<i>" columns produced
# by earlier reduction cells; the unanchored "FeatEng_" pattern matched them too
# and fed previous components back in as inputs.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("SVD", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.3.4 All cols

In [None]:
# Truncated SVD: append 15 components computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "SVD", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

#### 4.3.4 FA

##### 4.3.4.1 Raw binary cols

In [None]:
# Factor analysis: append 10 components computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "FA", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.4.2 Encoded cat cols

In [None]:
# Factor analysis: append 10 components computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "FA", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.4.3 Feature engineered cols

In [None]:
# Factor analysis: append 3 components computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes the "DR_<method>_FeatEng_<i>" columns produced
# by earlier reduction cells; the unanchored "FeatEng_" pattern matched them too
# and fed previous components back in as inputs.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("FA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.4.4 All cols

In [None]:
# Factor analysis: append 15 components computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "FA", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

#### 4.3.5 t-SNE

##### 4.3.5.1 Raw binary cols

In [None]:
# t-SNE: append a 2-component embedding computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "TSNE", dt = dt_all_eng, n_component = 2, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.5.2 Encoded cat cols

In [None]:
# t-SNE: append a 2-component embedding computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "TSNE", dt = dt_all_eng, n_component = 2, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.5.3 Feature engineered cols

In [None]:
# t-SNE: append a 2-component embedding computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes the "DR_<method>_FeatEng_<i>" columns produced
# by earlier reduction cells; the unanchored "FeatEng_" pattern matched them too
# and fed previous components back in as inputs.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("TSNE", dt_all_eng, 2, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.5.4 All cols

In [None]:
# t-SNE: append a 2-component embedding computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "TSNE", dt = dt_all_eng, n_component = 2, cols = cols_all, name_feature = "All")
dt_all_eng.shape

#### 4.3.6 GRP

##### 4.3.6.1 Raw binary cols

In [None]:
# Gaussian random projection: append 10 components computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "GRP", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.6.2 Encoded cat cols

In [None]:
# Gaussian random projection: append 10 components computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "GRP", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.6.3 Feature engineered cols

In [None]:
# Gaussian random projection: append 3 components computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes the "DR_<method>_FeatEng_<i>" columns produced
# by earlier reduction cells; the unanchored "FeatEng_" pattern matched them too
# and fed previous components back in as inputs.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("GRP", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.6.4 All cols

In [None]:
# Gaussian random projection: append 15 components computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "GRP", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

#### 4.3.7 SRP

##### 4.3.7.1 Raw binary cols

In [None]:
# Sparse random projection: append 10 components computed from the cols_raw columns.
dt_all_eng = featEng_dimRed(method = "SRP", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

##### 4.3.7.2 Encoded cat cols

In [None]:
# Sparse random projection: append 10 components computed from the columns whose name contains "Encode_".
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "SRP", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

##### 4.3.7.3 Feature engineered cols

In [None]:
# Sparse random projection: append 3 components computed from the engineered "FeatEng_" columns.
# The negative lookahead excludes the "DR_<method>_FeatEng_<i>" columns produced
# by earlier reduction cells; the unanchored "FeatEng_" pattern matched them too
# and fed previous components back in as inputs.
cols_featEng = dt_all_eng.filter(regex = "^(?!DR_).*FeatEng_").columns.values
dt_all_eng = featEng_dimRed("SRP", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

##### 4.3.7.4 All cols

In [None]:
# Sparse random projection: append 15 components computed from every column whose name does not contain "DR".
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "SRP", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

### 4.5 Feature Interaction

In [None]:
# Two-way XOR interactions between the cols_raw columns.
# A pair's XOR is kept when its absolute Spearman correlation with y (training
# rows only) exceeds the better of the two single-column correlations by a
# factor that shrinks as the XOR correlation grows (1.5 / 1.3 / 1.1).
dt_interest = dt_all_eng[cols_raw]

# Loop-invariant inputs, computed once instead of once per pair.
isTrain = (dt_all_cleaned["IsTrainTest"] == "train").values
y_train = dt_all_cleaned.y.values[isTrain]

# Order-preserving de-duplication: each unordered pair is visited exactly once,
# in the same order as before, without membership tests on a growing list.
cols_fi = list(dict.fromkeys(cols_raw))
vals_fi = {c: dt_interest[c].values.astype("int64") for c in cols_fi}
# Single-column correlations are reused by every pair the column takes part in.
cor_single = {c: abs(stats.spearmanr(vals_fi[c][isTrain], y_train).correlation) for c in cols_fi}

fi_2way = {}
for pos, c1 in enumerate(cols_fi):
    for c2 in cols_fi[pos + 1:]:
        # XOR
        XOR = vals_fi[c1] ^ vals_fi[c2]
        cor_XOR = abs(stats.spearmanr(XOR[isTrain], y_train).correlation)
        # constant
        if cor_XOR > .3:
            const = 1.1
        elif cor_XOR > .2:
            const = 1.3
        else:
            const = 1.5
        # A NaN correlation (e.g. constant input) fails this comparison, so the
        # pair is skipped.
        if cor_XOR > max(cor_single[c1], cor_single[c2]) * const:
            fi_2way["FeatEng_FI_2way_XOR_" + c1 + "_" + c2] = XOR

# Build the frame once; inserting columns one at a time fragments it.
dt_fi_2way = pd.DataFrame(fi_2way, index = dt_interest.index)

In [None]:
# Preview the retained XOR interaction features.
dt_fi_2way.head(n = 5)

In [None]:
# Append the interaction features to the engineered table (index-aligned left join).
dt_all_eng = dt_all_eng.join(dt_fi_2way, how = "left")

### 4.6 Save dt_all_eng

In [None]:
# Re-attach ID, y and IsTrainTest from the cleaned table (index-aligned left join).
dt_all_eng = dt_all_eng.join(dt_all_cleaned.loc[:, ["ID", "y", "IsTrainTest"]], how = "left")

In [None]:
# Write the engineered table to disk without the index column.
path_dt_all_eng = "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_eng.csv"
dt_all_eng.to_csv(path_dt_all_eng, index = False)