In [1]:
import pandas as pd
import pickle
import numpy as np
from scipy import stats
from sklearn import preprocessing, decomposition, manifold
from sklearn import random_projection
import operator

## 4. Feature Engineering

In [2]:
# Load the cleaned data set (dt_all_cleaned) from disk.
dt_all_cleaned = pd.read_csv(
    "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_cleaned.csv"
)
dt_all_cleaned.shape

(8418, 342)

In [3]:
# dt_all_eng is initialised from the encoded table (dt_all_encoded.csv);
# the engineered features are appended to it further down.
dt_all_eng = pd.read_csv(
    "../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv"
)
dt_all_eng.shape

(8418, 608)

In [4]:
# read cols_raw
# NOTE: unpickling can execute arbitrary code -- only load files this project wrote itself.
# The context manager closes the file handle, which the bare open() call never did.
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/cols_raw.pkl", "rb") as f_cols_raw:
    cols_raw = pickle.load(f_cols_raw)
len(cols_raw)

328

### 4.1 Outlier marker

In [5]:
def getOutlierMarker(dt, index_outlier = 883):
    """Count, for every row of ``dt``, how many cells equal the reference row.

    The reference is each row whose index label equals ``index_outlier``.
    For every column, a row scores one point when its value in that column
    equals the reference value.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame whose rows are compared against the reference row.
    index_outlier : default 883
        Index label of the reference row.

    Returns
    -------
    numpy.ndarray
        Float array of length ``dt.shape[0]`` holding the match counts
        (all zeros when no row carries the label ``index_outlier``).
    """
    # rows acting as the reference (normally exactly one)
    ref_rows = dt.loc[dt.index.values == index_outlier]
    n_matches = np.zeros(dt.shape[0])
    for name in ref_rows.columns.values:
        column = dt[name].values
        for ref_value in ref_rows[name].values:
            # element-wise equality, accumulated as 0/1
            n_matches = n_matches + (column == ref_value).astype("int64")

    return n_matches

#### 4.1.1 OutlierMarker_Cat

In [6]:
# object-dtype columns that remain once ID, y and IsTrainTest are dropped
cols_cat = (
    dt_all_cleaned
    .drop(["ID", "y", "IsTrainTest"], axis = 1)
    .select_dtypes(include = ['object'])
    .columns.values
)
cols_cat

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype=object)

In [7]:
# per-row number of categorical columns that agree with the reference row (default index 883)
int_outlierMarker_cat = getOutlierMarker(dt = dt_all_cleaned[cols_cat])

#### 4.1.2 OutlierMarker_Int

In [8]:
# int64 columns that remain once ID, y, IsTrainTest and the IsDupRow_* flags are dropped
cols_int = (
    dt_all_cleaned
    .drop(["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"], axis = 1)
    .select_dtypes(include = ['int64'])
    .columns.values
)
cols_int[:10]

array(['X10', 'single_train_dup_train_X11', 'X12', 'X13', 'X14',
       'dup_test_X15', 'X16', 'dup_train_X17', 'X18', 'X19'], dtype=object)

In [9]:
# per-row number of integer columns that agree with the reference row (default index 883)
int_outlierMarker_int = getOutlierMarker(dt = dt_all_cleaned[cols_int])

#### 4.1.3 OutlierMarker_All

In [10]:
# every column except ID, y, IsTrainTest and the IsDupRow_* flags
cols_all = (
    dt_all_cleaned
    .drop(["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"], axis = 1)
    .columns.values
)
cols_all[:10]

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       'single_train_dup_train_X11'], dtype=object)

In [11]:
# per-row number of columns (categorical and integer) that agree with the reference row
int_outlierMarker_all = getOutlierMarker(dt = dt_all_cleaned[cols_all])

#### 4.1.4 OutlierMarker_X0

In [12]:
# same marker restricted to the single column X0 (1.0 where X0 equals the reference row's X0)
cols_X0 = "X0"
int_outlierMarker_x0 = getOutlierMarker(dt = dt_all_cleaned[cols_X0].to_frame())

In [13]:
# attach the four outlier markers as new FeatEng_* columns
dt_all_eng["FeatEng_OutlierMarker_Cat"] = int_outlierMarker_cat
dt_all_eng["FeatEng_OutlierMarker_Int"] = int_outlierMarker_int
dt_all_eng["FeatEng_OutlierMarker_All"] = int_outlierMarker_all
dt_all_eng["FeatEng_OutlierMarker_X0"] = int_outlierMarker_x0

In [14]:
# preview the first five rows, including the new FeatEng_OutlierMarker_* columns
dt_all_eng.head(n = 5)

Unnamed: 0,ID,y,X10,single_train_dup_train_X11,X12,X13,X14,dup_test_X15,X16,dup_train_X17,...,Encode_BinaryX2,Encode_BinaryX3,Encode_BinaryX4,Encode_BinaryX5,Encode_BinaryX6,Encode_BinaryX8,FeatEng_OutlierMarker_Cat,FeatEng_OutlierMarker_Int,FeatEng_OutlierMarker_All,FeatEng_OutlierMarker_X0
0,0,130.81,0,0,0,1,0,0,0,0,...,6,1,1,11,10,18,1.0,271.0,272.0,0.0
1,6,88.53,0,0,0,0,0,0,0,0,...,44,4,1,8,2,18,2.0,292.0,294.0,0.0
2,7,76.26,0,0,0,0,0,0,0,1,...,0,6,1,3,10,3,1.0,266.0,267.0,0.0
3,9,80.62,0,0,0,0,0,0,0,0,...,0,5,1,3,2,1,3.0,271.0,274.0,0.0
4,13,78.02,0,0,0,0,0,0,0,0,...,0,5,1,31,0,21,2.0,275.0,277.0,0.0


### 4.2 Sum of binary cols

#### 4.2.1 Sum of all binary cols

In [15]:
# row-wise total over all columns listed in cols_raw
sum_binary_all = dt_all_eng[cols_raw].sum(axis = "columns")

#### 4.2.2 Sum of correlation-important binary cols

In [16]:
def corBin_Contin(dt, cols, method = "pointbiserialr"):
    """Rank columns by the absolute value of their correlation with ``dt.y``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain a column ``y`` and every column named in ``cols``.
    cols : iterable of str
        Names of the columns to correlate with ``y``; may be empty.
    method : str, default "pointbiserialr"
        ``"spearmanr"`` selects ``scipy.stats.spearmanr``; any other value
        falls back to ``scipy.stats.pointbiserialr``.

    Returns
    -------
    pandas.DataFrame
        One row per column with ``Col`` (name), ``Cor`` (absolute correlation)
        and ``P`` (p-value), sorted by ``Cor`` descending (NaN last) and
        carrying a fresh 0..n-1 index.
    """
    cor_func = stats.spearmanr if method == "spearmanr" else stats.pointbiserialr
    y = dt.y.values
    # Collect plain records and build the frame once: concatenating inside the
    # loop copies the growing frame on every iteration.
    records = []
    for col in cols:
        # both scipy results unpack as (correlation, p-value)
        cor, pvalue = cor_func(dt[col].values, y)
        records.append({"Col": col, "Cor": np.abs(cor), "P": pvalue})
    # explicit columns keep the schema (and the sort key) present for empty input
    dt_cor = pd.DataFrame(records, columns = ["Col", "Cor", "P"])
    return dt_cor.sort_values("Cor", ascending = False).reset_index(drop = True)

In [17]:
# point-biserial ranking (the default method) of the cols_raw columns, train rows only
dt_corBin_Cotin = corBin_Contin(
    dt = dt_all_eng.loc[dt_all_eng["IsTrainTest"] == "train"],
    cols = cols_raw,
)

  r = r_num / r_den
  # Remove the CWD from sys.path while we load stuff.


In [18]:
# Spearman ranking of the cols_raw columns, train rows only
dt_corBin_Cotin_spear = corBin_Contin(
    dt = dt_all_eng.loc[dt_all_eng["IsTrainTest"] == "train"],
    cols = cols_raw,
    method = "spearmanr",
)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  # Remove the CWD from sys.path while we load stuff.


In [19]:
# bucket the columns by absolute Spearman correlation: >= .2, [.1, .2) and [.05, .1)
cols_binary_cor_02_10 = dt_corBin_Cotin_spear.loc[
    dt_corBin_Cotin_spear["Cor"] >= .2, "Col"
].values
cols_binary_cor_01_02 = dt_corBin_Cotin_spear.loc[
    (dt_corBin_Cotin_spear["Cor"] >= .1) & (dt_corBin_Cotin_spear["Cor"] < .2), "Col"
].values
cols_binary_cor_005_01 = dt_corBin_Cotin_spear.loc[
    (dt_corBin_Cotin_spear["Cor"] >= .05) & (dt_corBin_Cotin_spear["Cor"] < .1), "Col"
].values

In [20]:
# top-k column names of the point-biserial ranking (dt_corBin_Cotin is sorted by Cor, descending)
(cols_binary_important_3,
 cols_binary_important_5,
 cols_binary_important_10,
 cols_binary_important_20,
 cols_binary_important_50) = (dt_corBin_Cotin["Col"].values[:k] for k in (3, 5, 10, 20, 50))

In [21]:
# row-wise sums over the top-k groups ...
sum_binary_important_3 = dt_all_eng[cols_binary_important_3].sum(axis = "columns")
sum_binary_important_5 = dt_all_eng[cols_binary_important_5].sum(axis = "columns")
sum_binary_important_10 = dt_all_eng[cols_binary_important_10].sum(axis = "columns")
sum_binary_important_20 = dt_all_eng[cols_binary_important_20].sum(axis = "columns")
sum_binary_important_50 = dt_all_eng[cols_binary_important_50].sum(axis = "columns")
# ... and over the correlation buckets
sum_binary_important_cor_02_10 = dt_all_eng[cols_binary_cor_02_10].sum(axis = "columns")
sum_binary_important_cor_01_02 = dt_all_eng[cols_binary_cor_01_02].sum(axis = "columns")
sum_binary_important_cor_005_01 = dt_all_eng[cols_binary_cor_005_01].sum(axis = "columns")

In [22]:
# total over all cols_raw columns
dt_all_eng["FeatEng_SumBin_All"] = sum_binary_all
# totals over the top-k ranked columns
dt_all_eng["FeatEng_SumBin_Imp_3"] = sum_binary_important_3
dt_all_eng["FeatEng_SumBin_Imp_5"] = sum_binary_important_5
dt_all_eng["FeatEng_SumBin_Imp_10"] = sum_binary_important_10
dt_all_eng["FeatEng_SumBin_Imp_20"] = sum_binary_important_20
dt_all_eng["FeatEng_SumBin_Imp_50"] = sum_binary_important_50
# totals over the correlation buckets
dt_all_eng["FeatEng_SumBin_Cor_02_10"] = sum_binary_important_cor_02_10
dt_all_eng["FeatEng_SumBin_Cor_01_02"] = sum_binary_important_cor_01_02
dt_all_eng["FeatEng_SumBin_Cor_005_01"] = sum_binary_important_cor_005_01

### 4.3 Dimension reduction

In [23]:
# range
pp_range = preprocessing.MinMaxScaler()
mx_range = pp_range.fit_transform(dt_all_eng.drop(["ID", "y", "IsTrainTest"], axis = 1))
dt_all_eng = pd.DataFrame(mx_range, columns = dt_all_eng.drop(["ID", "y", "IsTrainTest"], axis = 1).columns.values)
dt_all_eng.shape

(8418, 618)

In [24]:
def featEng_dimRed(method, dt, n_component, cols, name_feature, random_state = 420):
    """Append dimension-reduction components computed from ``dt[cols]`` to ``dt``.

    Parameters
    ----------
    method : str
        One of "PCA", "ICA", "SVD", "FA", "TSNE", "GRP", "SRP".
    dt : pandas.DataFrame
        Source frame; it is not modified, a joined copy is returned.
    n_component : int
        Number of components to compute.
    cols : list-like of str
        Columns of ``dt`` fed to the reducer.
    name_feature : str
        Tag used in the new column names,
        ``DR_<method>_<name_feature>_<i>`` with ``i`` running from 1.
    random_state : int, default 420
        Seed handed to the estimator.

    Returns
    -------
    pandas.DataFrame
        ``dt`` with ``n_component`` additional columns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # factories are called lazily so only the requested estimator is built
    reducers = {
        "PCA": lambda: decomposition.PCA(n_components = n_component, random_state = random_state),
        "ICA": lambda: decomposition.FastICA(n_components = n_component, random_state = random_state),
        "SVD": lambda: decomposition.TruncatedSVD(n_components = n_component, random_state = random_state),
        "FA": lambda: decomposition.FactorAnalysis(n_components = n_component, random_state = random_state),
        "TSNE": lambda: manifold.TSNE(n_components = n_component, random_state = random_state),
        "GRP": lambda: random_projection.GaussianRandomProjection(n_components = n_component, eps = 0.1, random_state = random_state),
        "SRP": lambda: random_projection.SparseRandomProjection(n_components = n_component, dense_output = True, random_state = random_state),
    }
    if method not in reducers:
        # previously an unknown name surfaced as an UnboundLocalError on `dr`
        raise ValueError("Unknown method '{}'; expected one of: {}".format(method, ", ".join(reducers)))
    dr = reducers[method]()
    mx = dr.fit_transform(dt[cols])
    names = ["DR_" + method + "_" + name_feature + "_" + str(i) for i in range(1, n_component + 1)]
    # reuse dt's index so the join stays row-aligned even when dt does not
    # carry a default 0..n-1 index
    return dt.join(pd.DataFrame(mx, columns = names, index = dt.index))

#### 4.3.1 PCA

##### 4.3.1.1 Raw binary cols

In [25]:
# PCA: 10 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "PCA", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

(8418, 628)

##### 4.3.1.2 Encoded cat cols

In [26]:
# PCA: 10 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "PCA", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 638)

##### 4.3.1.3 Feature engineered cols

In [27]:
# Anchor the pattern at the start of the name: generated columns are called
# DR_<method>_FeatEng_<i>, also contain "FeatEng_", and must never be picked up as inputs.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 641)

##### 4.3.1.4 All cols

In [28]:
# PCA: 15 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "PCA", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 656)

#### 4.3.2 ICA

##### 4.3.2.1 Raw binary cols

In [29]:
# ICA: 10 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "ICA", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape



(8418, 666)

##### 4.3.2.2 Encoded cat cols

In [30]:
# ICA: 10 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "ICA", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 676)

##### 4.3.2.3 Feature engineered cols

In [31]:
# Anchor the pattern at the start of the name: the unanchored "FeatEng_" also matched the
# DR_PCA_FeatEng_* columns created earlier, feeding PCA outputs into this ICA.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 679)

##### 4.3.2.4 All cols

In [32]:
# ICA: 15 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "ICA", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 694)

#### 4.3.3 SVD

##### 4.3.3.1 Raw binary cols

In [33]:
# SVD: 10 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "SVD", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

(8418, 704)

##### 4.3.3.2 Encoded cat cols

In [34]:
# SVD: 10 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "SVD", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 714)

##### 4.3.3.3 Feature engineered cols

In [35]:
# Anchor the pattern at the start of the name: the unanchored "FeatEng_" also matched the
# DR_<method>_FeatEng_* columns created earlier, feeding their outputs into this SVD.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("SVD", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 717)

##### 4.3.3.4 All cols

In [36]:
# SVD: 15 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "SVD", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 732)

#### 4.3.4 FA

##### 4.3.4.1 Raw binary cols

In [37]:
# FA: 10 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "FA", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

(8418, 742)

##### 4.3.4.2 Encoded cat cols

In [38]:
# FA: 10 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "FA", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 752)

##### 4.3.4.3 Feature engineered cols

In [39]:
# Anchor the pattern at the start of the name: the unanchored "FeatEng_" also matched the
# DR_<method>_FeatEng_* columns created earlier, feeding their outputs into this FA.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("FA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 755)

##### 4.3.4.4 All cols

In [40]:
# FA: 15 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "FA", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 770)

#### 4.3.5 t-SNE

##### 4.3.5.1 Raw binary cols

In [41]:
# t-SNE: 2 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "TSNE", dt = dt_all_eng, n_component = 2, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

(8418, 772)

##### 4.3.5.2 Encoded cat cols

In [42]:
# t-SNE: 2 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "TSNE", dt = dt_all_eng, n_component = 2, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 774)

##### 4.3.5.3 Feature engineered cols

In [43]:
# Anchor the pattern at the start of the name: the unanchored "FeatEng_" also matched the
# DR_<method>_FeatEng_* columns created earlier, feeding their outputs into this t-SNE.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("TSNE", dt_all_eng, 2, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 776)

##### 4.3.5.4 All cols

In [44]:
# t-SNE: 2 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "TSNE", dt = dt_all_eng, n_component = 2, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 778)

#### 4.3.6 GRP

##### 4.3.6.1 Raw binary cols

In [45]:
# GRP: 10 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "GRP", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

(8418, 788)

##### 4.3.6.2 Encoded cat cols

In [46]:
# GRP: 10 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "GRP", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 798)

##### 4.3.6.3 Feature engineered cols

In [47]:
# Anchor the pattern at the start of the name: the unanchored "FeatEng_" also matched the
# DR_<method>_FeatEng_* columns created earlier, feeding their outputs into this GRP.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("GRP", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 801)

##### 4.3.6.4 All cols

In [48]:
# GRP: 15 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "GRP", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 816)

#### 4.3.7 SRP

##### 4.3.7.1 Raw binary cols

In [49]:
# SRP: 10 components from the cols_raw columns
dt_all_eng = featEng_dimRed(method = "SRP", dt = dt_all_eng, n_component = 10, cols = cols_raw, name_feature = "Raw_Bin")
dt_all_eng.shape

(8418, 826)

##### 4.3.7.2 Encoded cat cols

In [50]:
# SRP: 10 components from the columns whose name contains "Encode_"
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed(method = "SRP", dt = dt_all_eng, n_component = 10, cols = cols_encode, name_feature = "Encoded_Cat")
dt_all_eng.shape

(8418, 836)

##### 4.3.7.3 Feature engineered cols

In [51]:
# Anchor the pattern at the start of the name: the unanchored "FeatEng_" also matched the
# DR_<method>_FeatEng_* columns created earlier, feeding their outputs into this SRP.
cols_featEng = dt_all_eng.filter(regex = "^FeatEng_").columns.values
dt_all_eng = featEng_dimRed("SRP", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 839)

##### 4.3.7.4 All cols

In [52]:
# SRP: 15 components from every column whose name does not contain "DR"
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed(method = "SRP", dt = dt_all_eng, n_component = 15, cols = cols_all, name_feature = "All")
dt_all_eng.shape

(8418, 854)

### 4.5 Feature Interaction

In [125]:
# Two-way interactions: keep the XOR of two cols_raw columns as a new feature when its
# absolute Spearman correlation with y (train rows only) beats the better of the two
# parent columns by a factor `const`.
dt_interest = dt_all_eng[cols_raw]
isTrain = (dt_all_cleaned["IsTrainTest"] == "train").values
y_train = dt_all_cleaned.y.values[isTrain]
# Loop-invariant work, done once per column instead of once per pair:
# the int64 cast and the column's own correlation with y.
vals_raw = {c: dt_interest[c].values.astype("int64") for c in cols_raw}
cor_raw = {c: abs(stats.spearmanr(vals_raw[c][isTrain], y_train)[0]) for c in cols_raw}
fi_names = []
fi_values = {}
# Visit every unordered pair exactly once, in cols_raw order (c1 before c2).
for i, c1 in enumerate(cols_raw):
    for c2 in cols_raw[i + 1:]:
        # XOR
        XOR = vals_raw[c1] ^ vals_raw[c2]
        cor_XOR = abs(stats.spearmanr(XOR[isTrain], y_train)[0])
        # constant: the stronger the XOR correlation, the smaller the required margin
        if cor_XOR > .3:
            const = 1.1
        elif cor_XOR > .2:
            const = 1.3
        else:
            const = 1.5
        # a NaN correlation (constant input) never satisfies this comparison
        if cor_XOR > max(cor_raw[c1], cor_raw[c2]) * const:
            name_fi = "FeatEng_FI_2way_XOR_" + c1 + "_" + c2
            fi_names.append(name_fi)
            fi_values[name_fi] = XOR
# Build the frame once; sharing dt_all_eng's index keeps a later index join row-aligned.
dt_fi_2way = pd.DataFrame(fi_values, index = dt_all_eng.index, columns = fi_names)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [126]:
# preview the first five rows of the selected XOR interaction features
dt_fi_2way.head(n = 5)

Unnamed: 0,FeatEng_FI_2way_XOR_X12_X178,FeatEng_FI_2way_XOR_X12_X250,FeatEng_FI_2way_XOR_X13_X351,FeatEng_FI_2way_XOR_X14_X66,FeatEng_FI_2way_XOR_X14_X80,FeatEng_FI_2way_XOR_X14_X98,FeatEng_FI_2way_XOR_X14_X101,FeatEng_FI_2way_XOR_X14_X126,FeatEng_FI_2way_XOR_X14_X128,FeatEng_FI_2way_XOR_X14_X179,...,FeatEng_FI_2way_XOR_X350_X375,FeatEng_FI_2way_XOR_X351_X377,FeatEng_FI_2way_XOR_X356_X377,FeatEng_FI_2way_XOR_X358_X359,FeatEng_FI_2way_XOR_X358_X362,FeatEng_FI_2way_XOR_X358_X375,FeatEng_FI_2way_XOR_X359_X374,FeatEng_FI_2way_XOR_X366_X374,FeatEng_FI_2way_XOR_X366_X384,FeatEng_FI_2way_XOR_X371_X378
0,0,0,1,0,0,0,0,0,1,1,...,0,1,1,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,0,1,0,...,1,0,0,0,0,1,0,0,0,0
2,0,1,0,0,1,1,1,0,1,1,...,1,0,0,1,1,1,0,0,0,0
3,0,1,0,0,1,1,1,0,1,1,...,1,0,0,1,1,1,0,0,0,0
4,0,1,0,0,1,1,1,0,1,1,...,1,0,0,1,1,1,0,0,0,1


In [129]:
# append the interaction features (index-aligned left join)
dt_all_eng = dt_all_eng.join(other = dt_fi_2way, how = "left")

### 4.6 Save dt_all_eng

In [None]:
# Re-attach ID, y and IsTrainTest from dt_all_cleaned (index-aligned left join).
# NOTE(review): assumes both frames still share the same row order / 0..n-1 index -- confirm.
dt_all_eng = dt_all_eng.join(
    other = dt_all_cleaned[["ID", "y", "IsTrainTest"]],
    how = "left",
)

In [130]:
# ID, y and IsTrainTest are removed from dt_all_eng by the min-max scaling step and only
# come back through the join with dt_all_cleaned; refuse to write a file without them.
cols_missing = [c for c in ["ID", "y", "IsTrainTest"] if c not in dt_all_eng.columns]
if cols_missing:
    raise ValueError("dt_all_eng is missing {}; join them from dt_all_cleaned before saving".format(cols_missing))
dt_all_eng.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_eng.csv", index = False)