In [1]:
import pandas as pd
import numpy as np
import math
import pickle
import category_encoders as ce
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model, decomposition
from sklearn.metrics import r2_score

## 1. Load

In [2]:
# load data
dt_train_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv")
dt_test_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv")

In [3]:
print(dt_train_raw.shape, dt_test_raw.shape)

(4209, 378) (4209, 377)


## 2. Transform

In [4]:
# marke train and test set
dt_train_raw.loc[:, "IsTrainTest"] = pd.Series("train", index = dt_train_raw.index)
dt_test_raw.loc[:, "IsTrainTest"] = pd.Series("test", index = dt_test_raw.index)

# change test index and add y
dt_test_raw.index = dt_test_raw.index + max(dt_train_raw.index) + 1
dt_test_raw.loc[:,"y"] = pd.Series([0.0] * dt_test_raw.shape[0], index = dt_test_raw.index)

# concat
dt_test_raw = dt_test_raw[dt_train_raw.columns.values]
dt_all_raw = pd.concat([dt_train_raw, dt_test_raw])

In [5]:
dt_all_raw.shape

(8418, 379)

## 3. Preprocess

### 3.1. Duplicated cols

#### 3.1.1 Drop dup cols in dt_all

In [6]:
# duplicated cols in dt_all
cols_dup_all_toDrop = dt_all_raw.T.duplicated()[dt_all_raw.T.duplicated() == True].index.values
dt_all_raw = dt_all_raw.drop(cols_dup_all_toDrop, axis = 1)

In [7]:
dt_all_raw.shape

(8418, 343)

#### 3.1.2 Rename the remaining dup cols

In [8]:
# duplicated cols in dt_train
cols_dup_train = dt_train_raw.T.duplicated(keep = False)[dt_train_raw.T.duplicated(keep = False) == True].index.values
# duplicated cols in dt_test
cols_dup_test = dt_test_raw.T.duplicated(keep = False)[dt_test_raw.T.duplicated(keep = False) == True].index.values

In [9]:
# change col names for cols_dup_train and cols_dup_test
dict_dup_train = {x: "dup_train_" + x for x in list(cols_dup_train)}
dt_all_raw = dt_all_raw.rename(columns = dict_dup_train)
dict_dup_test = {x: "dup_test_" + x for x in list(cols_dup_test[cols_dup_test != "y"])}
dt_all_raw = dt_all_raw.rename(columns = dict_dup_test)

In [10]:
dt_all_raw.shape

(8418, 343)

### 3.2 Duplicated rows

In [11]:
# cols_cat
cols_cat = dt_all_raw.drop("IsTrainTest", axis = 1).select_dtypes(include = ['object']).columns.values
# cols_int
cols_int = dt_all_raw.drop("ID", axis = 1).select_dtypes(include = ['int64']).columns

In [12]:
dt_all_raw.loc[:, "IsDupRow_All"] = dt_all_raw.drop(["ID", "y"], axis = 1).duplicated(keep = False).astype("int64")
dt_all_raw.loc[:, "IsDupRow_Cat"] = dt_all_raw.drop(["ID", "y"], axis = 1)[cols_cat].duplicated(keep = False).astype("int64")
dt_all_raw.loc[:, "IsDupRow_Int"] = dt_all_raw.drop(["ID", "y"], axis = 1)[cols_int].duplicated(keep = False).astype("int64")

In [13]:
dt_all_raw.shape

(8418, 346)

### 3.2 Remove single values

In [14]:
# single value cols in dt_train
cols_single_train = []
for col in dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"].drop(["y", "IsTrainTest"], axis = 1).columns.values:
    len_unique = len(np.unique(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"][col].values))
    if len_unique == 1:
        cols_single_train.append(col)
# single value cols in dt_test
cols_single_test = []
for col in dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "test"].drop(["y", "IsTrainTest"], axis = 1).columns.values:
    len_unique = len(np.unique(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "test"][col].values))
    if len_unique == 1:
        cols_single_test.append(col)

In [15]:
# change col names for cols_single_train and cols_single_test
dict_single_train = {x: "single_train_" + x for x in cols_single_train}
dt_all_raw = dt_all_raw.rename(columns = dict_single_train)
dict_single_test = {x: "single_test_" + x for x in cols_single_test}
dt_all_raw = dt_all_raw.rename(columns = dict_single_test)

In [16]:
dt_all_raw.shape

(8418, 346)

### 3.3 Remove complimentary cols

In [17]:
cols_int = dt_all_raw.drop("ID", axis = 1).select_dtypes(include = ['int64']).columns

In [18]:
def removeCompCols(dt, cols):
    seen = []
    col2s = []
    nrow = dt.shape[0]
    for col1 in cols_int:
        for col2 in cols_int:
            compliment = sum(dt[col1].values + dt[col2].values)
            same = np.sum(dt[col1] == dt[col2])
            if (compliment == nrow) & (same == 0):
                seen.append((col1, col2))
                if (col2, col1) not in seen:
                    col2s.append(col2)
                    print(col1, col2)
    return col2s

In [19]:
cols_comp = removeCompCols(dt_all_raw, cols_int)

X128 X130
X156 X157
X204 X205
dup_train_X232 dup_test_X263


In [20]:
dt_all_raw = dt_all_raw.drop(cols_comp, axis = 1)

In [21]:
dt_all_raw.shape

(8418, 342)

### 3.4 Save cols_raw

In [22]:
# cols_raw
cols_raw = dt_all_raw.drop(["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"
                            , "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"], axis = 1).columns.values
pickle.dump(cols_raw, open( "../../data/Mercedes_Benz_Greener_Manufacturing/data/cols_raw.pkl", "wb"))

### 3.5 Encode cat cols

In [23]:
# cols_cat
cols_cat = dt_all_raw.drop("IsTrainTest", axis = 1).select_dtypes(include = ['object']).columns.values

#### 3.3.1 One-Hot Encoding

In [24]:
dt_cat_onehot = pd.get_dummies(dt_all_raw[cols_cat])
dict_ohe = {x: "Encode_ohe_" + x for x in dt_cat_onehot.columns.values}
dt_cat_onehot = dt_cat_onehot.rename(columns = dict_ohe)

#### 3.3.2 TargetMean Encoding

In [25]:
# oof to encode cols_cat with TargetMean
def getTargetMean(dt, dt_all, cols, k = 3):

    # init dt_targetMean
    dt_targetMean = pd.DataFrame()

    for col in cols:
    
        # init dt_targetMean_oof
        dt_targetMean_oof = pd.DataFrame()
        
        # X_targetMean_Kfold, y_targetMean_Kfold
        X_targetMean_Kfold = dt[[col, "y"]]
        y_targetMean_Kfold = dt[col].values

        # oof cv
        skf = StratifiedKFold(n_splits = k)
        
        
        for i, (ind_in, ind_out) in enumerate(skf.split(X_targetMean_Kfold, y_targetMean_Kfold)):


            # init dt_targetMean_oof
            dt_targetMean_val = pd.DataFrame()

            # X_in, X_out, y_in, y_out
            X_in, X_out = X_targetMean_Kfold.iloc[ind_in], X_targetMean_Kfold.iloc[ind_out]
            y_in, y_out = y_targetMean_Kfold[ind_in], y_targetMean_Kfold[ind_out]

            # calc TargetMean
            for val in set(X_in[col].values):
                dt_targetMean_temp = pd.DataFrame({"Value": [val]
                                                   , "TargetMean_" + str(i): [X_in.loc[X_in[col] == val].y.mean()]})
                dt_targetMean_val = pd.concat([dt_targetMean_val, dt_targetMean_temp])

            # merge with oof
            if i == 0:
                dt_targetMean_oof = pd.merge(X_targetMean_Kfold.drop("y", axis = 1).drop_duplicates(), dt_targetMean_val
                                             , how = "left", left_on = col, right_on = "Value")
                dt_targetMean_oof = dt_targetMean_oof.drop(col, axis = 1)
            else:
                dt_targetMean_oof = pd.merge(dt_targetMean_oof, dt_targetMean_val
                                             , how = "left", on = "Value")

        # move Value to the first column
        value = dt_targetMean_oof['Value']
        dt_targetMean_oof.drop(labels = ["Value"], axis = 1,inplace = True)
        dt_targetMean_oof.insert(0, 'Value', value)
        # assign col
        dt_targetMean_oof.insert(0, 'Col', col)
        
        # concat with col
        dt_targetMean = pd.concat([dt_targetMean, dt_targetMean_oof])
    
    # mean of oof
    dt_targetMean["TargetMean"] = dt_targetMean.filter(regex = "TargetMean").mean(axis = 1)
    # fill zero
    dt_targetMean = dt_targetMean.fillna(0)
    # remove oof cols
    dt_targetMean = dt_targetMean[["Col", "Value", "TargetMean"]]

    # merge to original table
    dt_cat_cols = dt_all[cols]
    
    for col in cols_cat:
        dt_cat_cols = pd.merge(dt_cat_cols, dt_targetMean.loc[dt_targetMean["Col"] == col]
                           , how = "left", left_on = col, right_on = "Value")
        dt_cat_cols = dt_cat_cols.drop(["Value", "Col"], axis = 1)
        dt_cat_cols = dt_cat_cols.rename(columns = {"TargetMean": "Encode_TargetMean_" + col})
    # fill zero
    dt_cat_cols = dt_cat_cols.fillna(0)
    
    # only select targetMean cols
    dt_cat_targetMean = dt_cat_cols.filter(regex = "Encode_TargetMean_")
        
    return dt_cat_targetMean

In [26]:
## remove the outlier and tm
dt_cat_targetMean = getTargetMean(dt_all_raw.loc[(dt_all_raw["IsTrainTest"] == "train") & (dt_all_raw["ID"] != 1770)]
                                  , dt_all_raw
                                  , cols_cat
                                  , 3)



#### 3.3.3 Frequency Encoding

In [27]:
def getFrequency(dt, cols):
    # calc the cols freq
    dt_cat_freq = pd.DataFrame()
    for col in cols:
        dt_col_freq_temp = pd.DataFrame({"Col": col
                                         , "Value": dt[col].value_counts().index.values
                                         , "Freq": dt[col].value_counts().values})

        dt_cat_freq = pd.concat([dt_cat_freq, dt_col_freq_temp])

    # merge to origin
    dt_cat_cols = dt_all_raw[cols]
    for col in cols:
        dt_cat_cols = pd.merge(dt_cat_cols, dt_cat_freq.loc[dt_cat_freq["Col"] == col]
                           , how = "left", left_on = col, right_on = "Value")
        dt_cat_cols = dt_cat_cols.drop(["Value", "Col"], axis = 1)
        dt_cat_cols = dt_cat_cols.rename(columns = {"Freq": "Encode_Freq_" + col})

    dt_cat_cols = dt_cat_cols.filter(regex = "Encode_Freq_")
    
    return dt_cat_cols

In [28]:
dt_cat_freq = getFrequency(dt_all_raw, cols_cat)

#### 3.3.3 Binary

In [29]:
encode_binary = ce.BinaryEncoder(cols_cat)
dt_cat_binary = encode_binary.fit_transform(dt_all_raw[cols_cat])
dt_cat_binary.columns = "Encode_Binary" + dt_cat_binary.columns

  X[col] = X[col].astype(int).reshape(-1, )
  X[switch.get('col')] = X[switch.get('col')].astype(int).reshape(-1, )


#### 3.3.4 Ordinal X0

In [30]:
def getOrdinal(dt, col):
    dt_ordinal = pd.DataFrame()
    for val in set(dt[col].values):
        dt_ordinal_temp = pd.DataFrame({"Value": [val]
                                        , "Encode_Ordinal_" + col: dt.loc[dt[col] == val].y.mean()})
        dt_ordinal = pd.concat([dt_ordinal, dt_ordinal_temp])

    dt_cat_ordinal = pd.merge(dt[col].to_frame(), dt_ordinal
                              , how = "left", left_on = col, right_on = "Value")
    dt_cat_ordinal = dt_cat_ordinal.drop([col, "Value"], axis = 1)
    return dt_cat_ordinal

In [31]:
dt_cat_ordinal_X0 = getOrdinal(dt_all_raw, "X0")

#### 3.3.5 Combine all encoders

In [32]:
# OHE
dt_all_encoded = dt_all_raw.drop(cols_cat, axis = 1).join(dt_cat_onehot)
dt_all_encoded.shape

(8418, 545)

In [33]:
# TargetMean
dt_all_encoded = dt_all_encoded.join(dt_cat_targetMean)
dt_all_encoded.shape

(8418, 553)

In [34]:
# Frequency
dt_all_encoded = dt_all_encoded.join(dt_cat_freq)
dt_all_encoded.shape

(8418, 561)

In [35]:
# Binary
dt_all_encoded = dt_all_encoded.join(dt_cat_binary)
dt_all_encoded.shape

(8418, 598)

In [36]:
# Ordinal X0
dt_all_encoded = dt_all_encoded.join(dt_cat_ordinal_X0)
dt_all_encoded.shape

#### 3.3.5 Save dt_all_raw, dt_all_encoded

In [38]:
dt_all_encoded.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv", index = False)
dt_all_raw.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_raw.csv", index = False)

## 4. Feature Engineering

In [54]:
# read dt_all_raw
dt_all_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_raw.csv")
dt_all_raw.shape

(8418, 342)

In [55]:
# read dt_all_encoded
dt_all_encoded = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_encoded.csv")
dt_all_encoded.shape

(8418, 599)

In [56]:
# read cols_raw
cols_raw = pickle.load(open("../../data/Mercedes_Benz_Greener_Manufacturing/data/cols_raw.pkl", "rb"))
len(cols_raw)

328

### 4.1 Outlier marker

In [57]:
def getOutlierMarker(dt, index_outlier = 883):
    # outlier
    dt_outlier = dt.loc[dt.index.values == index_outlier]
    # calc int_outlierMarker
    int_outlierMarker = np.zeros(dt.shape[0])
    for col in dt_outlier.columns.values:
        for val in dt_outlier[col].values:
            int_outlierMarker = int_outlierMarker + (dt[col].values == val).astype("int64")
    
    return int_outlierMarker

#### 4.1.1 OutlierMarker_Cat

In [58]:
cols_cat = dt_all_raw.drop(["ID", "y", "IsTrainTest"], axis = 1).select_dtypes(include = ['object']).columns.values
cols_cat

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype=object)

In [59]:
int_outlierMarker_cat = getOutlierMarker(dt_all_raw[cols_cat])

#### 4.1.2 OutlierMarker_Int

In [60]:
cols_int = dt_all_raw.drop(["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"], axis = 1).select_dtypes(include = ['int64']).columns.values
cols_int[:10]

array(['X10', 'single_train_dup_train_X11', 'X12', 'X13', 'X14',
       'dup_test_X15', 'X16', 'dup_train_X17', 'X18', 'X19'], dtype=object)

In [61]:
int_outlierMarker_int = getOutlierMarker(dt_all_raw[cols_int])

#### 4.1.3 OutlierMarker_All

In [62]:
cols_all = dt_all_raw.drop(["ID", "y", "IsTrainTest", "IsDupRow_All", "IsDupRow_Cat", "IsDupRow_Int"], axis = 1).columns.values
cols_all[:10]

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       'single_train_dup_train_X11'], dtype=object)

In [63]:
int_outlierMarker_all = getOutlierMarker(dt_all_raw[cols_all])

#### 4.1.4 OutlierMarker_X0

In [64]:
cols_X0 = "X0"
int_outlierMarker_x0 = getOutlierMarker(dt_all_raw[cols_X0].to_frame())

In [65]:
dt_all_encoded.loc[:, "FeatEng_OutlierMarker_Cat"] = int_outlierMarker_cat
dt_all_encoded.loc[:, "FeatEng_OutlierMarker_Int"] = int_outlierMarker_int
dt_all_encoded.loc[:, "FeatEng_OutlierMarker_All"] = int_outlierMarker_all
dt_all_encoded.loc[:, "FeatEng_OutlierMarker_X0"] = int_outlierMarker_x0

In [66]:
dt_all_encoded.head()

Unnamed: 0,ID,y,X10,single_train_dup_train_X11,X12,X13,X14,dup_test_X15,X16,dup_train_X17,...,Encode_BinaryX8_0,Encode_BinaryX8_1,Encode_BinaryX8_2,Encode_BinaryX8_3,Encode_BinaryX8_4,Encode_Ordinal_X0,FeatEng_OutlierMarker_Cat,FeatEng_OutlierMarker_Int,FeatEng_OutlierMarker_All,FeatEng_OutlierMarker_X0
0,0,130.81,0,0,0,1,0,0,0,0,...,0,0,0,1,0,47.583043,1.0,271.0,272.0,0.0
1,6,88.53,0,0,0,0,0,0,0,0,...,0,0,0,1,0,47.583043,2.0,292.0,294.0,0.0
2,7,76.26,0,0,0,0,0,0,0,1,...,0,0,1,0,1,40.638304,1.0,266.0,267.0,0.0
3,9,80.62,0,0,0,0,0,0,0,0,...,0,0,1,1,0,40.638304,3.0,271.0,274.0,0.0
4,13,78.02,0,0,0,0,0,0,0,0,...,1,0,0,0,0,40.638304,2.0,275.0,277.0,0.0


### 4.2 Sum of binary cols

#### 4.2.1 Sum of all binary cols

In [67]:
sum_binary_all = dt_all_encoded[cols_raw].sum(axis = 1)

#### 4.2.2 Sum of correlation-important binary cols

In [68]:
def corBin_Contin(dt, cols):
    dt_binary_pointbiserialr = pd.DataFrame()
    for col in cols:
        cor_pb = stats.pointbiserialr(dt[col].values, dt.y.values)
        dt_binary_pointbiserialr = pd.concat([dt_binary_pointbiserialr
                                             , pd.DataFrame({"Col": col
                                                            , "Cor": np.abs([cor_pb.correlation])
                                                            , "P": [cor_pb.pvalue]})])
    return dt_binary_pointbiserialr.sort_values("Cor", ascending = False)

In [69]:
dt_corBin_Cotin = corBin_Contin(dt_all_encoded, cols_raw)

In [70]:
cols_binary_important_3 = dt_corBin_Cotin["Col"].values[:3]
cols_binary_important_5 = dt_corBin_Cotin["Col"].values[:5]
cols_binary_important_10 = dt_corBin_Cotin["Col"].values[:10]
cols_binary_important_20 = dt_corBin_Cotin["Col"].values[:20]
cols_binary_important_50 = dt_corBin_Cotin["Col"].values[:50]

In [71]:
sum_binary_important_3 = dt_all_encoded[cols_binary_important_3].sum(axis = 1)
sum_binary_important_5 = dt_all_encoded[cols_binary_important_5].sum(axis = 1)
sum_binary_important_10 = dt_all_encoded[cols_binary_important_10].sum(axis = 1)
sum_binary_important_20 = dt_all_encoded[cols_binary_important_20].sum(axis = 1)
sum_binary_important_50 = dt_all_encoded[cols_binary_important_50].sum(axis = 1)

In [72]:
# SumBin_All
dt_all_encoded.loc[:, "FeatEng_SumBin_All"] = sum_binary_all
# SumBin_Imp_X
dt_all_encoded.loc[:, "FeatEng_SumBin_Imp_3"] = sum_binary_important_3
dt_all_encoded.loc[:, "FeatEng_SumBin_Imp_5"] = sum_binary_important_5
dt_all_encoded.loc[:, "FeatEng_SumBin_Imp_10"] = sum_binary_important_10
dt_all_encoded.loc[:, "FeatEng_SumBin_Imp_20"] = sum_binary_important_20
dt_all_encoded.loc[:, "FeatEng_SumBin_Imp_50"] = sum_binary_important_50

### 4.3 Dimension reduction

In [106]:
# range
pp_range = preprocessing.MinMaxScaler()
mx_range = pp_range.fit_transform(dt_all_encoded.drop(["ID", "y", "IsTrainTest"], axis = 1))
dt_all_eng = pd.DataFrame(mx_range, columns = dt_all_encoded.drop(["ID", "y", "IsTrainTest"], axis = 1).columns.values)
dt_all_eng.shape

(8418, 606)

In [107]:
def featEng_dimRed(method, dt, n_component, cols, name_feature):
    if method == "PCA":
        dr = decomposition.PCA(n_components = n_component, random_state = 888)
    elif method == "ICA":
        dr = decomposition.FastICA(n_components = n_component, random_state = 888)
    elif method == "SVD":
        dr = decomposition.TruncatedSVD(n_components = n_component, random_state = 888)
    elif method == "FA":
        dr = decomposition.FactorAnalysis(n_components = n_component, random_state = 888)
    mx = dr.fit_transform(dt[cols])
    dt = dt.join(pd.DataFrame(mx, columns = ["DR_" + method + "_" + name_feature + "_" + str(i) for i in range(1, n_component + 1)]))
    
    return dt

#### 4.3.1 PCA

##### 4.3.1.1 Raw binary cols

In [108]:
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 15, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 621)

##### 4.3.1.2 Encoded cat cols

In [111]:
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 15, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 636)

##### 4.3.1.3 Feature engineed cols

In [112]:
cols_featEng = dt_all_eng.filter(regex = "FeatEng_").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 639)

##### 4.3.1.4 All cols

In [148]:
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("PCA", dt_all_eng, 20, cols_all, "All")
dt_all_eng.shape

(8418, 659)

#### 4.3.2 ICA

##### 4.3.2.1 Raw binary cols

In [149]:
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 15, cols_raw, "Raw_Bin")
dt_all_eng.shape

(8418, 674)

##### 4.3.2.2 Encoded cat cols

In [97]:
cols_encode = dt_all_eng.filter(regex = "Encode_").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 15, cols_encode, "Encoded_Cat")
dt_all_eng.shape

(8418, 689)

##### 4.3.2.3 Feature engineed cols

In [150]:
cols_featEng = dt_all_eng.filter(regex = "FeatEng_").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 3, cols_featEng, "FeatEng")
dt_all_eng.shape

(8418, 677)

##### 4.3.2.4 All cols

In [151]:
cols_all = dt_all_eng.filter(regex = "^((?!DR).)*$").columns.values
dt_all_eng = featEng_dimRed("ICA", dt_all_eng, 20, cols_all, "All")
dt_all_eng.shape

(8418, 697)

#### 4.3.3 tsne

##### 4.3.3.1 Raw binary cols

##### 4.3.3.2 Encoded cat cols

##### 4.3.3.3 Feature engineed cols

##### 4.3.3.4 All cols

### 4.4 Save dt_all_eng

In [None]:
dt_all_eng = dt_all_encoded
dt_all_eng.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_eng.csv", index = False)

## 5. Model

In [None]:
# r^2
def r_2(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'score', r2_score(labels, preds)

### 4.1 ExtraTree

In [None]:
# extratree
params_extraTRee = {
    "n_jobs": 7
    , "n_estimators": 400
    , "max_depth": 3
    , "min_samples_split": 5
    , "random_state": 888
    , "verbose": 0
}
model_extra = ExtraTreesRegressor(**params_extraTRee)

### 4.2 xgboost

In [None]:
# params
params_xgb = {
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.8,
    "objective": 'reg:linear',
    "silent": 0
}
num_boost_round = 1000

### 4.3 Linear Regression

In [None]:
model_lr = linear_model.LinearRegression()

### 4.4 Ridge Regression

In [None]:
model_ridge = linear_model.Ridge()
params_ridge = {'alpha': [0,0.5,1,2,3,5]}

## 5. Cross-Validation strategy

In [None]:
# X, y, ID
X_train_all = dt_all.loc[dt_all["IsTrainTest"] == "train"].drop(["ID", "y", "IsTrainTest"], axis = 1)
y_train_all = dt_all.loc[dt_all["IsTrainTest"] == "train"].y.values
ID_train_all = dt_all.loc[dt_all["IsTrainTest"] == "train"].ID.values
print("X_train_all:", X_train_all.shape)

X_test = dt_all.loc[dt_all["IsTrainTest"] == "test"].drop(["ID", "y", "IsTrainTest"], axis = 1)
y_test = dt_all.loc[dt_all["IsTrainTest"] == "test"].y.values
ID_test = dt_all.loc[dt_all["IsTrainTest"] == "test"].ID.values
print("X_test:", X_test.shape)

dtest = xgb.DMatrix(X_test)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import randint as sp_randint

In [None]:
# extratree
params_extraTRee_randSearch = {
    "n_estimators": sp_randint(200, 1000)
    , "max_depth": sp_randint(3, 6)
    , "min_samples_split": sp_randint(3, 20)
    , "criteriion": ["gini", "entropy"]
}
model_extra_rs = ExtraTreesRegressor()

In [None]:
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 888)
rand_search = RandomizedSearchCV(model_extra_rs, params_extraTRee_randSearch
                                 , scoring = scorer, cv = 3, verbose = 1, n_jobs = 6)

In [None]:
scorer = make_scorer(r2_score)

In [None]:
rand_search.fit(X_train_all, y_train_all)

In [None]:
rand_search.score(X_train_all, y_train_all)

In [None]:
rand_search.best_params_

In [None]:
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 888)
skf.split(X_train_all, y_train_all)
presds_y = cross_val_predict(model_extra, X_train_all, bin_y, cv = skf, n_jobs = 7)

In [None]:
len(presds_y)

In [None]:
presds_y

In [None]:
r2_score(bin_y, presds_y)

### 5.1 Random split

In [None]:
score_rs_valid = 0
preds_rs_test = []
w_extra = .2
w_xgb = .5
w_ridge = .3
for i in range(0, 10):
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all
                                                          , test_size = 0.2, random_state = i)
    # extraTree
    print("extraTree ...")
    model_extra.fit(X_train, y_train)
    preds_extra_valid = model_extra.predict(X_valid)
    score_extra = r2_score(y_valid, preds_extra_valid)
    
    # xgboost
    print("xgboost ...")
    dtrain = xgb.DMatrix(X_train, label = y_train)
    dvalid = xgb.DMatrix(X_valid, label = y_valid)
    ls_watch =  [(dtrain, 'train'), (dvalid, 'eval')]
    model_xgb = xgb.train(params_xgb, dtrain, evals = ls_watch
                          , feval = r_2, maximize = True
                          , num_boost_round = num_boost_round
                          , early_stopping_rounds = 50, verbose_eval = 50)
    preds_xgb_valid = model_xgb.predict(dvalid)
    score_xgb = r2_score(y_valid, preds_xgb_valid)
    
    # ridge
    print("ridge ...")
    model_ridge = model_ridge.fit(X_train, y_train)
    preds_ridge_valid = model_ridge.predict(X_valid)
    score_ridge = r2_score(y_valid, preds_ridge_valid)

    # avg them
    ls_preds = [preds_extra_valid * w_extra, preds_xgb_valid * w_xgb, preds_ridge_valid * w_ridge]
    preds_rs_valid = [sum(e) for e in zip(*ls_preds)]
    
    score_rs = r2_score(y_valid, preds_rs_valid)
    
    score_rs_valid = score_rs_valid + score_rs / 10
    
    # test
    preds_extra_test = model_extra.predict(X_test)
    preds_xgb_test = model_xgb.predict(dtest)
    preds_ridge_test = model_ridge.predict(X_test)
    ls_preds_test = [preds_extra_test * w_extra, preds_xgb_test * w_xgb, preds_ridge_test * w_ridge]
    preds_rs_test = [sum(e) for e in zip(*ls_preds_test)]

    
    print("i: {} - extraTree:{}; xgb:{}; ridge:{}; rs_all:{}".format(i, round(score_extra, 5)
                                                                     , round(score_xgb, 5)
                                                                     , round(score_ridge, 5)
                                                                     , round(score_rs, 5)))

In [None]:
score_rs_valid

In [None]:
preds_rs_test[:10]

### 5.2 Stratified Kfold

In [None]:
bin_y = pd.qcut(y_train_all, 5, labels = [1, 2, 3, 4, 5]).astype("int64")
# stratified kfold
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 888)

In [None]:
score_skf_valid = 0
preds_skf_test = []
w_extra = .2
w_xgb = .5
w_ridge = .3
for i, (ind_train, ind_valid) in enumerate(skf.split(X_train_all, bin_y)):
    # X, y
    X_train, X_valid = X_train_all.iloc[ind_train], X_train_all.iloc[ind_valid]
    y_train, y_valid = y_train_all[ind_train], y_train_all[ind_valid]
    
    # extraTree
    print("extraTree ...")
    model_extra.fit(X_train, y_train)
    preds_extra_valid = model_extra.predict(X_valid)
    score_extra = r2_score(y_valid, preds_extra_valid)
    
    # xgboost
    print("xgboost ...")
    dtrain = xgb.DMatrix(X_train, label = y_train)
    dvalid = xgb.DMatrix(X_valid, label = y_valid)
    ls_watch =  [(dtrain, 'train'), (dvalid, 'eval')]
    model_xgb = xgb.train(params_xgb, dtrain, evals = ls_watch
                          , feval = r_2, maximize = True
                          , num_boost_round = num_boost_round
                          , early_stopping_rounds = 50, verbose_eval = 50)
    preds_xgb_valid = model_xgb.predict(dvalid)
    score_xgb = r2_score(y_valid, preds_xgb_valid)
    
    # ridge
    print("ridge ...")
    model_ridge = model_ridge.fit(X_train, y_train)
    preds_ridge_valid = model_ridge.predict(X_valid)
    score_ridge = r2_score(y_valid, preds_ridge_valid)

    # avg them
    ls_preds = ls_preds = [preds_extra_valid * w_extra, preds_xgb_valid * w_xgb, preds_ridge_valid * w_ridge]
    preds_skf_valid = [sum(e) for e in zip(*ls_preds)]
    
    score_skf = r2_score(y_valid, preds_skf_valid)
    
    score_skf_valid = score_skf_valid + score_skf / 5
    
    # test
    preds_extra_test = model_extra.predict(X_test)
    preds_xgb_test = model_xgb.predict(dtest)
    preds_ridge_test = model_ridge.predict(X_test)
    ls_preds_test = [preds_extra_test * w_extra, preds_xgb_test * w_xgb, preds_ridge_test * w_ridge]
    preds_skf_test = [sum(e) for e in zip(*ls_preds_test)]

    
    print("i: {} - extraTree:{}; xgb:{}; ridge:{}; rs_skf:{}".format(i, round(score_extra, 5)
                                                                     , round(score_xgb, 5)
                                                                     , round(score_ridge, 5)
                                                                     , round(score_skf, 5)))

In [None]:
score_skf_valid

In [None]:
preds_skf_test[:10]

## 6. Submit

In [None]:
wt_rs = .3
wt_skf = .7
ls_submit_preds_test = [np.array(preds_rs_test) * wt_rs, np.array(preds_skf_test) * wt_skf]
preds_submit_test = [sum(e) for e in zip(*ls_submit_preds_test)]
dt_submit = pd.DataFrame({"ID": ID_test, "y": preds_submit_test})

In [None]:
dt_submit.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/submission/4_initModel_ohe_tm_cvrs_cvskf_3_7_modextra_modxgb_modridge_2_5_3.csv"
                              , index = False)