In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

## 1. Load

In [2]:
# load data
dt_train_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv")
dt_test_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv")

## 2. Transform

In [3]:
# marke train and test set
dt_train_raw.loc[:, "IsTrainTest"] = pd.Series("train", index = dt_train_raw.index)
dt_test_raw.loc[:, "IsTrainTest"] = pd.Series("test", index = dt_test_raw.index)

# change test index and add y
dt_test_raw.index = dt_test_raw.index + max(dt_train_raw.index) + 1
dt_test_raw.loc[:,"y"] = pd.Series([0.0] * dt_test_raw.shape[0], index = dt_test_raw.index)

# concat
dt_test_raw = dt_test_raw[dt_train_raw.columns.values]
dt_all_raw = pd.concat([dt_train_raw, dt_test_raw])

## 3. Preprocess

### 3.1. Remove duplicates

In [4]:
# duplicated cols in dt_all
cols_dup_all = dt_all_raw.T.duplicated()[dt_all_raw.T.duplicated() == True].index.values
# duplicated cols in dt_train
cols_dup_train = dt_train_raw.T.duplicated()[dt_train_raw.T.duplicated() == True].index.values
# duplicated cols in dt_test
cols_dup_test = dt_test_raw.T.duplicated()[dt_test_raw.T.duplicated() == True].index.values

In [5]:
# drop all cols_dup_all
dt_all_raw = dt_all_raw.drop(cols_dup_all, axis = 1)
# change col names for cols_dup_train and cols_dup_test
dict_dup_train = {x: "dup_train_" + x for x in list(set(cols_dup_train) - set(cols_dup_all))}
dt_all_raw = dt_all_raw.rename(columns = dict_dup_train)
dict_dup_test = {x: "dup_test_" + x for x in list(set(cols_dup_test) - set(cols_dup_all))}
dt_all_raw = dt_all_raw.rename(columns = dict_dup_test)

### 3.2 Remove single values

In [6]:
# single value cols in dt_train
cols_single_train = []
for col in dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"].drop(["y", "IsTrainTest"], axis = 1).columns.values:
    len_unique = len(np.unique(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"][col].values))
    if len_unique == 1:
        cols_single_train.append(col)
# single value cols in dt_test
cols_single_test = []
for col in dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "test"].drop(["y", "IsTrainTest"], axis = 1).columns.values:
    len_unique = len(np.unique(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "test"][col].values))
    if len_unique == 1:
        cols_single_test.append(col)

In [7]:
# change col names for cols_single_train and cols_single_test
dict_single_train = {x: "single_train_" + x for x in cols_single_train}
dt_all_raw = dt_all_raw.rename(columns = dict_single_train)
dict_single_test = {x: "single_test_" + x for x in cols_single_test}
dt_all_raw = dt_all_raw.rename(columns = dict_single_test)

### 3.3 Encode cat cols

In [8]:
# cols_cat
cols_cat = dt_all_raw.drop("IsTrainTest", axis = 1).select_dtypes(include = ['object']).columns.values

#### 3.3.1 One-Hot encoding

In [9]:
dt_cat_onehot = pd.get_dummies(dt_all_raw[cols_cat])
dict_ohe = {x: "ohe_" + x for x in dt_cat_onehot.columns.values}
dt_cat_onehot = dt_cat_onehot.rename(columns = dict_ohe)

#### 3.3.2 TargetMean

In [10]:
# oof to encode cols_cat with TargetMean
def getTargetMean(dt, dt_all, cols, k = 3):

    # init dt_targetMean
    dt_targetMean = pd.DataFrame()

    for col in cols:
    
        # init dt_targetMean_oof
        dt_targetMean_oof = pd.DataFrame()
        
        # X_targetMean_Kfold, y_targetMean_Kfold
        X_targetMean_Kfold = dt[[col, "y"]]
        y_targetMean_Kfold = dt[col].values

        # oof cv
        skf = StratifiedKFold(n_splits = k)
        
        
        for i, (ind_in, ind_out) in enumerate(skf.split(X_targetMean_Kfold, y_targetMean_Kfold)):


            # init dt_targetMean_oof
            dt_targetMean_val = pd.DataFrame()

            # X_in, X_out, y_in, y_out
            X_in, X_out = X_targetMean_Kfold.iloc[ind_in], X_targetMean_Kfold.iloc[ind_out]
            y_in, y_out = y_targetMean_Kfold[ind_in], y_targetMean_Kfold[ind_out]

            # calc TargetMean
            for val in set(X_in[col].values):
                dt_targetMean_temp = pd.DataFrame({"Value": [val]
                                                   , "TargetMean_" + str(i): [X_in.loc[X_in[col] == val].y.mean()]})
                dt_targetMean_val = pd.concat([dt_targetMean_val, dt_targetMean_temp])

            # merge with oof
            if i == 0:
                dt_targetMean_oof = pd.merge(X_targetMean_Kfold.drop("y", axis = 1).drop_duplicates(), dt_targetMean_val
                                             , how = "left", left_on = col, right_on = "Value")
                dt_targetMean_oof = dt_targetMean_oof.drop(col, axis = 1)
            else:
                dt_targetMean_oof = pd.merge(dt_targetMean_oof, dt_targetMean_val
                                             , how = "left", on = "Value")

        # move Value to the first column
        value = dt_targetMean_oof['Value']
        dt_targetMean_oof.drop(labels = ["Value"], axis = 1,inplace = True)
        dt_targetMean_oof.insert(0, 'Value', value)
        # assign col
        dt_targetMean_oof.insert(0, 'Col', col)
        
        # concat with col
        dt_targetMean = pd.concat([dt_targetMean, dt_targetMean_oof])
    
    # mean of oof
    dt_targetMean["TargetMean"] = dt_targetMean.filter(regex = "TargetMean").mean(axis = 1)
    # fill zero
    dt_targetMean = dt_targetMean.fillna(0)
    # remove oof cols
    dt_targetMean = dt_targetMean[["Col", "Value", "TargetMean"]]

    # merge to original table
    dt_cat_cols = dt_all[cols]
    
    for col in cols_cat:
        dt_cat_cols = pd.merge(dt_cat_cols, dt_targetMean.loc[dt_targetMean["Col"] == col]
                           , how = "left", left_on = col, right_on = "Value")
        dt_cat_cols = dt_cat_cols.drop(["Value", "Col"], axis = 1)
        dt_cat_cols = dt_cat_cols.rename(columns = {"TargetMean": "TargetMean_" + col})
    # fill zero
    dt_cat_cols = dt_cat_cols.fillna(0)
    
    # only select targetMean cols
    dt_cat_targetMean = dt_cat_cols.filter(regex = "TargetMean")
        
    return dt_cat_targetMean

In [11]:
## TODO: remove the outlier
dt_cat_targetMean = getTargetMean(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"]
                                  , dt_all_raw
                                  , cols_cat
                                  , 3)



In [12]:
params_ctrl = {
    "encode": "tm"
}

In [14]:
dt_all = dt_all_raw.drop(cols_cat, axis = 1)

if params_ctrl["encode"] == "ohe":
    dt_all = dt_all.join(dt_cat_onehot)
elif params_ctrl["encode"] == "tm":
    dt_all = dt_all.join(dt_cat_targetMean)
elif params_ctrl["encode"] == "all":
    dt_all = dt_all.join(dt_cat_onehot)
    dt_all = dt_all.join(dt_cat_targetMean)
    

## 4. Model

In [17]:
# r^2
def r_2(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'score', r2_score(labels, preds)

### 4.1 ExtraTree

In [18]:
# extratree
params_extraTRee = {
    "n_jobs": 7
    , "n_estimators": 400
    , "max_depth": 3
    , "min_samples_split": 5
    , "random_state": 888
    , "verbose": 0
}
model_extra = ExtraTreesRegressor(**params_extraTRee)

### 4.2 xgboost

In [19]:
# params
params_xgb = {
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.8,
    "objective": 'reg:linear',
    "silent": 0
}
num_boost_round = 1000

### 4.3 Linear Regression

In [20]:
model_lr = linear_model.LinearRegression()

### 4.4 Ridge Regression

In [21]:
model_ridge = linear_model.Ridge()
params_ridge = {'alpha': [0,0.5,1,2,3,5]}

## 5. Cross-Validation strategy

In [22]:
# X, y, ID
X_train_all = dt_all.loc[dt_all["IsTrainTest"] == "train"].drop(["ID", "y", "IsTrainTest"], axis = 1)
y_train_all = dt_all.loc[dt_all["IsTrainTest"] == "train"].y.values
ID_train_all = dt_all.loc[dt_all["IsTrainTest"] == "train"].ID.values
print("X_train_all:", X_train_all.shape)

X_test = dt_all.loc[dt_all["IsTrainTest"] == "test"].drop(["ID", "y", "IsTrainTest"], axis = 1)
y_test = dt_all.loc[dt_all["IsTrainTest"] == "test"].y.values
ID_test = dt_all.loc[dt_all["IsTrainTest"] == "test"].ID.values
print("X_test:", X_test.shape)

dtest = xgb.DMatrix(X_test)

X_train_all: (4209, 340)
X_test: (4209, 340)


### 5.1 Random split

In [23]:
score_rs_valid = 0
preds_rs_test = []
w_extra = .2
w_xgb = .5
w_ridge = .3
for i in range(0, 10):
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all
                                                          , test_size = 0.2, random_state = i)
    # extraTree
    print("extraTree ...")
    model_extra.fit(X_train, y_train)
    preds_extra_valid = model_extra.predict(X_valid)
    score_extra = r2_score(y_valid, preds_extra_valid)
    
    # xgboost
    print("xgboost ...")
    dtrain = xgb.DMatrix(X_train, label = y_train)
    dvalid = xgb.DMatrix(X_valid, label = y_valid)
    ls_watch =  [(dtrain, 'train'), (dvalid, 'eval')]
    model_xgb = xgb.train(params_xgb, dtrain, evals = ls_watch
                          , feval = r_2, maximize = True
                          , num_boost_round = num_boost_round
                          , early_stopping_rounds = 50, verbose_eval = 50)
    preds_xgb_valid = model_xgb.predict(dvalid)
    score_xgb = r2_score(y_valid, preds_xgb_valid)
    
    # ridge
    print("ridge ...")
    model_ridge = model_ridge.fit(X_train, y_train)
    preds_ridge_valid = model_ridge.predict(X_valid)
    score_ridge = r2_score(y_valid, preds_ridge_valid)

    # avg them
    ls_preds = [preds_extra_valid * w_extra, preds_xgb_valid * w_xgb, preds_ridge_valid * w_ridge]
    preds_rs_valid = [sum(e) for e in zip(*ls_preds)]
    
    score_rs = r2_score(y_valid, preds_rs_valid)
    
    score_rs_valid = score_rs_valid + score_rs / 10
    
    # test
    preds_extra_test = model_extra.predict(X_test)
    preds_xgb_test = model_xgb.predict(dtest)
    preds_ridge_test = model_ridge.predict(X_test)
    ls_preds_test = [preds_extra_test * w_extra, preds_xgb_test * w_xgb, preds_ridge_test * w_ridge]
    preds_rs_test = [sum(e) for e in zip(*ls_preds_test)]

    
    print("i: {} - extraTree:{}; xgb:{}; ridge:{}; rs_all:{}".format(i, round(score_extra, 5)
                                                                     , round(score_xgb, 5)
                                                                     , round(score_ridge, 5)
                                                                     , round(score_rs, 5)))

extraTree ...
xgboost ...
[0]	train-rmse:95.7692	eval-rmse:96.7695	train-score:-58.3968	eval-score:-49.5241
Multiple eval metrics have been passed: 'eval-score' will be used for early stopping.

Will train until eval-score hasn't improved in 50 rounds.
[50]	train-rmse:10.5872	eval-rmse:12.3243	train-score:0.274106	eval-score:0.1805
[100]	train-rmse:6.92934	eval-rmse:9.85729	train-score:0.689047	eval-score:0.475752
[150]	train-rmse:6.44888	eval-rmse:9.91306	train-score:0.730673	eval-score:0.469803
Stopping. Best iteration:
[104]	train-rmse:6.88784	eval-rmse:9.85072	train-score:0.69276	eval-score:0.476451

ridge ...
i: 0 - extraTree:0.46599; xgb:0.46988; ridge:0.44943; rs_all:0.47269
extraTree ...
xgboost ...
[0]	train-rmse:95.9972	eval-rmse:95.8197	train-score:-56.5739	eval-score:-55.2061
Multiple eval metrics have been passed: 'eval-score' will be used for early stopping.

Will train until eval-score hasn't improved in 50 rounds.
[50]	train-rmse:10.856	eval-rmse:11.1365	train-score:0.2

In [24]:
score_rs_valid

0.56204027614182095

In [25]:
preds_rs_test[:10]

[85.213667742744121,
 99.531250940998177,
 85.913894392027899,
 73.870440616461778,
 103.74431340502706,
 90.294677750122275,
 109.39597114724799,
 85.35882555027051,
 109.12219967748547,
 84.458915033932811]

### 5.2 Stratified Kfold

In [26]:
bin_y = pd.qcut(y_train_all, 5, labels = [1, 2, 3, 4, 5]).astype("int64")
# stratified kfold
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 888)

In [27]:
score_skf_valid = 0
preds_skf_test = []
w_extra = .2
w_xgb = .5
w_ridge = .3
for i, (ind_train, ind_valid) in enumerate(skf.split(X_train_all, y_train_all)):
    # X, y
    X_train, X_valid = X_train_all.iloc[ind_train], X_train_all.iloc[ind_valid]
    y_train, y_valid = y_train_all[ind_train], y_train_all[ind_valid]
    
    # extraTree
    print("extraTree ...")
    model_extra.fit(X_train, y_train)
    preds_extra_valid = model_extra.predict(X_valid)
    score_extra = r2_score(y_valid, preds_extra_valid)
    
    # xgboost
    print("xgboost ...")
    dtrain = xgb.DMatrix(X_train, label = y_train)
    dvalid = xgb.DMatrix(X_valid, label = y_valid)
    ls_watch =  [(dtrain, 'train'), (dvalid, 'eval')]
    model_xgb = xgb.train(params_xgb, dtrain, evals = ls_watch
                          , feval = r_2, maximize = True
                          , num_boost_round = num_boost_round
                          , early_stopping_rounds = 50, verbose_eval = 50)
    preds_xgb_valid = model_xgb.predict(dvalid)
    score_xgb = r2_score(y_valid, preds_xgb_valid)
    
    # ridge
    print("ridge ...")
    model_ridge = model_ridge.fit(X_train, y_train)
    preds_ridge_valid = model_ridge.predict(X_valid)
    score_ridge = r2_score(y_valid, preds_ridge_valid)

    # avg them
    ls_preds = ls_preds = [preds_extra_valid * w_extra, preds_xgb_valid * w_xgb, preds_ridge_valid * w_ridge]
    preds_skf_valid = [sum(e) for e in zip(*ls_preds)]
    
    score_skf = r2_score(y_valid, preds_skf_valid)
    
    score_skf_valid = score_skf_valid + score_skf / 5
    
    # test
    preds_extra_test = model_extra.predict(X_test)
    preds_xgb_test = model_xgb.predict(dtest)
    preds_ridge_test = model_ridge.predict(X_test)
    ls_preds_test = [preds_extra_test * w_extra, preds_xgb_test * w_xgb, preds_ridge_test * w_ridge]
    preds_skf_test = [sum(e) for e in zip(*ls_preds_test)]

    
    print("i: {} - extraTree:{}; xgb:{}; ridge:{}; rs_skf:{}".format(i, round(score_extra, 5)
                                                                     , round(score_xgb, 5)
                                                                     , round(score_ridge, 5)
                                                                     , round(score_skf, 5)))



extraTree ...
xgboost ...
[0]	train-rmse:95.8291	eval-rmse:96.5364	train-score:-58.7504	eval-score:-48.3363
Multiple eval metrics have been passed: 'eval-score' will be used for early stopping.

Will train until eval-score hasn't improved in 50 rounds.
[50]	train-rmse:10.5309	eval-rmse:12.8401	train-score:0.27843	eval-score:0.127182
[100]	train-rmse:6.82668	eval-rmse:10.2224	train-score:0.696775	eval-score:0.446791
[150]	train-rmse:6.30583	eval-rmse:10.1947	train-score:0.74128	eval-score:0.449785
Stopping. Best iteration:
[148]	train-rmse:6.31325	eval-rmse:10.193	train-score:0.74067	eval-score:0.449965

ridge ...
i: 0 - extraTree:0.44301; xgb:0.44653; ridge:0.40864; rs_skf:0.44556
extraTree ...
xgboost ...
[0]	train-rmse:96.0274	eval-rmse:95.7299	train-score:-56.5026	eval-score:-55.5468
Multiple eval metrics have been passed: 'eval-score' will be used for early stopping.

Will train until eval-score hasn't improved in 50 rounds.
[50]	train-rmse:10.8714	eval-rmse:11.3303	train-score:0.2

In [28]:
score_skf_valid

0.56628415732660486

In [29]:
preds_skf_test[:10]

[80.195698018641636,
 94.05956884989078,
 79.529289163218309,
 72.432689800340185,
 102.30247426655764,
 90.84288777707502,
 108.06186825185007,
 85.435602739896751,
 108.08160322843642,
 85.589350104280001]

## 6. Submit

In [30]:
wt_rs = .3
wt_skf = .7
ls_submit_preds_test = [np.array(preds_rs_test) * wt_rs, np.array(preds_skf_test) * wt_skf]
preds_submit_test = [sum(e) for e in zip(*ls_submit_preds_test)]
dt_submit = pd.DataFrame({"ID": ID_test, "y": preds_submit_test})

In [31]:
dt_submit[["ID", "y"]].to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/submission/2_initModel_tm_cvrs_cvskf_3_7_modextra_modxgb_modridge_2_5_3.csv"
                              , index = False)