In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from scipy.stats import uniform as sp_rand
from scipy.stats import randint as sp_randint
import random
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import pickle

## 7. Experiment

In [2]:
# data
# All three inputs live in the same directory (presumably written by earlier
# steps of this project -- confirm).
dir_data = "../../data/Mercedes_Benz_Greener_Manufacturing/data/"
dt_model = pd.read_csv(dir_data + "dt_all_preprocess.csv")
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that were produced by trusted code.
# The `with` blocks close the file handles deterministically instead of
# leaving them open until garbage collection.
# params_xgb loaded here is replaced further down by a hand-written dict.
with open(dir_data + "params_xgb_05.pkl", "rb") as f_params:
    params_xgb = pickle.load(f_params)
with open(dir_data + "cols_raw.pkl", "rb") as f_cols:
    cols_raw = pickle.load(f_cols)

In [3]:
# Sanity check: (rows, columns) of the frame read from dt_all_preprocess.csv.
dt_model.shape

(8418, 1575)

In [7]:
# Hand-set booster parameters; this assignment replaces the dict that was
# loaded from params_xgb_05.pkl earlier in the notebook.
# NOTE(review): the xgb.cv / xgb.train calls in this notebook pass
# num_boost_round explicitly; `n_estimators` looks like the sklearn-wrapper
# name for the same thing and is presumably ignored by them -- confirm.
params_xgb = dict(
    booster = 'dart',
    colsample_bylevel = 0.8,
    colsample_bytree = 0.8,
    gamma = 0.1,
    learning_rate = 0.002,
    max_depth = 2,
    min_child_weight = 1,
    n_estimators = 3000,
    reg_alpha = 0.01,
    reg_lambda = 0.01,
    subsample = 0.88,
)

In [8]:
# r_2 for xgboost
def r_2(preds, dtrain):
    """Custom evaluation metric passed as `feval` to xgb.cv / xgb.train.

    Parameters
    ----------
    preds : predictions for the rows held in `dtrain`.
    dtrain : object exposing `get_label()` (an xgb.DMatrix in this notebook).

    Returns
    -------
    tuple
        ('score', R^2 of `preds` against the labels). The name 'score' is what
        produces the "test-score-mean" / "test-score-std" columns read from
        the xgb.cv result elsewhere in this notebook.
    """
    y_true = dtrain.get_label()
    r2_value = r2_score(y_true, preds)
    return 'score', r2_value

# make scorer_r2
# sklearn scorer object wrapping r2_score. It is not referenced by the other
# cells shown in this notebook section (those use r_2 via `feval`).
scorer_r2 = make_scorer(r2_score)

### Feature set selection

In [None]:
# feature set select
# Names of the optional ("non-core") feature groups. Each entry is later used
# as a regex fragment matched against column names, so it selects every
# column whose name contains it.
list_cols = (
    # row markers / flags
    ["single_train", "single_test", "dup_train", "dup_test", "Is"]
    # categorical encodings
    + ["Encode_ohe", "Encode_TargetMean", "Encode_Freq", "Encode_Binary", "Encode_Ordinal"]
    # engineered features
    + ["FeatEng_OutlierMarker", "FeatEng_SumBin"]
    # dimensionality-reduction outputs, one row per method
    # (note: the PCA row uses "Encoded_Cat" where the others use "Encoded")
    + ["DR_PCA_Raw_Bin", "DR_PCA_Encoded_Cat", "DR_PCA_FeatEng", "DR_PCA_All"]
    + ["DR_ICA_Raw_Bin", "DR_ICA_Encoded", "DR_ICA_FeatEng", "DR_ICA_All"]
    + ["DR_SVD_Raw_Bin", "DR_SVD_Encoded", "DR_SVD_FeatEng", "DR_SVD_All"]
    + ["DR_FA_Raw_Bin", "DR_FA_Encoded", "DR_FA_FeatEng", "DR_FA_All"]
    + ["DR_TSNE_Raw_Bin", "DR_TSNE_Encoded", "DR_TSNE_FeatEng", "DR_TSNE_All"]
    + ["DR_GRP_Raw_Bin", "DR_GRP_Encoded", "DR_GRP_FeatEng", "DR_GRP_All"]
    + ["DR_SRP_Raw_Bin", "DR_SRP_Encoded", "DR_SRP_FeatEng", "DR_SRP_All"]
)

In [None]:
# Core columns = ID / target / train-test flag plus every column in cols_raw.
# The join is only used to obtain the combined column labels.
frame_keys = dt_model[["ID", "y", "IsTrainTest"]]
frame_raw = dt_model[cols_raw]
cols_core = frame_keys.join(frame_raw).columns.values
# Split the full frame into the always-kept part and the optional part.
dt_model_core = dt_model[cols_core]
dt_model_nonCore = dt_model.drop(columns = cols_core)

In [None]:
# Random search over subsets of the optional feature groups in list_cols.
# Every trial keeps the core columns, adds the columns matched by a random
# subset of group names and scores that feature set with 10-fold xgb.cv.
n_trials = 100
# Dedicated, seeded generator: the sampled subsets (and therefore the feature
# set picked afterwards) are reproducible, and the global `random` state is
# left untouched.
rng_featureSelect = random.Random(888)
# Each trial uses between half and all of the groups (both ends inclusive).
n_groups_min = int(round(len(list_cols) / 2))
n_groups_max = len(list_cols)

# The training rows, the target and base_score depend only on the core
# columns, so they are computed once instead of once per trial.
mask_train_core = dt_model_core["IsTrainTest"] == "train"
y_train_core = dt_model_core.loc[mask_train_core].y.values
# base_score
params_xgb["base_score"] = np.mean(y_train_core)

list_featureSelect = []
for i_trial in range(n_trials):
    # random sample
    rand_sample = rng_featureSelect.randint(n_groups_min, n_groups_max)
    list_cols_random = rng_featureSelect.sample(list_cols, rand_sample)
    print("list_cols_random", list_cols_random)
    # group names are OR-ed into one regex and matched against column names
    regex_cols_random = "|".join(list_cols_random)
    dt_model_nonCore_sample = dt_model_nonCore.filter(regex = regex_cols_random)
    # join with random sample
    dt_model_sample = dt_model_core.join(dt_model_nonCore_sample)
    print("shape", dt_model_sample.shape)
    # Only training rows are needed for cross-validation. Trial-local names
    # are used so the notebook-wide X_train_all / dmx_train_all / cv_xgb are
    # never rebound to a random-subset version by this cell.
    X_train_sample = dt_model_sample.loc[mask_train_core].drop(["y", "IsTrainTest"], axis = 1)
    # xgbDmatrix
    dmx_train_sample = xgb.DMatrix(X_train_sample, label = y_train_core)
    # cv
    cv_sample = xgb.cv(params_xgb, dmx_train_sample
                       , num_boost_round = 2000
                       , nfold = 10
                       , feval = r_2, maximize = True, early_stopping_rounds = 50
                       , show_stdv = True, verbose_eval = 50)
    # record: last row of the CV history, read by position rather than label
    score = cv_sample["test-score-mean"].iloc[-1]
    sd = cv_sample["test-score-std"].iloc[-1]
    dict_featureSelect = {"score": score
                         , "sd": sd
                         , "nonCoreFeatures": list_cols_random}
    list_featureSelect.append(dict_featureSelect)

In [None]:
# Highest mean CV score over all recorded trials.
scores_all = [trial["score"] for trial in list_featureSelect]
score_max = np.max(scores_all)
# Feature groups of the first trial that reached that score.
trials_best = [trial["nonCoreFeatures"] for trial in list_featureSelect if trial["score"] == score_max]
cols_nonCore_max = trials_best[0]

In [None]:
# OR the winning group names into one regex alternation (same form as the
# per-trial regex used in the search loop).
regex_cols_nonCore_max = "|".join(cols_nonCore_max)

### Produce model

In [None]:
# dt_model_features = dt_model_core.join(dt_model_nonCore.filter(regex = regex_cols_nonCore_max))
# dt_model_features.shape

In [None]:
# remove ohe and targetMean
# dt_model_features = dt_model.drop(dt_model.filter(regex = "Encode_ohe|Encode_Freq|Encode_Binary|FeatEng_SumBin|single_train|dup_train").columns, axis = 1)
# dt_model_features.shape

In [9]:
# remove ohe
# Every column whose name matches "Encode_ohe" is excluded from the modelling frame.
cols_ohe = dt_model.filter(regex = "Encode_ohe").columns
dt_model_features = dt_model.drop(columns = cols_ohe)
dt_model_features.shape

(8418, 1364)

In [10]:
# X, y, ID
# Row subsets are taken once per split and reused for X, y and ID.
rows_train = dt_model_features.loc[dt_model_features["IsTrainTest"] == "train"]
rows_test = dt_model_features.loc[dt_model_features["IsTrainTest"] == "test"]
# Only the target and the split flag are removed; ID stays in the feature matrices.
cols_nonFeature = ["y", "IsTrainTest"]
X_train_all = rows_train.drop(cols_nonFeature, axis = 1)
X_test = rows_test.drop(cols_nonFeature, axis = 1)
y_train_all = rows_train.y.values
y_test = rows_test.y.values
ID_train_all = rows_train.ID.values
ID_test = rows_test.ID.values

In [11]:
# Sanity check: (training rows, feature columns) after dropping y and IsTrainTest.
X_train_all.shape

(4209, 1362)

In [12]:
# xgbDmatrix
# The test matrix carries no label; it is only used for the final prediction.
dmx_test = xgb.DMatrix(data = X_test)
# Full labelled training matrix used by xgb.cv and the final xgb.train.
dmx_train_all = xgb.DMatrix(data = X_train_all, label = y_train_all)

In [13]:
# Start boosting from the mean of the training target.
params_xgb["base_score"] = y_train_all.mean()

In [None]:
# Stratified k-fold estimate of R^2 for the current params / feature set.
k = 10
# bin: the continuous target is cut into k quantile bins so the folds can be
# stratified on it
bin_y = pd.qcut(y_train_all, k, labels = list(range(1, k + 1))).astype("int64")
# stratified kfold
skf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 888)

scores_skf_valid = []
for i, (ind_train, ind_valid) in enumerate(skf.split(X_train_all, bin_y)):
    # X, y
    X_train, X_valid = X_train_all.iloc[ind_train], X_train_all.iloc[ind_valid]
    y_train, y_valid = y_train_all[ind_train], y_train_all[ind_valid]
    # xgb.DMatrix
    # Fold-local names on purpose: rebinding dmx_train_all / model_xgb here
    # would leave the later xgb.cv and final xgb.train cells working on the
    # last fold's training subset instead of the full training set.
    dmx_train_fold = xgb.DMatrix(X_train, label = y_train)
    dmx_valid_fold = xgb.DMatrix(X_valid, label = y_valid)
    ls_watch = [(dmx_train_fold, 'train'), (dmx_valid_fold, 'eval')]

    # early stopping watches the last entry of ls_watch ('eval') on r_2
    model_xgb_fold = xgb.train(params_xgb, dmx_train_fold, evals = ls_watch
                               , num_boost_round = 5000
                               , feval = r_2, maximize = True, early_stopping_rounds = 50
                               , verbose_eval = 50
                              )

    # NOTE(review): whether predict() uses the best or the last iteration
    # after early stopping depends on the xgboost version -- confirm.
    preds_valid = model_xgb_fold.predict(dmx_valid_fold)
    score_skf_valid = r2_score(y_valid, preds_valid)
    print('Fold %d: Score %f'%(i, score_skf_valid))

    scores_skf_valid.append(score_skf_valid)

# mean validation R^2 over the k folds
score_skf = sum(scores_skf_valid) / k
print('=====================')

print( 'Final Score %f'%score_skf)

print('=====================')

In [14]:
# cv
# 10-fold CV on the full training matrix; r_2 is reported as "score" and is
# maximised, stopping after 50 rounds without improvement.
kwargs_cv = dict(
    num_boost_round = 5000,
    nfold = 10,
    feval = r_2,
    maximize = True,
    early_stopping_rounds = 50,
    show_stdv = True,
    verbose_eval = 50,
)
cv_xgb = xgb.cv(params_xgb, dmx_train_all, **kwargs_cv)

[0]	train-rmse:12.6576+0.0961838	train-score:0.0022173+4.75774e-05	test-rmse:12.6311+0.823434	test-score:-0.000621+0.00260034
[50]	train-rmse:11.9954+0.103439	train-score:0.103902+0.00228085	test-rmse:11.974+0.866461	test-score:0.10146+0.0143064
[100]	train-rmse:11.4226+0.110398	train-score:0.187442+0.00402342	test-rmse:11.4252+0.905444	test-score:0.182423+0.0246178
[150]	train-rmse:10.926+0.115645	train-score:0.256565+0.00523459	test-rmse:10.9549+0.941865	test-score:0.248703+0.0330751
[200]	train-rmse:10.4968+0.118888	train-score:0.313832+0.00598899	test-rmse:10.5495+0.977396	test-score:0.303536+0.0403759
[250]	train-rmse:10.1268+0.12073	train-score:0.361346+0.0064454	test-rmse:10.2036+1.01078	test-score:0.348616+0.0465239
[300]	train-rmse:9.80864+0.121059	train-score:0.400847+0.00664142	test-rmse:9.90759+1.04138	test-score:0.385944+0.0517393
[350]	train-rmse:9.53609+0.120232	train-score:0.43368+0.00666003	test-rmse:9.65642+1.06976	test-score:0.416713+0.056189
[400]	train-rmse:9.30302

KeyboardInterrupt: 

In [None]:
# Last rows of the CV history returned by xgb.cv.
cv_xgb.tail()

In [None]:
# train
# Use as many boosting rounds as there are rows in the CV history.
n_rounds_cv = len(cv_xgb)
model_xgb = xgb.train(params_xgb, dmx_train_all, num_boost_round = n_rounds_cv)

In [None]:
# importance
# Tall figure so the 50 plotted features remain readable.
fig, ax = plt.subplots(figsize = (12, 18))
opts_importance = dict(max_num_features = 50, height = 0.8, ax = ax)
xgb.plot_importance(model_xgb, **opts_importance)
plt.show()

### Submit

In [None]:
# predict
# NOTE: y_test is rebound here -- it previously held the `y` column values of
# the test rows and from now on holds the model's predictions for them.
y_test = model_xgb.predict(dmx_test)
# peek at the first ten predictions
y_test[:10]

In [None]:
# submit
# Two-column submission frame: test IDs and their predicted y.
dt_submit = pd.DataFrame(dict(ID = ID_test, y = y_test))

In [None]:
# Write the submission without the DataFrame index column.
path_submit = "../../data/Mercedes_Benz_Greener_Manufacturing/submission/15_added_FI_2way_XOR.csv"
dt_submit.to_csv(path_submit, index = False)