In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from scipy.stats import uniform as sp_rand
from scipy.stats import randint as sp_randint
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import pickle

## 6. Model

In [2]:
# Directory that holds the model input CSV and the pickled helper objects.
DATA_DIR = "../../data/Mercedes_Benz_Greener_Manufacturing/data"


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (a bare ``pickle.load(open(...))`` leaves it open).

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading;
    only point this at files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# data
dt_model = pd.read_csv(DATA_DIR + "/dt_all_eng.csv")
# IDs
IDs_train = _load_pickle(DATA_DIR + "/IDs_train.pkl")
IDs_test = _load_pickle(DATA_DIR + "/IDs_test.pkl")
# cols (presumably the categorical column names — TODO confirm)
cols_cat = _load_pickle(DATA_DIR + "/cols_cat.pkl")

In [3]:
# R^2 evaluation metric for xgboost
def r_2(preds, dtrain):
    """Score predictions with R^2 against the labels stored in ``dtrain``.

    Args:
        preds: predicted values to evaluate.
        dtrain: object exposing ``get_label()``, which supplies the true targets.

    Returns:
        A ``(name, value)`` pair: the literal name ``'score'`` and the value of
        ``r2_score(labels, preds)``.
    """
    y_true = dtrain.get_label()
    score = r2_score(y_true, preds)
    return 'score', score

In [4]:
# remove ohe
# Collect every column whose name matches "Encode_ohe", then drop them all at once.
ohe_cols = dt_model.filter(regex = "Encode_ohe").columns
dt_model_features = dt_model.drop(ohe_cols, axis = 1)
dt_model_features.shape

(8418, 1865)

In [5]:
# X, y, ID
# Boolean row masks: which rows carry an ID from the train / test ID lists.
is_train = dt_model_features["ID"].isin(IDs_train)
is_test = dt_model_features["ID"].isin(IDs_test)

rows_train = dt_model_features.loc[is_train]
rows_test = dt_model_features.loc[is_test]

# Features: every column except the target "y" (the "ID" column is kept).
X_train_all = rows_train.drop(["y"], axis = 1)
X_test = rows_test.drop(["y"], axis = 1)

# Targets as numpy arrays, in the same row order as the feature frames.
y_train_all = rows_train.y.values
y_test = rows_test.y.values

ValueError: labels ['X0' 'X1' 'X2' 'X3' 'X4' 'X5' 'X6' 'X8'] not contained in axis

In [None]:
# IDs of the rows whose "ID" appears in IDs_test, as a numpy array.
test_mask = dt_model_features["ID"].isin(IDs_test)
ID_test = dt_model_features.loc[test_mask, "ID"].values

In [None]:
# Sanity check: (rows, columns) of the training feature frame.
X_train_all.shape

### 6.1 xgboost

In [None]:
# Booster parameters passed to both xgb.cv and xgb.train.
#
# - "colsample_bytree": column subsampling must be spelled with its full name.
#   A bare "colsample" key is not a parameter xgboost recognises, so the 0.8
#   column sampling would not be applied.
#   NOTE(review): per-tree sampling is assumed to be the intent; use
#   "colsample_bylevel" / "colsample_bynode" if another granularity was meant.
# - "objective": NOTE(review): newer xgboost releases name this objective
#   "reg:squarederror" — switch once the installed version is confirmed.
# - "base_score": initial prediction is set to the mean of the training target.
params_xgb = {
    "objective": "reg:linear"
    , "booster": "gbtree"
    , "learning_rate": 0.005
    , "subsample": .9
    , "colsample_bytree": .8
    , "max_depth": 2
    , "alpha": 1
    , "lambda": 2
    , "gamma": 20
    , "base_score": np.mean(y_train_all)
}

In [None]:
# xgbDmatrix
# The training matrix carries the labels; the test matrix holds features only.
dtrain = xgb.DMatrix(data = X_train_all, label = y_train_all)
dtest = xgb.DMatrix(data = X_test)

In [None]:
# cv
# 10-fold cross-validation scored with the custom r_2 metric (higher is better),
# stopping once 50 consecutive rounds bring no improvement.
cv_settings = dict(
    num_boost_round = 5000,
    nfold = 10,
    feval = r_2,
    maximize = True,
    early_stopping_rounds = 50,
    show_stdv = True,
    verbose_eval = 50,
    seed = 888,
)
cv_xgb = xgb.cv(params_xgb, dtrain, **cv_settings)

In [None]:
# train
# Fit on the full training matrix for as many rounds as the CV result has rows.
n_rounds = len(cv_xgb)
model_xgb = xgb.train(params_xgb, dtrain, num_boost_round = n_rounds)

In [None]:
# importance
# Bar chart of the 50 most important features, drawn on a tall 12x18 canvas.
importance_plot_opts = dict(max_num_features = 50, height = 0.8)
fig, ax = plt.subplots(figsize = (12, 18))
xgb.plot_importance(model_xgb, ax = ax, **importance_plot_opts)
plt.show()

In [None]:
# predict
# NOTE: this rebinds y_test — earlier it held the "y" column of the test rows,
# from here on it holds the model's predictions for dtest.
y_test = model_xgb.predict(dtest)
# Preview the first ten predictions.
y_test[:10]

In [None]:
# submit
# Two-column frame pairing each test ID with the value currently held in y_test.
submit_columns = {"ID": ID_test, "y": y_test}
dt_submit = pd.DataFrame(submit_columns)
dt_submit.head()

In [None]:
# Sanity check: (rows, columns) of the submission frame.
dt_submit.shape

In [None]:
# dt_submit.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/submission/19_cleaned_python.csv", index = False)