In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
##Add decomposed components: PCA / ICA etc.
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## 1. Load

In [2]:
# Read the raw train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv",
        "../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv",
    )
)

In [3]:
# Show (rows, columns) of both frames on one line.
print(*(frame.shape for frame in (train, test)))

NameError: name 'dt_train_raw' is not defined

## 2. TargetMean

In [None]:
# Names of the object-dtype columns of `train`, as a NumPy array.
object_columns = train.select_dtypes(include = ['object'])
cols_cat = object_columns.columns.values

In [None]:
# Target-mean encode each categorical column: every level is mapped to the
# mean of `y` observed for that level in `train`.
#
# The encoding is attached with Series.map instead of an inner merge so that
#   * the row order of both frames is preserved,
#   * test rows whose level never occurs in train are kept (they fall back to
#     the overall training mean instead of being silently dropped), and
#   * re-running the cell overwrites the columns rather than duplicating them.
# The new "TargetMean_<col>" columns are appended at the end of each frame.
#
# NOTE(review): each train row is encoded with a mean that includes its own
# target value, which leaks `y` into the features; consider out-of-fold means.
overall_mean = train["y"].mean()
for c in cols_cat:
    level_means = train.groupby(c)["y"].mean()
    encoded_name = "TargetMean_" + c
    train[encoded_name] = train[c].map(level_means).fillna(overall_mean)
    test[encoded_name] = test[c].map(level_means).fillna(overall_mean)

In [None]:
# Preview: first five rows, first twenty columns of train.
train.iloc[:5].iloc[:, :20]

In [None]:
# Preview: first five rows, first ten columns of test.
test.iloc[:5].iloc[:, :10]

In [None]:
# Names of every train column whose name contains "TargetMean_".
target_mean_frame = train.filter(regex = "TargetMean_")
col_targetMean = target_mean_frame.columns.values

## 3. Encode

### 3.1 Ordered Label

In [None]:
# Categorical columns of the training frame together with the target.
dt_cat_train_y = train[cols_cat].join(train.y)
# Categorical columns of the test frame. This was previously built from
# `train` (copy-paste slip), which made it a duplicate of dt_cat_train_y.
dt_cat_test = test[cols_cat].copy()

In [None]:
# Ordered label encoding: for each categorical column, number the levels found
# in train or test from 1 upward, shortest strings first and alphabetically
# within the same length, then attach the code as "Encode_Label_<col>".
for c in cols_cat:
    observed_levels = set(list(train[c].values) + list(test[c].values))
    ordered_levels = sorted(observed_levels, key = lambda level: (len(level), level))
    dt_labelEncode_c = pd.DataFrame({
        "Encode_Label_" + c: list(range(1, len(ordered_levels) + 1)),
        c: ordered_levels,
    })

    train = pd.merge(train, dt_labelEncode_c, on = c)
    test = pd.merge(test, dt_labelEncode_c, on = c)

In [None]:
# Preview: first five rows, first twenty columns of train.
train.iloc[:5].iloc[:, :20]

In [None]:
# Names of the ordered-label columns. They are created with the prefix
# "Encode_Label_" (with an underscore after "Encode"); the previous pattern
# "EncodeLabel_" matched nothing and left this array empty.
# NOTE(review): any cell that drops these columns from train before fitting a
# transformer must drop them from test as well before calling transform().
cols_encodeLabel = train.filter(regex = "Encode_Label_").columns.values

### 3.2 OHE

In [None]:
# dt_all = pd.concat([train.drop(["y"], axis = 1), test])
# dt_cat_onehot = pd.get_dummies(dt_all[cols_cat])
# dict_ohe = {x: "Encode_OHE_" + x for x in dt_cat_onehot.columns.values}
# dt_cat_onehot = dt_cat_onehot.rename(columns = dict_ohe)

In [None]:
# Display the first rows of train to inspect the encoded columns.
train.head()

In [None]:
# train = train.join(dt_cat_onehot.iloc[:train.shape[0]])
# test = test.join(dt_cat_onehot.iloc[train.shape[0]:])

In [None]:
# Remove the raw categorical columns from both frames.
train, test = (frame.drop(columns = cols_cat) for frame in (train, test))

## 3. Decomposition

In [None]:
n_comp = 15

# Feature matrices shared by every decomposer. The test matrix is re-indexed
# to the train feature columns so that fit and transform always see the same
# columns in the same order.
X_train = train.drop(columns=["y"])
X_test = test[X_train.columns]

# GRP and FA are fit without the ordered-label columns. The list is derived
# from the frame itself so it matches the column names actually present, and
# the same columns are removed from BOTH train and test (previously GRP was
# fit on the reduced train matrix but transformed the full test matrix).
label_cols = X_train.filter(regex="Encode_?Label_").columns.tolist()
X_train_nolabel = X_train.drop(columns=label_cols)
X_test_nolabel = X_test.drop(columns=label_cols)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(X_train)
tsvd_results_test = tsvd.transform(X_test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(X_train)
pca2_results_test = pca.transform(X_test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(X_train)
ica2_results_test = ica.transform(X_test)

# GRP (label-encoded columns excluded)
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(X_train_nolabel)
grp_results_test = grp.transform(X_test_nolabel)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(X_train)
srp_results_test = srp.transform(X_test)

# FA (label-encoded columns excluded)
fa = FactorAnalysis(n_components=n_comp, random_state=420)
fa_results_train = fa.fit_transform(X_train_nolabel)
fa_results_test = fa.transform(X_test_nolabel)

In [None]:
# Save the feature-column list before adding the decomposition components.
# Built from the column Index rather than a set so its order is the frame's
# column order and is identical from run to run.
usable_columns = [col for col in train.columns if col != 'y']

# (prefix, train components, test components) for every decomposer, in the
# order the columns are laid out for each component number.
component_sets = [
    ('pca_', pca2_results_train, pca2_results_test),
    ('ica_', ica2_results_train, ica2_results_test),
    ('tsvd_', tsvd_results_train, tsvd_results_test),
    ('grp_', grp_results_train, grp_results_test),
    ('srp_', srp_results_train, srp_results_test),
    ('fa_', fa_results_train, fa_results_test),
]

# Collect all component columns first and attach them with a single concat per
# frame; inserting them one at a time fragments the DataFrame.
train_components, test_components = {}, {}
for i in range(1, n_comp + 1):
    for prefix, train_values, test_values in component_sets:
        train_components[prefix + str(i)] = train_values[:, i - 1]
        test_components[prefix + str(i)] = test_values[:, i - 1]

# Drop any component columns left over from a previous run of this cell so
# that re-running replaces them instead of creating duplicate column names.
train = pd.concat(
    [train.drop(columns=list(train_components), errors='ignore'),
     pd.DataFrame(train_components, index=train.index)],
    axis=1,
)
test = pd.concat(
    [test.drop(columns=list(test_components), errors='ignore'),
     pd.DataFrame(test_components, index=test.index)],
    axis=1,
)


In [None]:
# (rows, columns) of train after the decomposition components were appended.
train.shape

## 4. Feature Engineering

### 4.1 Outlier Marker

In [None]:
# Load the engineered-feature table from CSV; the cells below join subsets of
# its columns onto train and test using the "ID" column.
dt_featEng = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/data/dt_all_eng.csv")

In [None]:
# Join the columns of dt_featEng whose names match "Outlier" onto train and
# test by "ID".
# The row count is captured before `train` is reassigned so both slices use
# the same split point. The test frame now takes the rows AFTER the train
# rows; it previously reused the train slice, so an inner join on "ID" could
# only match IDs that also occur in train.
# NOTE(review): assumes dt_featEng stacks the train rows first, followed by
# the test rows — TODO confirm against how dt_all_eng.csv was written.
n_train = train.shape[0]
outlier_features = dt_featEng.filter(regex = "Outlier|ID")
train = pd.merge(train, outlier_features.iloc[:n_train], on = "ID")
test = pd.merge(test, outlier_features.iloc[n_train:], on = "ID")

In [None]:
# Display the first rows of train after joining the "Outlier" columns.
train.head()

In [None]:
# Display the first rows of the engineered-feature table. The frame is bound
# to the name dt_featEng when it is loaded; `dt_all_eng` is only the CSV file
# name and is not defined as a variable.
dt_featEng.head()

### 4.2 Feature Interaction

In [None]:
# Join the columns of dt_featEng whose names match "FeatEng_FI" onto train and
# test by "ID".
# The row count is captured before `train` is reassigned so both slices use
# the same split point. The test frame now takes the rows AFTER the train
# rows; it previously reused the train slice, so an inner join on "ID" could
# only match IDs that also occur in train.
# NOTE(review): assumes dt_featEng stacks the train rows first, followed by
# the test rows — TODO confirm against how dt_all_eng.csv was written.
n_train = train.shape[0]
interaction_features = dt_featEng.filter(regex = "FeatEng_FI|ID")
train = pd.merge(train, interaction_features.iloc[:n_train], on = "ID")
test = pd.merge(test, interaction_features.iloc[n_train:], on = "ID")

In [None]:
# Display the first rows of train after joining the "FeatEng_FI" columns.
train.head()

### 4.3 Sum Binary

In [None]:
# Join the columns of dt_featEng whose names match "FeatEng_SumBin" onto train
# and test by "ID".
# The row count is captured before `train` is reassigned so both slices use
# the same split point. The test frame now takes the rows AFTER the train
# rows; it previously reused the train slice, so an inner join on "ID" could
# only match IDs that also occur in train.
# NOTE(review): assumes dt_featEng stacks the train rows first, followed by
# the test rows — TODO confirm against how dt_all_eng.csv was written.
n_train = train.shape[0]
sum_binary_features = dt_featEng.filter(regex = "FeatEng_SumBin|ID")
train = pd.merge(train, sum_binary_features.iloc[:n_train], on = "ID")
test = pd.merge(test, sum_binary_features.iloc[n_train:], on = "ID")

## 5. xgboost

In [None]:
def r_2(preds, dtrain):
    """Custom evaluation function passed to xgboost as `feval`.

    Parameters
    ----------
    preds : predictions produced for the rows of `dtrain`.
    dtrain : object exposing `get_label()`, which supplies the true targets.

    Returns
    -------
    tuple
        ('score', R^2 of `preds` against the labels of `dtrain`).
    """
    truth = dtrain.get_label()
    score = r2_score(truth, preds)
    return 'score', score


# scikit-learn scorer wrapping r2_score
scorer_r2 = make_scorer(r2_score)

In [None]:
# Target vector as a NumPy array, and its mean.
target_column = train['y']
y_train = target_column.values
y_mean = y_train.mean()

In [None]:
# Booster parameters handed to xgb.cv / xgb.train.
# NOTE(review): 'n_trees' does not look like a native xgboost parameter; the
# number of boosting rounds is set through num_boost_round where this dict is
# used — confirm whether this key has any effect.
params_xgb = dict(
    n_trees = 5000,
    eta = 0.005,
    max_depth = 2,
    min_child_weight = 0,
    subsample = 0.98,
    objective = 'reg:linear',
    booster = 'gbtree',
    eval_metric = 'rmse',
    base_score = y_mean,  # base prediction = mean(target)
    silent = 1,
)

In [None]:
# Wrap the frames in xgboost DMatrix objects: train features (everything but
# 'y') with the target as label, and the test frame without a label.
train_features = train.drop(columns = 'y')
dtrain = xgb.DMatrix(train_features, label = y_train)
dtest = xgb.DMatrix(test)

In [None]:
# 10-fold cross-validation with the custom R^2 metric; stops once the metric
# has not improved for 50 rounds (maximize=True: higher is better).
cv_xgb = xgb.cv(
    params_xgb,
    dtrain,
    num_boost_round = 5000,
    nfold = 10,
    feval = r_2,
    maximize = True,
    early_stopping_rounds = 50,
    show_stdv = True,
    verbose_eval = 50,
    seed = 888,
)

In [None]:
# Fit the final booster on all training rows, using as many rounds as there
# are rows in the cross-validation result table.
n_rounds = len(cv_xgb)
model_xgb = xgb.train(params_xgb, dtrain, num_boost_round = n_rounds)

In [None]:
# Feature-importance chart for the fitted booster (top 50 features).
importance_fig, importance_ax = plt.subplots(figsize = (12, 18))
xgb.plot_importance(
    model_xgb,
    ax = importance_ax,
    height = 0.8,
    max_num_features = 50,
)
plt.show()