In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split

In [2]:
train = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv")
test = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv")

## Encode

In [3]:
cols_cat = train.select_dtypes(include = ['object']).columns.values
cols_cat = cols_cat.tolist()

In [4]:
for c in cols_cat:
    x = list(set(list(train[c].values) + list(test[c].values))) 
    x.sort()
    x.sort(key = len)
    dt_labelEncode_c = pd.DataFrame({"Encode_Label_" + c: [i for i in range(1, (len(x) + 1))]
                                     , c: x})

    train = pd.merge(train, dt_labelEncode_c, on = c)
    test = pd.merge(test, dt_labelEncode_c, on = c)

## TargetMean

In [5]:
def getTargetMean(train, test, cols_cat):
    train_cp = train.copy()
    test_cp = test.copy()
    for c in cols_cat:
        x = train_cp.groupby([c])["y"].mean()
        dt_targetMean_c = pd.DataFrame({c: x.index
                                       , "TargetMean_" + c: x.values})
        train_cp = pd.merge(dt_targetMean_c, train_cp, on = c)

        test_cp = pd.merge(dt_targetMean_c, test_cp, on = c, how = "right")
        test_cp = test_cp.fillna(np.mean(train_cp.y))
    
    return train_cp, test_cp

## Dimension Reduce

In [6]:
n_comp = 15

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"] + cols_cat, axis=1))
tsvd_results_test = tsvd.transform(test.drop(cols_cat, axis = 1))

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"] + cols_cat, axis=1))
pca2_results_test = pca.transform(test.drop(cols_cat, axis = 1))

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"] + cols_cat, axis=1))
ica2_results_test = ica.transform(test.drop(cols_cat, axis = 1))

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"] + cols_cat, axis=1))
grp_results_test = grp.transform(test.drop(cols_cat, axis = 1))

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"] + cols_cat, axis=1))
srp_results_test = srp.transform(test.drop(cols_cat, axis = 1))


In [7]:
# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    
    train['PCA_' + str(i)] = pca2_results_train[:, i - 1]
    test['PCA_' + str(i)] = pca2_results_test[:, i - 1]

    train['ICA_' + str(i)] = ica2_results_train[:, i - 1]
    test['ICA_' + str(i)] = ica2_results_test[:, i - 1]

    train['TSVD_' + str(i)] = tsvd_results_train[:, i - 1]
    test['TSVD_' + str(i)] = tsvd_results_test[:, i - 1]

    train['GRP_' + str(i)] = grp_results_train[:, i - 1]
    test['GRP_' + str(i)] = grp_results_test[:, i - 1]

    train['SRP_' + str(i)] = srp_results_train[:, i - 1]
    test['SRP_' + str(i)] = srp_results_test[:, i - 1]

## xgboost

In [8]:
# r_2 for xgboost
def r_2(preds, dtrain):
    labels = dtrain.get_label()
    return 'score', r2_score(labels, preds)

In [9]:
# y_train_all
y_train_all = train.y.values

In [10]:
k = 8
# bin
bin_y = pd.qcut(y_train_all, k, labels = [i for i in range(1, k + 1)]).astype("int64")
# stratified kfold
skf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 888)

score_skf_valid = 0
score_skf = []
preds_skf_test = []
dt_preds_test = []
for i, (ind_train, ind_valid) in enumerate(skf.split(train, bin_y)):
    # X, y
    X_train, X_valid = train.iloc[ind_train], train.iloc[ind_valid]
    
    # featEng: TargetMean
    # train, valid
    X_train_transform, X_valid_transform = getTargetMean(X_train, X_valid, cols_cat)
    y_train_transform, y_valid_transform = X_train_transform.y.values, X_valid_transform.y.values
    X_train_transform, X_valid_transform = X_train_transform.drop(["y"] + cols_cat, axis = 1), X_valid_transform.drop(["y"] + cols_cat, axis = 1)
    # test
    _, X_test_transform = getTargetMean(X_train, test, cols_cat)
    X_test_transform = X_test_transform.drop(cols_cat, axis = 1)
    
    # xgb.DMatrix
    dmx_train = xgb.DMatrix(X_train_transform, label = y_train_transform)
    dmx_valid = xgb.DMatrix(X_valid_transform, label = y_valid_transform)
    dmx_test = xgb.DMatrix(X_test_transform)
    ls_watch =  [(dmx_train, 'train'), (dmx_valid, 'eval')]
    
    # params
    params_xgb = {
        "objective": "reg:linear"
        , "booster": "gbtree"
        , "learning_rate": 0.005
        , "subsample": .9
        , "colsample": .8
        , "max_depth": 2
        , "alpha": 1
        , "lambda": 2
        , "gamma": 20
        , "base_score": np.mean(y_train_transform)
    }
    
    # model
    model_xgb = xgb.train(params_xgb, dmx_train, evals = ls_watch
                          , num_boost_round = 5000
                          , feval = r_2, maximize = True, early_stopping_rounds = 50
                          , verbose_eval = False
                         )
    
    # predict
    preds_valid = model_xgb.predict(dmx_valid)
    preds_test = model_xgb.predict(dmx_test)
    preds_skf_test.append(preds_test)
    dt_preds_test.append(pd.DataFrame({"ID": X_test_transform.ID
                                      , "preds_y_" + str(i): preds_test}))
    # score
    score_skf_valid = r2_score(y_valid_transform, preds_valid)
    print('Fold %d: Score %f'%(i, score_skf_valid))
    score_skf.append(score_skf_valid)

# predict test
preds_test = np.sum(np.transpose(np.multiply(np.transpose(np.array(preds_skf_test)), np.array(score_skf))), axis = 0) / np.sum(score_skf)
# final score
score_mean = np.mean(score_skf)
score_sd = np.std(score_skf)
print('=====================')

print('Final Score %f'%score_mean, '; sd %f'%score_sd)

print('=====================')

Fold 0: Score 0.551663
Fold 1: Score 0.609463
Fold 2: Score 0.596596
Fold 3: Score 0.417429
Fold 4: Score 0.573308
Fold 5: Score 0.596935
Fold 6: Score 0.627133
Fold 7: Score 0.627989
Final Score 0.575064 ; sd 0.064262


In [15]:
preds_test

array([  94.81409386,  103.41935342,  104.03800851, ...,  104.72428974,
        103.04756022,  103.4487657 ])

In [19]:
dt_preds_test[3].head()

Unnamed: 0,ID,preds_y_3
0,1228,94.616135
1,4342,103.305962
2,5299,103.687073
3,6774,94.23526
4,7631,102.378174


In [20]:
# submit
dt_submit = pd.DataFrame({"ID": dt_preds_test[0].ID
            , "y": preds_test})

In [21]:
dt_submit.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/submission/32_returnToPython_skf8_weightedPrediction_base_features_withTargetMeanInsideSkf.csv", index = False)