In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
import matplotlib.pyplot as plt

In [2]:
# Read the raw train/test CSVs; both paths are relative to the notebook's working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv",
        "../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv",
    )
)

In [3]:
# Sanity check: (rows, columns) of the freshly loaded train and test frames.
print(train.shape, test.shape)

(4209, 378) (4209, 377)


## Encode

In [4]:
# Column groups by dtype: object columns are the categorical features; int64 columns
# other than "ID" form cols_bin (presumably the 0/1 indicator features - verify against the data).
cols_cat = train.select_dtypes(include = ['object']).columns.tolist()
cols_bin = [name for name in train.select_dtypes(include = ['int64']).columns if name != "ID"]

In [5]:
# Integer-encode every categorical column using one vocabulary shared by train and test.
for c in cols_cat:
    # Levels are ordered by length first and alphabetically within a length
    # (e.g. "a", "b", ..., "z", "aa", "ab", ...) and numbered from 1.
    levels = sorted(set(train[c]) | set(test[c]), key = lambda level: (len(level), level))
    label_of = {level: code for code, level in enumerate(levels, start = 1)}

    # Series.map is a row-wise lookup: unlike joining a lookup frame with pd.merge it
    # cannot drop, duplicate or reorder rows, and it keeps the frames' existing index.
    train["Encode_Label_" + c] = train[c].map(label_of)
    test["Encode_Label_" + c] = test[c].map(label_of)

## TargetMean

In [6]:
def getTargetMean(dt_train, dt_test, cols, k = 3, random_state = 888):
    """Add a mean-of-target encoding column for every column named in `cols`.

    For each `col` a new column named "Encode_TargetMean_" + col is appended to
    copies of both frames; the inputs themselves are not modified and the row
    order of both frames is preserved.

    Parameters
    ----------
    dt_train : pandas.DataFrame
        Training data; must contain the target column "y" and every column in `cols`.
    dt_test : pandas.DataFrame
        Data to encode with statistics learned from `dt_train`; must contain
        every column in `cols`.
    cols : list of str
        Columns to encode.
    k : int
        1  -> each level is encoded with its mean of "y" over the whole of `dt_train`.
        >1 -> out-of-fold encoding: `dt_train` is split into `k` folds stratified on
              the column being encoded, and each training row is encoded with level
              means computed on the other folds. Test rows get the average of the
              `k` per-fold level means.
    random_state : int
        Seed used to shuffle rows into folds (only used when k > 1).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The encoded copies of `dt_train` and `dt_test`.

    Levels with no training rows to learn from are encoded with the mean of "y"
    (of the fold's fitting rows for training data, of all of `dt_train` for test
    data). Only the new encoding columns are filled; missing values elsewhere in
    the frames are left untouched.
    """
    train_cp = dt_train.copy()
    test_cp = dt_test.copy()
    # Fallback encoding for levels that never occur in the rows used for fitting.
    y_mean = dt_train["y"].mean()

    if k == 1:
        for c in cols:
            level_means = dt_train.groupby(c)["y"].mean()
            # .values: assign positionally so an unusual index cannot misalign rows.
            train_cp["Encode_TargetMean_" + c] = dt_train[c].map(level_means).fillna(y_mean).values
            test_cp["Encode_TargetMean_" + c] = dt_test[c].map(level_means).fillna(y_mean).values
        return train_cp, test_cp

    # shuffle = True is required for random_state to have any effect.
    skf = StratifiedKFold(n_splits = k, shuffle = True, random_state = random_state)
    for col in cols:
        # Out-of-fold encoding of the training rows, filled fold by fold.
        oof = np.full(dt_train.shape[0], np.nan)
        # One Series of per-fold encodings of the test rows per fold.
        test_fold_means = []

        for ind_in, ind_out in skf.split(dt_train, dt_train[col].values):
            X_in = dt_train.iloc[ind_in]
            level_means = X_in.groupby(col)["y"].mean()

            # Held-out rows only ever see means computed without their own target.
            oof[ind_out] = dt_train[col].iloc[ind_out].map(level_means).fillna(X_in["y"].mean()).values
            test_fold_means.append(dt_test[col].map(level_means))

        train_cp["Encode_TargetMean_" + col] = oof
        # Average over the folds that know the level (mean skips NaN), then fall back
        # to the overall target mean for levels no fold has seen.
        test_cp["Encode_TargetMean_" + col] = (
            pd.concat(test_fold_means, axis = 1).mean(axis = 1).fillna(y_mean).values
        )

    return train_cp, test_cp


In [7]:
# k = 1: encode with target means taken over the whole training set. This runs before the
# cross-validation loop further down, so every validation fold's own targets are already
# part of its "Encode_TargetMean_*" features when that fold is scored.
train, test = getTargetMean(train, test, cols_cat, 1)

In [8]:
# Names of the target-mean columns, collected so the scaling cell can exclude them.
cols_targetMean = [name for name in train.columns if "Encode_TargetMean_" in name]

In [9]:
# Shapes after the label and target-mean encodings were added.
print(train.shape, test.shape)

(4209, 394) (4209, 393)


## OutlierMarker

In [10]:
def getOutlierMarker(dt_train, dt_test, perc = 100, cat_cols = None, bin_cols = None):
    """Add features counting how strongly each row resembles the high-target "outlier" rows.

    The training rows whose "y" falls in the top of `perc` equal-frequency bins are
    taken as outliers. For a group of columns, a row's marker is the number of
    (outlier row, column) pairs in which the row has the same value as the outlier
    in that column. One marker column, named "FeatEng_OutlierMaker_" + group, is
    appended for each group:

        "Cat" - the categorical columns
        "Bin" - the binary columns
        "All" - categorical and binary columns together
        "X0"  - the single column "X0"
        "X5"  - the single column "X5"

    Parameters
    ----------
    dt_train : pandas.DataFrame
        Training data; must contain "y" and every column of every group.
    dt_test : pandas.DataFrame
        Data to score against the outliers found in `dt_train`.
    perc : int
        Number of quantile bins for "y"; the top bin defines the outliers.
    cat_cols, bin_cols : list of str, optional
        Columns of the "Cat" and "Bin" groups. When omitted, the module-level
        `cols_cat` and `cols_bin` lists are used.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of `dt_train` and `dt_test` with the marker columns appended.
    """
    if cat_cols is None:
        cat_cols = cols_cat
    if bin_cols is None:
        bin_cols = cols_bin

    dt_train_cp = dt_train.copy()
    dt_test_cp = dt_test.copy()

    # outlier: rows whose target lands in the highest of `perc` quantile bins
    bin_y = pd.qcut(dt_train_cp.y.values, perc, labels = [i for i in range(1, perc + 1)]).astype("int64")
    dt_outliers = dt_train_cp.iloc[bin_y == perc]

    col_groups = [
        ("Cat", list(cat_cols)),
        ("Bin", list(bin_cols)),
        ("All", list(cat_cols) + list(bin_cols)),
        ("X0", ["X0"]),
        ("X5", ["X5"]),
    ]
    for cols_type, cols in col_groups:
        int_outlierMarker_train = np.zeros(dt_train_cp.shape[0])
        int_outlierMarker_test = np.zeros(dt_test_cp.shape[0])
        for col in cols:
            # How many outlier rows carry each value of this column. Looking a row's
            # value up in this table gives the same count as comparing the row with
            # every outlier one at a time; values no outlier has count as 0.
            n_outliers_with = dt_outliers[col].value_counts()
            int_outlierMarker_train += dt_train_cp[col].map(n_outliers_with).fillna(0).astype("float64").values
            int_outlierMarker_test += dt_test_cp[col].map(n_outliers_with).fillna(0).astype("float64").values

        # add as column
        dt_train_cp.loc[:, "FeatEng_OutlierMaker_" + cols_type] = int_outlierMarker_train
        dt_test_cp.loc[:, "FeatEng_OutlierMaker_" + cols_type] = int_outlierMarker_test

    return dt_train_cp, dt_test_cp

## Dimension Reduce

In [11]:
# standardize
# Fit the scaler on the training features only and reuse its mean/variance for the test
# set. Scaling test with its own separately fitted scaler would put the two matrices on
# different scales before the train-fitted projections below are applied to both.
cols_scaled = train.drop(["y"] + cols_cat + cols_targetMean, axis = 1).columns
scaler = StandardScaler().fit(train[cols_scaled])
train_norm = scaler.transform(train[cols_scaled])
# Selecting by cols_scaled also guarantees test columns are in the same order as train's.
test_norm = scaler.transform(test[cols_scaled])

In [12]:
# Number of components kept per method; PCA has its own setting.
n_comp = 12
n_comp_pca = 12

# Build every reducer up front, each with the same fixed seed.
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
pca = PCA(n_components=n_comp_pca, random_state=420)
ica = FastICA(n_components=n_comp, random_state=420)
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)

# Each reducer is fitted on the scaled training matrix; the fitted projection is then
# applied to the scaled test matrix.
# tSVD
tsvd_results_train, tsvd_results_test = tsvd.fit_transform(train_norm), tsvd.transform(test_norm)
# PCA
pca2_results_train, pca2_results_test = pca.fit_transform(train_norm), pca.transform(test_norm)
# ICA
ica2_results_train, ica2_results_test = ica.fit_transform(train_norm), ica.transform(test_norm)
# GRP
grp_results_train, grp_results_test = grp.fit_transform(train_norm), grp.transform(test_norm)
# SRP
srp_results_train, srp_results_test = srp.fit_transform(train_norm), srp.transform(test_norm)


In [13]:
# Append decomposition components to datasets
# One column per method and component, named with the method prefix plus the 1-based component number.
dr_outputs = (
    ('ICA_', ica2_results_train, ica2_results_test),
    ('TSVD_', tsvd_results_train, tsvd_results_test),
    ('GRP_', grp_results_train, grp_results_test),
    ('SRP_', srp_results_train, srp_results_test),
)
for comp_idx in range(n_comp):
    for prefix, comps_train, comps_test in dr_outputs:
        comp_col = prefix + str(comp_idx + 1)
        train[comp_col] = comps_train[:, comp_idx]
        test[comp_col] = comps_test[:, comp_idx]

In [14]:
# PCA components get their own loop because their count (n_comp_pca) is set separately.
for comp_idx in range(n_comp_pca):
    pca_col = 'PCA_' + str(comp_idx + 1)
    train[pca_col] = pca2_results_train[:, comp_idx]
    test[pca_col] = pca2_results_test[:, comp_idx]

In [15]:
# Shapes after the decomposition components were appended.
print(train.shape, test.shape)

(4209, 454) (4209, 453)


## xgboost

In [16]:
# r_2 for xgboost
def r_2(preds, dtrain):
    """Custom evaluation metric: R^2 of `preds` against the labels stored in `dtrain`.

    `dtrain` must expose `get_label()`. Returns the pair ('score', value), the
    (name, value) form in which the metric is handed to `xgb.train` via `feval`.
    """
    y_true = dtrain.get_label()
    score = r2_score(y_true, preds)
    return 'score', score

In [None]:
# y_train_all: the full training target as a plain array (binned for stratification in the next cell)
y_train_all = train["y"].values

In [None]:
k = 8
# bin: quantile-bin the continuous target so the folds can be stratified on it
bin_y = pd.qcut(y_train_all, k, labels = [i for i in range(1, k + 1)]).astype("int64")
# stratified kfold
skf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 888)

# The test features do not depend on the fold, so build them (and their DMatrix) once.
X_test_transform = test.drop(cols_cat, axis = 1)
dmx_test = xgb.DMatrix(X_test_transform)

score_skf = []        # validation R^2 of each fold
preds_skf_test = []   # test predictions of each fold's model
dt_preds_test = []    # the same predictions as frames keyed by test ID
for i, (ind_train, ind_valid) in enumerate(skf.split(train, bin_y)):
    # X, y
    X_train, X_valid = train.iloc[ind_train], train.iloc[ind_valid]

    # featEng: the features computed on the full data set above are used as they are.
    # A disabled variant of this loop instead recomputed getTargetMean(..., 1) and
    # getOutlierMarker(..., 600) from X_train alone for the validation and test frames.
    y_train_transform, y_valid_transform = X_train.y.values, X_valid.y.values
    X_train_transform = X_train.drop(["y"] + cols_cat, axis = 1)
    X_valid_transform = X_valid.drop(["y"] + cols_cat, axis = 1)

    # xgb.DMatrix
    dmx_train = xgb.DMatrix(X_train_transform, label = y_train_transform)
    dmx_valid = xgb.DMatrix(X_valid_transform, label = y_valid_transform)
    ls_watch =  [(dmx_train, 'train'), (dmx_valid, 'eval')]

    # params
    # NOTE(review): newer xgboost releases call this objective "reg:squarederror" -
    # confirm against the installed version before renaming.
    params_xgb = {
        "objective": "reg:linear"
        , "booster": "gbtree"
        , "learning_rate": 0.005
        , "subsample": .9
        # "colsample" is not an xgboost parameter, so the column subsampling never took
        # effect; "colsample_bytree" is the per-tree column sampling ratio.
        , "colsample_bytree": .8
        , "max_depth": 2
        , "alpha": 1
        , "lambda": 2
        , "gamma": 20
        # start boosting from this fold's mean target
        , "base_score": np.mean(y_train_transform)
    }

    # model
    model_xgb = xgb.train(params_xgb, dmx_train, evals = ls_watch
                          , num_boost_round = 5000
                          , feval = r_2, maximize = True, early_stopping_rounds = 50
                          , verbose_eval = False
                         )

    # predict
    # NOTE(review): depending on the xgboost version, predict() may use every trained
    # round rather than the best early-stopping iteration - confirm.
    preds_valid = model_xgb.predict(dmx_valid)
    preds_test = model_xgb.predict(dmx_test)
    preds_skf_test.append(preds_test)
    dt_preds_test.append(pd.DataFrame({"ID": X_test_transform.ID
                                      , "preds_y_" + str(i): preds_test}))
    # score
    score_skf_valid = r2_score(y_valid_transform, preds_valid)
    print('Fold %d: Score %f'%(i, score_skf_valid))
    score_skf.append(score_skf_valid)

# predict test: average of the per-fold test predictions, each weighted by its fold's validation R^2
preds_test = np.sum(np.transpose(np.multiply(np.transpose(np.array(preds_skf_test)), np.array(score_skf))), axis = 0) / np.sum(score_skf)
# final score
score_mean = np.mean(score_skf)
score_sd = np.std(score_skf)
print('=====================')

print('Final Score %f'%score_mean, '; sd %f'%score_sd)

print('=====================')

In [None]:
# importance
# model_xgb is whatever the cross-validation loop assigned last, i.e. the final fold's booster.
importance_fig, importance_ax = plt.subplots(figsize = (12, 18))
xgb.plot_importance(model_xgb, ax = importance_ax, height = 0.8, max_num_features = 50)
plt.show()

In [None]:
# Display the score-weighted blend of the per-fold test predictions.
preds_test

In [None]:
# Peek at the test predictions stored for the fold with index 3.
dt_preds_test[3].head()

In [None]:
# submit
# IDs are taken from the first fold's prediction frame (every fold stores the test IDs);
# y is the score-weighted blend across folds.
submission_columns = {"ID": dt_preds_test[0]["ID"], "y": preds_test}
dt_submit = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file (ID, y) without the frame's index column.
dt_submit.to_csv("../../data/Mercedes_Benz_Greener_Manufacturing/submission/35_returnToPython_skf8_weightedPrediction_base_features_withTargetMeanFullOutsideSkf_12DR.csv", index = False)

In [1]:
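# Scores for xgboost on each combination of DR (dimension-reduction) components, as (key, score) pairs; each key is a comma-separated list of method names. NOTE(review): the 'nmf' and 'fag' components are not produced by any cell visible in this notebook, so these numbers appear to come from a separate run - confirm their origin.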
xgb_dr_scores = [('pca', 0.67711803427267436), ('ica', 0.67586920454698141), ('tsvd', 0.6773481959197214), ('grp', 0.66224169180347559), ('srp', 0.66096135085216134), ('nmf', 0.67108736044682338), ('fag', 0.65904803827641634), ('pca,ica', 0.68245377669904106), ('pca,tsvd', 0.68279088835428392), ('pca,grp', 0.6787867994366088), ('pca,srp', 0.67792025638275744), ('pca,nmf', 0.67824271112992407), ('pca,fag', 0.67760115601615789), ('ica,tsvd', 0.6835942695318834), ('ica,grp', 0.67714387407053067), ('ica,srp', 0.67588373704965066), ('ica,nmf', 0.67972571931753245), ('ica,fag', 0.67610154710950088), ('tsvd,grp', 0.67832326077226301), ('tsvd,srp', 0.67772724917614169), ('tsvd,nmf', 0.68019973796858046), ('tsvd,fag', 0.67644006589507888), ('grp,srp', 0.6639663861228402), ('grp,nmf', 0.67298350211656932), ('grp,fag', 0.66285185530531221), ('srp,nmf', 0.67115462237246204), ('srp,fag', 0.66067639917962073), ('nmf,fag', 0.67045554585744727), ('pca,ica,tsvd', 0.68654114499013574), ('pca,ica,grp', 0.6845591569950219), ('pca,ica,srp', 0.68341568938247299), ('pca,ica,nmf', 0.68297606561083835), ('pca,ica,fag', 0.68308499474673812), ('pca,tsvd,grp', 0.68378339313413505), ('pca,tsvd,srp', 0.68356776992801049), ('pca,tsvd,nmf', 0.68352377920860341), ('pca,tsvd,fag', 0.68279271098255034), ('pca,grp,srp', 0.67966645954138882), ('pca,grp,nmf', 0.67943469252666477), ('pca,grp,fag', 0.67866040661124249), ('pca,srp,nmf', 0.67956276141791694), ('pca,srp,fag', 0.67791893370687739), ('pca,nmf,fag', 0.67810864643697355), ('ica,tsvd,grp', 0.6836705557289231), ('ica,tsvd,srp', 0.68444852137165446), ('ica,tsvd,nmf', 0.68499119061570446), ('ica,tsvd,fag', 0.68291988532496473), ('ica,grp,srp', 0.67826068686261565), ('ica,grp,nmf', 0.6803462107453162), ('ica,grp,fag', 0.67798837067985895), ('ica,srp,nmf', 0.67967578214024238), ('ica,srp,fag', 0.67733790019653739), ('ica,nmf,fag', 0.67949754170543986), ('tsvd,grp,srp', 0.67923008622903458), ('tsvd,grp,nmf', 0.68049437195624363), ('tsvd,grp,fag', 
0.67884475754668472), ('tsvd,srp,nmf', 0.68040112239110107), ('tsvd,srp,fag', 0.67760402916832718), ('tsvd,nmf,fag', 0.67878244082676908), ('grp,srp,nmf', 0.67330304619738812), ('grp,srp,fag', 0.66499311514336668), ('grp,nmf,fag', 0.67272305180560688), ('srp,nmf,fag', 0.67060795522667327), ('pca,ica,tsvd,grp', 0.6879912555299923), ('pca,ica,tsvd,srp', 0.68824790746797515), ('pca,ica,tsvd,nmf', 0.6874142163999073), ('pca,ica,tsvd,fag', 0.6874916336682666), ('pca,ica,grp,srp', 0.68495477428381735), ('pca,ica,grp,nmf', 0.68444395334924479), ('pca,ica,grp,fag', 0.68445968450240868), ('pca,ica,srp,nmf', 0.68359817404064549), ('pca,ica,srp,fag', 0.68427438373791882), ('pca,ica,nmf,fag', 0.68372534635518956), ('pca,tsvd,grp,srp', 0.68522627464232233), ('pca,tsvd,grp,nmf', 0.68482997343399277), ('pca,tsvd,grp,fag', 0.68402325359793226), ('pca,tsvd,srp,nmf', 0.68451110170905172), ('pca,tsvd,srp,fag', 0.68376345836481944), ('pca,tsvd,nmf,fag', 0.68398845947727116), ('pca,grp,srp,nmf', 0.68057096050452826), ('pca,grp,srp,fag', 0.67936781899772836), ('pca,grp,nmf,fag', 0.67993008733246818), ('pca,srp,nmf,fag', 0.67880641218391258), ('ica,tsvd,grp,srp', 0.68585503201750986), ('ica,tsvd,grp,nmf', 0.68589078222575661), ('ica,tsvd,grp,fag', 0.68442858274152085), ('ica,tsvd,srp,nmf', 0.68625869472926981), ('ica,tsvd,srp,fag', 0.68459204233889182), ('ica,tsvd,nmf,fag', 0.68543928703685419), ('ica,grp,srp,nmf', 0.68184531802094139), ('ica,grp,srp,fag', 0.67948113256494302), ('ica,grp,nmf,fag', 0.68125384869666428), ('ica,srp,nmf,fag', 0.68044986028224808), ('tsvd,grp,srp,nmf', 0.68173625535126525), ('tsvd,grp,srp,fag', 0.67963365320087199), ('tsvd,grp,nmf,fag', 0.68106208115398559), ('tsvd,srp,nmf,fag', 0.68015617202501455), ('grp,srp,nmf,fag', 0.67436036079206474), ('pca,ica,tsvd,grp,srp', 0.68874171502676662), ('pca,ica,tsvd,grp,nmf', 0.68836418911595743), ('pca,ica,tsvd,grp,fag', 0.68823671222257321), ('pca,ica,tsvd,srp,nmf', 0.68718360137081413), ('pca,ica,tsvd,srp,fag', 
0.68784143657029118), ('pca,ica,tsvd,nmf,fag', 0.68702839094409329), ('pca,ica,grp,srp,nmf', 0.68542880395930061), ('pca,ica,grp,srp,fag', 0.68580831435591105), ('pca,ica,grp,nmf,fag', 0.68520397140448863), ('pca,ica,srp,nmf,fag', 0.6838153499053613), ('pca,tsvd,grp,srp,nmf', 0.68475418776952801), ('pca,tsvd,grp,srp,fag', 0.68416888261697117), ('pca,tsvd,grp,nmf,fag', 0.68439534926539514), ('pca,tsvd,srp,nmf,fag', 0.68369054844962107), ('pca,grp,srp,nmf,fag', 0.68109487520163592), ('ica,tsvd,grp,srp,nmf', 0.68703175972715047), ('ica,tsvd,grp,srp,fag', 0.68605627429688421), ('ica,tsvd,grp,nmf,fag', 0.68625025013109198), ('ica,tsvd,srp,nmf,fag', 0.68508823305152089), ('ica,grp,srp,nmf,fag', 0.6818608839534277), ('tsvd,grp,srp,nmf,fag', 0.68121711681246844), ('pca,ica,tsvd,grp,srp,nmf', 0.68899654938766397), ('pca,ica,tsvd,grp,srp,fag', 0.68892201689977894), ('pca,ica,tsvd,grp,nmf,fag', 0.68839400442292953), ('pca,ica,tsvd,srp,nmf,fag', 0.68827193462597602), ('pca,ica,grp,srp,nmf,fag', 0.6851006771786381), ('pca,tsvd,grp,srp,nmf,fag', 0.68488975768813187), ('ica,tsvd,grp,srp,nmf,fag', 0.68648750730778907), ('pca,ica,tsvd,grp,srp,nmf,fag', 0.68823319136312078)]

In [5]:
# Pick the (combination, score) pair with the highest score.
max(xgb_dr_scores, key = lambda item:item[1])

('pca,ica,tsvd,grp,srp,nmf', 0.688996549387664)