In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
##Add decomposed components: PCA / ICA etc.
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold

## 1. Load

In [2]:
# load data
dt_train_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv")
dt_test_raw = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv")

In [3]:
dt_train_raw.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
print(dt_train_raw.shape, dt_test_raw.shape)

(4209, 378) (4209, 377)


## 2. Transform

In [5]:
# # marke train and test set
# dt_train_raw.loc[:, "IsTrainTest"] = pd.Series("train", index = dt_train_raw.index)
# dt_test_raw.loc[:, "IsTrainTest"] = pd.Series("test", index = dt_test_raw.index)

# # change test index and add y
# dt_test_raw.index = dt_test_raw.index + max(dt_train_raw.index) + 1
# dt_test_raw.loc[:,"y"] = pd.Series([0.0] * dt_test_raw.shape[0], index = dt_test_raw.index)

# # concat
# dt_test_raw = dt_test_raw[dt_train_raw.columns.values]
# dt_all_raw = pd.concat([dt_train_raw, dt_test_raw])

In [6]:
dt_all_raw.shape

(8418, 379)

## 3. Encode

In [7]:
# cols_cat
cols_cat = dt_all_raw.drop("IsTrainTest", axis = 1).select_dtypes(include = ['object']).columns.values
dt_cat = dt_all_raw[cols_cat]

In [8]:
for c in cols_cat:
    x = list(set(dt_cat[c].values))
    x.sort()
    x.sort(key = len)
    dt_labelEncode_c = pd.DataFrame({"Encode_" + c: [i for i in range(1, (len(x) + 1))]
                                     , c: x})
    
    dt_cat = pd.merge(dt_cat, dt_labelEncode_c, on = c)
    

In [9]:
dt_cat.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,Encode_X0,Encode_X1,Encode_X2,Encode_X3,Encode_X4,Encode_X5,Encode_X6,Encode_X8
0,k,v,at,a,d,u,j,o,11,22,45,1,4,20,10,15
1,o,l,ae,f,d,p,j,o,15,12,30,6,4,15,10,15
2,bc,v,ac,f,d,p,j,o,53,22,28,6,4,15,10,15
3,ak,v,ak,f,d,ac,j,o,36,22,36,6,4,28,10,15
4,s,aa,ay,g,d,ac,j,o,19,26,50,7,4,28,10,15


In [10]:
dt_all_raw = dt_all_raw.drop(cols_cat, axis = 1).join(dt_cat.filter(regex = "Encode"))

In [11]:
dt_all_raw.head()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X385,IsTrainTest,Encode_X0,Encode_X1,Encode_X2,Encode_X3,Encode_X4,Encode_X5,Encode_X6,Encode_X8
0,0,130.81,0,0,0,1,0,0,0,0,...,0,train,11,22,45,1,4,20,10,15
1,6,88.53,0,0,0,0,0,0,0,0,...,0,train,15,12,30,6,4,15,10,15
2,7,76.26,0,0,0,0,0,0,0,1,...,0,train,53,22,28,6,4,15,10,15
3,9,80.62,0,0,0,0,0,0,0,0,...,0,train,36,22,36,6,4,28,10,15
4,13,78.02,0,0,0,0,0,0,0,0,...,0,train,19,26,50,7,4,28,10,15


## 3. Decomposition

In [12]:
# # range
# pp_range = MinMaxScaler()
# mx_range = pp_range.fit_transform(dt_all_raw.drop(["y", "IsTrainTest"], axis = 1))
# dt_all_raw = dt_all_raw[["y", "IsTrainTest"]].join(pd.DataFrame(mx_range, columns = dt_all_raw.drop(["y", "IsTrainTest"], axis = 1).columns.values))
# dt_all_raw.shape

In [13]:
dt_all_raw.head()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X385,IsTrainTest,Encode_X0,Encode_X1,Encode_X2,Encode_X3,Encode_X4,Encode_X5,Encode_X6,Encode_X8
0,0,130.81,0,0,0,1,0,0,0,0,...,0,train,11,22,45,1,4,20,10,15
1,6,88.53,0,0,0,0,0,0,0,0,...,0,train,15,12,30,6,4,15,10,15
2,7,76.26,0,0,0,0,0,0,0,1,...,0,train,53,22,28,6,4,15,10,15
3,9,80.62,0,0,0,0,0,0,0,0,...,0,train,36,22,36,6,4,28,10,15
4,13,78.02,0,0,0,0,0,0,0,0,...,0,train,19,26,50,7,4,28,10,15


In [14]:
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_all = tsvd.fit_transform(dt_all_raw.drop(["y", "IsTrainTest"], axis=1))

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_all = pca.fit_transform(dt_all_raw.drop(["y", "IsTrainTest"], axis=1))

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_all = ica.fit_transform(dt_all_raw.drop(["y", "IsTrainTest"], axis=1))

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_all = grp.fit_transform(dt_all_raw.drop(["y", "IsTrainTest"], axis=1))

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_all = srp.fit_transform(dt_all_raw.drop(["y", "IsTrainTest"], axis=1))

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    dt_all_raw['pca_' + str(i)] = tsvd_results_all[:,i-1]

    dt_all_raw['ica_' + str(i)] = pca2_results_all[:,i-1]

    dt_all_raw['tsvd_' + str(i)] = ica2_results_all[:,i-1]

    dt_all_raw['grp_' + str(i)] = grp_results_all[:,i-1]

    dt_all_raw['srp_' + str(i)] = srp_results_all[:,i-1]

y_train = dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"]["y"]
y_mean = np.mean(y_train)

In [15]:
dt_all_raw.head()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,pca_11,ica_11,tsvd_11,grp_11,srp_11,pca_12,ica_12,tsvd_12,grp_12,srp_12
0,0,130.81,0,0,0,1,0,0,0,0,...,2.515031,-0.930268,-0.00631,3.28768,-1.272022,0.153663,1.019926,-0.015611,-10.294343,0.0
1,6,88.53,0,0,0,0,0,0,0,0,...,0.235961,-1.443387,-0.006462,1.424336,-2.544045,-1.203211,-0.686309,0.003356,-5.953444,0.0
2,7,76.26,0,0,0,0,0,0,0,1,...,2.775975,-2.120312,-0.011648,-10.721902,-3.816067,0.912201,2.327417,-0.027712,-22.255752,-1.272022
3,9,80.62,0,0,0,0,0,0,0,0,...,1.768263,-2.291198,-0.004773,-3.492486,-1.272022,1.142124,2.663389,-0.027521,-27.742691,-1.272022
4,13,78.02,0,0,0,0,0,0,0,0,...,1.437391,-2.237315,-0.002673,3.845533,-1.272022,1.431072,2.835668,-0.028378,-25.663512,-1.272022


## 4. Feature Engineering

### 4.1 Outlier Maker

In [None]:
def getOutlierMarker(dt, index_outlier = 883):
    # outlier
    dt_outlier = dt.loc[dt.index.values == index_outlier]
    # calc int_outlierMarker
    int_outlierMarker = np.zeros(dt.shape[0])
    for col in dt_outlier.columns.values:
        for val in dt_outlier[col].values:
            int_outlierMarker = int_outlierMarker + (dt[col].values == val).astype("int64")
    
    return int_outlierMarker

#### 4.1.1 OutlierMarker_Cat

In [None]:
cols_cat = dt_all_cleaned.drop(["ID", "y", "IsTrainTest"], axis = 1).select_dtypes(include = ['object']).columns.values
cols_cat

In [None]:
int_outlierMarker_cat = getOutlierMarker(dt_all_cleaned[cols_cat])

#### 4.1.2 OutlierMarker_Int

In [None]:
cols_int = dt_all_cleaned.drop(["y", "IsTrainTest"], axis = 1).select_dtypes(include = ['int64']).columns.values
cols_int[:10]

In [None]:
int_outlierMarker_int = getOutlierMarker(dt_all_cleaned[cols_int])

#### 4.1.3 OutlierMarker_All

In [None]:
cols_all = dt_all_cleaned.drop(["y", "IsTrainTest"], axis = 1).columns.values
cols_all[:10]

In [None]:
int_outlierMarker_all = getOutlierMarker(dt_all_cleaned[cols_all])

#### 4.1.4 OutlierMarker_X0

In [None]:
cols_X0 = "X0"
int_outlierMarker_x0 = getOutlierMarker(dt_all_cleaned[cols_X0].to_frame())

In [None]:
dt_all_eng.loc[:, "FeatEng_OutlierMarker_Cat"] = int_outlierMarker_cat
dt_all_eng.loc[:, "FeatEng_OutlierMarker_Int"] = int_outlierMarker_int
dt_all_eng.loc[:, "FeatEng_OutlierMarker_All"] = int_outlierMarker_all
dt_all_eng.loc[:, "FeatEng_OutlierMarker_X0"] = int_outlierMarker_x0

In [None]:
dt_all_eng.head()

## 5. xgboost

In [16]:
# r_2 for xgboost
def r_2(preds, dtrain):
    labels = dtrain.get_label()
    return 'score', r2_score(labels, preds)

In [17]:
params_xgb = {
    'n_trees': 520,
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.98,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

In [18]:
# xgbDmatrix
dtrain = xgb.DMatrix(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "train"].drop(["y", "IsTrainTest"], axis=1), y_train)
dtest = xgb.DMatrix(dt_all_raw.loc[dt_all_raw["IsTrainTest"] == "test"].drop(["y", "IsTrainTest"], axis=1))

In [19]:
# cv
cv_xgb = xgb.cv(params_xgb, dtrain
                , num_boost_round = 5000
                , nfold = 10
                , feval = r_2, maximize = True, early_stopping_rounds = 50
                , show_stdv = True, verbose_eval = 50, seed = 888)

[0]	train-rmse:12.6514+0.0889206	train-score:0.0052506+6.01867e-05	test-rmse:12.6291+0.778512	test-score:0.0026855+0.00293256
[50]	train-rmse:11.2211+0.0897473	train-score:0.217474+0.00213483	test-rmse:11.2618+0.816861	test-score:0.207792+0.019268
[100]	train-rmse:10.1916+0.0921852	train-score:0.354476+0.00348496	test-rmse:10.3023+0.856574	test-score:0.337513+0.0311377
[150]	train-rmse:9.46312+0.0959265	train-score:0.443465+0.00441368	test-rmse:9.64801+0.896142	test-score:0.419217+0.0395517
[200]	train-rmse:8.95468+0.0995465	train-score:0.501663+0.00502357	test-rmse:9.2135+0.933736	test-score:0.470416+0.0458668
[250]	train-rmse:8.60186+0.101588	train-score:0.540159+0.00536719	test-rmse:8.92747+0.965633	test-score:0.502735+0.0507717
[300]	train-rmse:8.35447+0.102537	train-score:0.566226+0.0055668	test-rmse:8.73574+0.987344	test-score:0.523775+0.0539318
[350]	train-rmse:8.17921+0.103116	train-score:0.584234+0.00567128	test-rmse:8.61432+1.00468	test-score:0.53681+0.0564645
[400]	train-rms