In [1]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported below
warnings.filterwarnings('ignore')
import sys
# put the relative "utils/" directory first on the module search path;
# presumably this is where the project helper modules imported with
# `from ... import *` live -- TODO confirm (resolution depends on the working directory)
sys.path.insert(0, "utils/")
import pickle

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, FastICA, TruncatedSVD, NMF
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import FeatureAgglomeration
from gplearn.genetic import SymbolicRegressor, SymbolicTransformer
from IPython.display import Image
import pydotplus

In [4]:
from scipy.stats import randint as sp_randint

In [5]:
import xgboost as xgb
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.gaussian_process import GaussianProcessRegressor

In [6]:
# utils
from clean import *
from encode import *
from featureEngineer import *
from model import *

## 1. Load

In [7]:
# load data
# read the raw train and test tables (train first, then test) in one pass
dt_train_raw, dt_test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/Mercedes_Benz_Greener_Manufacturing/raw/train.csv",
        "../../data/Mercedes_Benz_Greener_Manufacturing/raw/test.csv",
    )
)

In [8]:
# ids fold
# per-ID table that is later left-joined onto the stacked train/test frame on "ID";
# presumably the cross-validation fold assignment of each row -- TODO confirm against the file
dt_id_folds = pd.read_csv("../../data/Mercedes_Benz_Greener_Manufacturing/folds/dt_id_folds.csv")

In [9]:
# sanity check: (rows, columns) of the raw train table followed by the raw test table
print(dt_train_raw.shape, dt_test_raw.shape)

(4209, 378) (4209, 377)


## 2. Transform

In [10]:
# ids
# remember which IDs belong to which raw table; the stacked frame is split
# back into train and test rows with these arrays in the modelling section
ids_train, ids_test = dt_train_raw["ID"].values, dt_test_raw["ID"].values

In [11]:
# stack the train rows on top of the test rows, then attach the fold table
# to every row by ID (left join: rows whose ID is absent from dt_id_folds are kept)
dt_all = (
    pd.concat([dt_train_raw, dt_test_raw])
    .merge(dt_id_folds, how="left", on="ID")
)

In [12]:
# sanity check: shape of the stacked train + test frame after the fold merge
print(dt_all.shape)

(8418, 379)


## 3. Clean

In [13]:
# duplicated cols in dt_all
# A column is flagged when its values repeat those of an earlier column
# (duplicated() keeps the first occurrence and marks the later copies).
# The transpose + duplicate scan is the expensive part, so the mask is
# built once and reused instead of being computed twice.
mask_dup_cols = dt_all.T.duplicated()
cols_dup_all_toDrop = mask_dup_cols[mask_dup_cols].index.values
dt_all = dt_all.drop(cols_dup_all_toDrop, axis = 1)

In [14]:
# cols
# group the feature columns by dtype, leaving the ID column out:
# int64 columns go to cols_bin, object columns go to cols_cat
dt_without_id = dt_all.drop("ID", axis=1)
cols_bin = dt_without_id.select_dtypes(include=["int64"]).columns.tolist()
cols_cat = dt_without_id.select_dtypes(include=["object"]).columns.tolist()

In [15]:
# project helper run over the int64 columns; it prints pairs of column names
# (see the recorded output) and returns the columns that are dropped in the next cell.
# presumably "Comp" means complementary binary columns -- TODO confirm in the helper
cols_comp = removeCompCols(dt_all, cols_bin)

X128 X130
X156 X157
X204 X205
X232 X263


In [16]:
# remove the columns returned by removeCompCols
dt_all = dt_all.drop(cols_comp, axis = 1)

In [17]:
# cols
# rebuild cols_bin so it no longer lists the columns that were just dropped
cols_bin = dt_all.drop("ID", axis=1).select_dtypes(include=["int64"]).columns.tolist()

In [18]:
# sanity check: shape after the duplicated columns and the cols_comp columns were dropped
print(dt_all.shape)

(8418, 339)


## 4. Encode

### 4.1 OHE

In [19]:
# one-hot encode the object-typed columns
dt_cat_onehot = pd.get_dummies(dt_all[cols_cat])
# old name -> new name, every dummy column gets the "Encode_ohe_" prefix
dict_ohe = dict((col_name, "Encode_ohe_" + col_name) for col_name in dt_cat_onehot.columns.values)
dt_cat_onehot = dt_cat_onehot.rename(columns=dict_ohe)
# append the prefixed dummy columns to the main frame (aligned on the row index)
dt_all = dt_all.join(dt_cat_onehot)

In [20]:
# sanity check: shape after the one-hot columns were appended
dt_all.shape

(8418, 550)

### 4.2 Ordered Label

In [21]:
# project helper applied to the object-typed columns; the recorded shapes show
# it adds 8 columns (550 -> 558). presumably one ordered-label encoding per
# categorical column -- TODO confirm in the helper
dt_all = encode_orderedLabel(dt_all, cols_cat)

In [22]:
# sanity check: shape after encode_orderedLabel
dt_all.shape

(8418, 558)

## 5. Feature Engineer

### 5.1 Dimensionality Reduction (DR)

In [23]:
# component count handed to the getDR project helper.
# the recorded shapes show 84 new columns for n_comp = 12 (558 -> 642);
# presumably 12 components from each of several reduction methods -- TODO confirm in the helper
n_comp = 12
dt_all = getDR(dt_all, n_comp)

In [24]:
# sanity check: shape after getDR
dt_all.shape

(8418, 642)

### 5.2 outlierDist

In [25]:
# project helper that receives the train/test ID arrays and both column lists;
# the recorded shapes show it adds 5 columns (642 -> 647).
# NOTE(review): what the new columns measure is not visible here -- see the helper
dt_all = outlierDist(dt_all, ids_train, ids_test, cols_cat, cols_bin)

In [26]:
# sanity check: shape after outlierDist
dt_all.shape

(8418, 647)

## X. Model

In [27]:
# split the engineered frame back into its train and test rows by ID membership
is_train_row = dt_all["ID"].isin(ids_train)
is_test_row = dt_all["ID"].isin(ids_test)
dt_all_train = dt_all.loc[is_train_row]
dt_all_test = dt_all.loc[is_test_row]

In [28]:
# sanity check: shapes of the train partition followed by the test partition
print(dt_all_train.shape, dt_all_test.shape)

(4209, 647) (4209, 647)


### X.1. xgb

In [None]:
# Random search over feature-group switches for the "xgb" model.
# Every trial draws an independent True/False (equal probability) for each
# switch below, hands the resulting mapping to featureSelect, and stores
# whatever featureSelect returns.

# params (the switch names never change, so the list is built once)
params_cols = ["ohe", "label"
               , "dr_tsvd", "dr_pca", "dr_ica", "dr_grp", "dr_srp", "dr_nmf", "dr_fag", "outlierDist"
               , "targetMean", "targetMeanX0", "targetMeanX5", "symbolicTransformer"]

# dedicated, seeded generator: the sequence of sampled switch combinations is
# the same on every run and does not depend on the global numpy RNG state.
# NOTE(review): any randomness inside featureSelect itself is not controlled here
rng_search_xgb = np.random.RandomState(0)

res_all = []
for i in range(1, 100):  # 99 trials
    values_cols = rng_search_xgb.choice([True, False], len(params_cols))
    params_cols_dict = dict(zip(params_cols, values_cols))

    # feat select
    res = featureSelect("xgb", dt_all_train, cols_cat, cols_bin, params_cols_dict)
    res_all.append(res)

# trials are ranked by element 1 of each result; element 0 of the top-ranked
# trial is kept (presumably score and selection respectively -- TODO confirm in featureSelect)
params_cols_dict_best_xgb = max(res_all, key = lambda item: item[1])[0]

In [None]:
# persist the best xgb selection to disk.
# the context manager guarantees the file is flushed and closed once the dump
# finishes (or fails); a bare open() left the handle to the garbage collector
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/params_cols_dict_best_xgb.pkl", "wb") as f_out:
    pickle.dump(params_cols_dict_best_xgb, f_out)

### X.2 elasticNet

In [64]:
# Random search over feature-group switches for the "elasticNet" model.
# Every trial draws an independent True/False (equal probability) for each
# switch below, hands the resulting mapping to featureSelect, and stores
# whatever featureSelect returns.

# params (the switch names never change, so the list is built once)
params_cols = ["ohe", "label"
               , "dr_tsvd", "dr_pca", "dr_ica", "dr_grp", "dr_srp", "dr_nmf", "dr_fag", "outlierDist"
               , "targetMean", "targetMeanX0", "targetMeanX5", "symbolicTransformer"]

# dedicated, seeded generator: the sequence of sampled switch combinations is
# the same on every run and does not depend on the global numpy RNG state.
# NOTE(review): any randomness inside featureSelect itself is not controlled here
rng_search_elasticNet = np.random.RandomState(0)

res_all = []
for i in range(1, 3):  # 2 trials
    values_cols = rng_search_elasticNet.choice([True, False], len(params_cols))
    params_cols_dict = dict(zip(params_cols, values_cols))

    # feat select
    res = featureSelect("elasticNet", dt_all_train, cols_cat, cols_bin, params_cols_dict)
    res_all.append(res)

# trials are ranked by element 1 of each result; element 0 of the top-ranked
# trial is kept (presumably score and selection respectively -- TODO confirm in featureSelect)
params_cols_dict_best_elasticNet = max(res_all, key = lambda item: item[1])[0]

Final Score 0.569637
0.569636500567
Final Score 0.569591
0.569591469044


In [None]:
# persist the best elasticNet selection to disk.
# the context manager guarantees the file is flushed and closed once the dump
# finishes (or fails); a bare open() left the handle to the garbage collector
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/params_cols_dict_best_elasticNet.pkl", "wb") as f_out:
    pickle.dump(params_cols_dict_best_elasticNet, f_out)

### X.3 SVR

In [None]:
# res_all = []
# for i in range(1, 5):
#     # params
#     params_cols = ["ohe", "label"
#                    , "dr_tsvd", "dr_pca", "dr_ica", "dr_grp", "dr_srp", "dr_nmf", "dr_fag", "outlierDist"
#                    , "targetMean", "targetMeanX0", "targetMeanX5", "symbolicTransformer"]
#     values_cols = np.random.choice([True, False], len(params_cols))
#     params_cols_dict = dict(zip(params_cols, values_cols))
    
#     # feat select
#     res = featureSelect("svr", dt_all_train, cols_cat, cols_bin, params_cols_dict)
#     res_all.append(res)
# params_cols_dict_best_svr = max(res_all, key = lambda item: item[0][1])[0]

### X.X Blending

In [29]:
# params_cols
# reload the stored selections from the same relative location the dump cells
# write to, instead of an absolute path that only exists on one machine.
# pickle is already imported in the first cell of the notebook.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load files that were produced by this notebook.
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/params_cols_dict_best_xgb.pkl", "rb") as f_in:
    params_cols_dict_best_xgb = pickle.load(f_in)
with open("../../data/Mercedes_Benz_Greener_Manufacturing/data/params_cols_dict_best_elasticNet.pkl", "rb") as f_in:
    params_cols_dict_best_elasticNet = pickle.load(f_in)

In [30]:
# refit each model and keep only the third element returned by featureSelect
# (the prediction frame that is merged in the next cell).
# NOTE(review): the stored selections are indexed with [0], so the loaded
# objects are expected to be sequences whose first item is the switch mapping;
# confirm this matches what the search cells store.
best_cols_xgb = params_cols_dict_best_xgb[0]
_, _, dt_preds_xgb = featureSelect(
    "xgb", dt_all_train, cols_cat, cols_bin, best_cols_xgb
)

best_cols_elasticNet = params_cols_dict_best_elasticNet[0]
_, _, dt_preds_elasticNet = featureSelect(
    "elasticNet", dt_all_train, cols_cat, cols_bin, best_cols_elasticNet
)

# knn has no selection of its own and reuses the xgb one
_, _, dt_preds_knn = featureSelect(
    "knn", dt_all_train, cols_cat, cols_bin, best_cols_xgb
)

Final Score 0.572045
0.572045317979
Final Score 0.570877
0.570877330891
Final Score 0.555127
0.555126914398


In [40]:
# line up the three prediction frames on (ID, y).
# with the default merge suffixes the xgb predictions end up in "preds_x",
# the elasticNet ones in "preds_y", and the knn ones keep the plain "preds" name
dt_blending = (
    dt_preds_xgb
    .merge(dt_preds_elasticNet, on=["ID", "y"])
    .merge(dt_preds_knn, on=["ID", "y"])
)

In [51]:
# fixed blending weights: xgb ("preds_x"), elasticNet ("preds_y"), knn ("preds").
# knn is weighted 0, so it contributes nothing to the blend
weight_xgb, weight_elasticNet, weight_knn = .6, .4, 0
preds_blending = (
    dt_blending["preds_x"] * weight_xgb
    + dt_blending["preds_y"] * weight_elasticNet
    + dt_blending["preds"] * weight_knn
)

In [52]:
# R^2 of the blended predictions against the y column of the merged frame
r2_score(dt_blending.y.values, preds_blending)

0.57437178318185378