In [1]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# For training model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

#for analysis
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm


import warnings

In [6]:
# Load the training data
train = pd.read_csv("30-days-of-ml/train_folds.csv")
test = pd.read_csv("30-days-of-ml/test.csv")
sample_submission=pd.read_csv("30-days-of-ml/sample_submission.csv")

# Preview the data
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,4
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,0
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,1
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,0


In [9]:
useful_cols=[column for column in train.columns if column not in ('id', 'target', 'kfold')]

In [10]:
train[useful_cols]

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,C,N,...,0.610706,0.400361,0.160266,0.310921,0.389470,0.267559,0.237281,0.377873,0.322401,0.869850
1,B,B,A,A,B,D,A,F,A,O,...,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
2,A,A,A,C,B,D,A,D,A,F,...,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
3,B,B,A,C,B,D,A,E,C,K,...,0.284667,0.668980,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682
4,A,A,A,C,B,D,A,E,A,N,...,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,B,B,A,A,B,D,A,E,A,I,...,0.307883,0.769792,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404
299996,A,B,A,C,B,B,A,E,E,F,...,0.736713,0.528056,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611
299997,B,B,A,C,B,C,A,E,G,F,...,0.277074,0.688747,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732
299998,A,B,A,C,B,B,A,E,E,I,...,0.805963,0.344404,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030


In [15]:
 # List of categorical columns
object_cols = [col for col in useful_cols if 'cat' in col]
test=test[useful_cols]


# Random Forest

In [18]:
last_scores=[]
for fold in range(5):
    X_train = train[train.kfold!=fold].reset_index(drop=True)
    X_valid= train[train.kfold==fold].reset_index(drop=True)
    X_test=test.copy()
    
    y_train=X_train.target
    y_valid=X_valid.target
    
    X_train=X_train[useful_cols]
    X_valid=X_valid[useful_cols]
   
    # ordinal-encode categorical columns
    print('Encoding')
    ordinal_encoder = OrdinalEncoder()
    
    X_train[object_cols]=ordinal_encoder.fit_transform(X_train[object_cols])
    X_valid[object_cols]=ordinal_encoder.transform(X_valid[object_cols])
    X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])
    
    print('Training') 
    model = RandomForestRegressor(random_state=fold, verbose=100)
    model.fit(X_train,y_train)
    predictions=model.predict(X_valid)
    predictions_test=model.predict(X_test)
    last_scores.append(predictions_test)
    print(fold, mean_squared_error(y_valid,predictions,squared=False))

Encoding
Training
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 100
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.9s remaining:    0.0s
building tree 2 of 100
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.7s remaining:    0.0s
building tree 3 of 100
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   26.6s remaining:    0.0s
building tree 4 of 100
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   35.5s remaining:    0.0s
building tree 5 of 100
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   44.7s remaining:    0.0s
building tree 6 of 100
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   53.8s remaining:    0.0s
building tree 7 of 100
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.0min remaining:    0.0s
building tree 8 of 100
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.2min remaining:    0.0s
building tree 9 of 100
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  

[Parallel(n_jobs=1)]: Done  79 out of  79 | elapsed: 11.9min remaining:    0.0s
building tree 80 of 100
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 12.1min remaining:    0.0s
building tree 81 of 100
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 12.2min remaining:    0.0s
building tree 82 of 100
[Parallel(n_jobs=1)]: Done  82 out of  82 | elapsed: 12.4min remaining:    0.0s
building tree 83 of 100
[Parallel(n_jobs=1)]: Done  83 out of  83 | elapsed: 12.5min remaining:    0.0s
building tree 84 of 100
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed: 12.7min remaining:    0.0s
building tree 85 of 100
[Parallel(n_jobs=1)]: Done  85 out of  85 | elapsed: 12.8min remaining:    0.0s
building tree 86 of 100
[Parallel(n_jobs=1)]: Done  86 out of  86 | elapsed: 13.0min remaining:    0.0s
building tree 87 of 100
[Parallel(n_jobs=1)]: Done  87 out of  87 | elapsed: 13.2min remaining:    0.0s
building tree 88 of 100
[Parallel(n_jobs=1)]: Done  88 out of  88 | elapsed: 13.3min rem

[Parallel(n_jobs=1)]: Done  77 out of  77 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  78 out of  78 | elapsed:    3.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  79 out of  79 | elapsed:    3.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  82 out of  82 | elapsed:    4.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  83 out of  83 | elapsed:    4.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  85 out of  85 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  86 out of  86 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  87 out of  87 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  88 out of  88 | elapsed:    4.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  89 out of  8

[Parallel(n_jobs=1)]: Done  79 out of  79 | elapsed:   13.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   13.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:   13.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  82 out of  82 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  83 out of  83 | elapsed:   13.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:   13.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  85 out of  85 | elapsed:   14.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  86 out of  86 | elapsed:   14.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  87 out of  87 | elapsed:   14.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  88 out of  88 | elapsed:   14.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  89 out of  89 | elapsed:   14.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   14.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  91 out of  9

[Parallel(n_jobs=1)]: Done  62 out of  62 | elapsed:  9.4min remaining:    0.0s
building tree 63 of 100
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:  9.5min remaining:    0.0s
building tree 64 of 100
[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed:  9.7min remaining:    0.0s
building tree 65 of 100
[Parallel(n_jobs=1)]: Done  65 out of  65 | elapsed:  9.8min remaining:    0.0s
building tree 66 of 100
[Parallel(n_jobs=1)]: Done  66 out of  66 | elapsed: 10.0min remaining:    0.0s
building tree 67 of 100
[Parallel(n_jobs=1)]: Done  67 out of  67 | elapsed: 10.1min remaining:    0.0s
building tree 68 of 100
[Parallel(n_jobs=1)]: Done  68 out of  68 | elapsed: 10.3min remaining:    0.0s
building tree 69 of 100
[Parallel(n_jobs=1)]: Done  69 out of  69 | elapsed: 10.4min remaining:    0.0s
building tree 70 of 100
[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed: 10.6min remaining:    0.0s
building tree 71 of 100
[Parallel(n_jobs=1)]: Done  71 out of  71 | elapsed: 10.7min rem

[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  61 out of  61 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  62 out of  62 | elapsed:    3.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:    3.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  65 out of  6

[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:    9.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:    9.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed:    9.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed:    9.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    9.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  61 out of  61 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  62 out of  62 | elapsed:   10.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  63 out of  63 | elapsed:   10.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed:   10.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  65 out of  65 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  66 out of  66 | elapsed:   10.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  67 out of  6

[Parallel(n_jobs=1)]: Done  44 out of  44 | elapsed:  6.8min remaining:    0.0s
building tree 45 of 100
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  7.0min remaining:    0.0s
building tree 46 of 100
[Parallel(n_jobs=1)]: Done  46 out of  46 | elapsed:  7.1min remaining:    0.0s
building tree 47 of 100
[Parallel(n_jobs=1)]: Done  47 out of  47 | elapsed:  7.3min remaining:    0.0s
building tree 48 of 100
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  7.4min remaining:    0.0s
building tree 49 of 100
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:  7.6min remaining:    0.0s
building tree 50 of 100
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  7.7min remaining:    0.0s
building tree 51 of 100
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  7.9min remaining:    0.0s
building tree 52 of 100
[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:  8.0min remaining:    0.0s
building tree 53 of 100
[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:  8.2min rem

[Parallel(n_jobs=1)]: Done  29 out of  29 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  31 out of  31 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  41 out of  4

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:    5.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:    5.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:    5.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    5.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:    6.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:    6.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:    6.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    6.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  41 out of  41 | elapsed:    6.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:    6.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  43 out of  43 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  44 out of  4

[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  3.9min remaining:    0.0s
building tree 27 of 100
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  4.1min remaining:    0.0s
building tree 28 of 100
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:  4.2min remaining:    0.0s
building tree 29 of 100
[Parallel(n_jobs=1)]: Done  29 out of  29 | elapsed:  4.4min remaining:    0.0s
building tree 30 of 100
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  4.6min remaining:    0.0s
building tree 31 of 100
[Parallel(n_jobs=1)]: Done  31 out of  31 | elapsed:  4.7min remaining:    0.0s
building tree 32 of 100
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:  4.9min remaining:    0.0s
building tree 33 of 100
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:  5.0min remaining:    0.0s
building tree 34 of 100
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:  5.2min remaining:    0.0s
building tree 35 of 100
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  5.3min rem

KeyboardInterrupt: 

# XGBoost Tunned

In [None]:
xgb_params = {'n_estimators': 10000,
              'learning_rate': 0.25,
              'subsample': 0.926,
              'colsample_bytree': 0.84,
              'max_depth': 2,
              'booster': 'gbtree', 
              'reg_lambda': 45.1,
              'reg_alpha': 34.9,
              'random_state': 42,
              'n_jobs': 4}