In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 5. Model Selection

In [None]:
train_data = pd.read_csv('train_data_processed.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,HOME_VAL,TRAVTIME,BLUEBOOK,TIF,...,CAR_TYPE_Sports Car,CAR_TYPE_Van,OCCUPATION_Clerical,OCCUPATION_Doctor,OCCUPATION_Home Maker,OCCUPATION_Lawyer,OCCUPATION_Manager,OCCUPATION_Professional,OCCUPATION_Student,TGT_CLAIM_FLAG
0,0,-0.388611,-0.623442,0.188397,-2.525708,-1.270714,-1.133206,-0.199425,-1.190996,1.53652,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,1,-0.388611,0.199783,-0.733635,-1.054674,0.048452,-1.133206,-0.669646,0.530708,1.277411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,-0.388611,-1.446667,1.110429,-2.525708,-1.270714,-1.133206,-0.333774,-0.381143,-1.054565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,3,-0.388611,-0.153028,2.032461,-2.525708,-1.270714,-1.133206,1.88298,-0.329529,1.277411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,4,-0.388611,-1.799478,2.032461,1.397051,0.16545,0.602948,1.144062,-1.670271,-1.054565,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
train_data.shape

(11290, 35)

In [None]:
train_data.drop(columns = 'Unnamed: 0' , inplace = True)

In [None]:
train_x = train_data.iloc[: , :-1]
train_y = train_data.iloc[: , -1]

In [None]:
train_x.shape, train_y.shape

((11290, 33), (11290,))

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFE

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from xgboost import XGBClassifier

classifiers = [
    ('Linear SVM', LinearSVC(random_state=0, tol=1e-5)),
    ('RBF SVM', SVC(random_state=0, gamma=2, C=1)),
    ('Decision Tree', DecisionTreeClassifier(random_state=0, max_depth=5)),
    ('Random Forest', RandomForestClassifier(random_state=0, max_depth=5, n_estimators=10, max_features=1)),
    ('AdaBoost', AdaBoostClassifier(random_state=0)),
    ('XGBoost', XGBClassifier(random_state=0)),
]

In [None]:
# Splitting the data into cv and train set
from sklearn.model_selection import train_test_split 
x_train , x_cv , y_train, y_cv = train_test_split(train_x , train_y , test_size = 0.2 , random_state = 0)

In [None]:
performance = []
for name, clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_cv)
    perf_tuple = (name, f1_score(y_cv, y_pred, average='weighted'))
    #print(perf_tuple)
    performance.append(perf_tuple)



In [None]:
performance_

[('Linear SVM', 0.7389591789187615),
 ('RBF SVM', 0.7415885180470208),
 ('Decision Tree', 0.7624063102431518),
 ('Random Forest', 0.7871814024066975),
 ('AdaBoost', 0.8507315360344145),
 ('XGBoost', 0.8519224486205619)]

In [None]:
def sortSecond(val):
    return val[1]

performance.sort(key=sortSecond, reverse=True)

In [None]:
performance            # Choose XGBoost as the key model

[('XGBoost', 0.8519224486205619),
 ('AdaBoost', 0.8507315360344145),
 ('Random Forest', 0.7871814024066975),
 ('Decision Tree', 0.7624063102431518),
 ('RBF SVM', 0.7415885180470208),
 ('Linear SVM', 0.7389591789187615)]

## 7. Final Model and hyper-parameter tuning

In [None]:
param = {
            'max_depth':range(3,12,1),
            'min_child_weight':range(1,12,1),
            'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
            'learning_rate' : [0.1 , 0.01, 0.001, 1, 10, 0.0001 ],
            'gamma' : [0.001, 0.01, 0.1, 1, 10, 100],
}


In [None]:
from sklearn.model_selection import RandomizedSearchCV
rnd_search = RandomizedSearchCV( XGBClassifier(n_estimators = 1000), 
                                random_state=0, 
                                param_distributions = param, 
                                n_iter =10, 
                                cv=5)

rnd_search.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=XGBClassifier(n_estimators=1000),
                   param_distributions={'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                                        'learning_rate': [0.1, 0.01, 0.001, 1,
                                                          10, 0.0001],
                                        'max_depth': range(3, 12),
                                        'min_child_weight': range(1, 12),
                                        'reg_alpha': [0, 0.001, 0.005, 0.01,
                                                      0.05]},
                   random_state=0)

In [None]:
rnd_search.best_params_

{'reg_alpha': 0.05,
 'min_child_weight': 7,
 'max_depth': 8,
 'learning_rate': 0.01,
 'gamma': 0.1}

In [None]:
rnd_search.best_score_

0.8520818816452076