# Import Packages

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set(style='darkgrid')
import math
import itertools
%matplotlib inline
%config IPCompleter.greedy=True

# load data
df = pd.read_csv('df_model_trimmed.csv')

# regressors
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

# classifiers
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Load Data

In [0]:
df = pd.read_csv('df_model_trimmed.csv')

In [0]:
# # drop the 'class' dummies
droplist = ['class_bonds','class_cnstr_engr','class_cpr','class_fire','class_health',
            'class_mac','class_mahl','class_motor','class_others','class_pa',
            'class_prof_indm','class_pub_lia','class_wic']
df.drop(columns=droplist,inplace=True)
df.shape

(4347, 20)

In [0]:
df['classification'] = [0 if i<0 else 1 for i in df['auwgr']]

In [0]:
df['year'].value_counts().sort_index()

2005    311
2006    312
2007    308
2008    301
2009    307
2010    302
2011    328
2012    328
2013    313
2014    296
2015    298
2016    310
2017    316
2018    317
Name: year, dtype: int64

In [0]:
df.describe()[['prem_write_net_lag1']]

Unnamed: 0,prem_write_net_lag1
count,4347.0
mean,3098902.0
std,4430762.0
min,490.0
25%,441273.7
50%,1366135.0
75%,3918170.0
max,33936220.0


# Linear Regression on each Insurance Class

In [0]:
df = pd.read_csv('df_model_trimmed.csv')
df['classification'] = [0 if i<0 else 1 for i in df['auwgr']]

# df = df[df['prem_write_net_lag1']>1.366135e+06]

# df = df[df['year']==2018]
# df = df[df['year']==2017]
# df = df[df['year']==2016]
# df = df[df['year']==2015]
# df = df[df['year']==2014]
# df = df[df['year']==2013]
# df = df[df['year']==2012]
# df = df[df['year']==2011]
# df = df[df['year']==2010]
# df = df[df['year']==2009]
# df = df[df['class_bonds']==1]
# df = df[df['class_cnstr_engr']==1]
# df = df[df['class_cpr']==1]
# df = df[df['class_fire']==1]
# df = df[df['class_health']==1]
# df = df[df['class_mac']==1]
# df = df[df['class_mahl']==1]
# df = df[df['class_motor']==1]
# df = df[df['class_others']==1]
# df = df[df['class_pa']==1]
# df = df[df['class_prof_indm']==1]
# df = df[df['class_pub_lia']==1]
# df = df[df['class_wic']==1]
df.shape
features = [col for col in df._get_numeric_data().columns if (col != 'auwgr') and (col != 'year')]
X = df[features]
y = df['auwgr']
print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)
linreg = LinearRegression()
print('\nLINEAR REG cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(linreg, X_train_sc, y_train,cv=3).mean())

(4347, 31)
(4347,)
(3260, 31) (1087, 31)
(3260,) (1087,)

LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.431461374782723


With the above section code, we take turns to isolate the year and class of insurance and then run the linear regressor to see the cross_val_score. The score does not exceed 0.44. Therefore, even on a insurance class-by-class basis, it appears that the data points cannot be regressed. 


# Get Features

In [0]:
features = [col for col in df._get_numeric_data().columns if (col != 'auwgr') and (col != 'year')]
print(features)

['lkpp', 'hlr_lag1', 'hlr_lag2', 'hlr_lag3', 'hlr_lag4', 'hlr_lag5', 'mer', 'der', 'oer', 'prem_write_net_lag1', 'claim_set_net_lag1', 'exp_management_lag1', 'exp_comm_incur_net_lag1', 'exp_other_lag1', 'prem_liab_diff_lag1', 'claim_liab_diff_lag1', 'uw_gain_lag1', 'classification']


In [0]:
X = df[features]
y = df['auwgr']
print(X.shape)
print(y.shape)

(4347, 18)
(4347,)


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)

(3260, 18) (1087, 18)
(3260,) (1087,)


In [0]:
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [0]:
linreg = LinearRegression()
print('\nLINEAR REG cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(linreg, X_train_sc, y_train,cv=3).mean())


LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.4328202717008455


# RFECV - Linear Regression

In [0]:
estimator = linreg
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_train_sc, y_train)
selector.ranking_

array([4, 1, 1, 2, 1, 3, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Use selected features

In [0]:
selected_features = features.copy()
selected_features.remove('lkpp')
selected_features.remove('hlr_lag3')
selected_features.remove('hlr_lag5')
selected_features.remove('oer')

X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [0]:
linreg = LinearRegression()
print('\nLINEAR REG cross-val mean score:')
print('X-Val score MEAN using X_train\t\t',cross_val_score(linreg, X_train_sc, y_train,cv=3).mean())
print('X-Val score MEAN using X_train\t\t',cross_val_score(linreg, X_test_sc, y_test,cv=3).mean())


LINEAR REG cross-val mean score:
X-Val score MEAN using X_train		 0.43328519227600687
X-Val score MEAN using X_train		 0.4552348633216165


# RFECV - Extra Trees Regression

In [0]:
X = df[features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

(3260, 18) (1087, 18)
(3260,) (1087,)


In [0]:
estimator = ExtraTreesRegressor(bootstrap=True,oob_score=True,warm_start=False,n_estimators=100)
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_train_sc, y_train)
selector.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Use selected features

In [0]:
selected_features = features.copy()
selected_features.remove('oer')

X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [0]:
etreg = ExtraTreesRegressor(bootstrap=True,oob_score=True,warm_start=False,n_estimators=100)
etreg.fit(X_train_sc,y_train) # Use un-scaled data
# Evaluate model
print(etreg.score(X_train_sc,y_train))
print(etreg.score(X_test_sc,y_test))

0.9300579919251896
0.540305699955996


# RFECV - Gradient Boosting Regression

In [0]:
X = df[features]
y = df['auwgr']
X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
print(X_full_train.shape,X_full_test.shape)
print(y_full_train.shape,y_full_test.shape)
ss = StandardScaler()
ss.fit(X_full_train)
X_full_train_sc = ss.transform(X_full_train)
X_full_test_sc = ss.transform(X_full_test)

(3260, 18) (1087, 18)
(3260,) (1087,)


## loss function = 'ls'

In [0]:
estimator = GradientBoostingRegressor(loss='ls',random_state=42)
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_full_train_sc, y_full_train)
print(selector.ranking_)

[1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1]


### Use selected features

In [0]:
selected_features = features.copy()
selected_features.remove('oer')

In [0]:
X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

gbreg = GradientBoostingRegressor(loss='ls',random_state=42)
gbreg.fit(X_train_sc,y_train)
print(gbreg.score(X_train_sc,y_train))
print(gbreg.score(X_test_sc,y_test))

(3260, 17) (1087, 17)
(3260,) (1087,)
0.6338478755036308
0.5252429724440713


## loss function = 'huber'

In [0]:
estimator = GradientBoostingRegressor(loss='huber',random_state=42)
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_full_train_sc, y_full_train)
print(selector.ranking_)

[1 1 5 6 2 1 1 1 8 1 1 1 1 1 7 4 3 1]


### Use selected features

In [0]:
selected_features = features.copy()
selected_features.remove('hlr_lag2')
selected_features.remove('hlr_lag3')
selected_features.remove('hlr_lag4')
selected_features.remove('oer')
selected_features.remove('exp_other_lag1')
selected_features.remove('prem_liab_diff_lag1')
selected_features.remove('claim_liab_diff_lag1')

X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

gbreg = GradientBoostingRegressor(loss='huber',random_state=42)
gbreg.fit(X_train_sc,y_train)
print(gbreg.score(X_train_sc,y_train))
print(gbreg.score(X_test_sc,y_test))

0.5522876038194138
0.514953850134294


## loss function = 'lad'

In [0]:
estimator = GradientBoostingRegressor(loss='lad',random_state=42)
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_full_train_sc, y_full_train)
print(selector.ranking_)

[1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1]


### Use selected features

In [0]:
selected_features = features.copy()
selected_features.remove('hlr_lag4')

X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

gbreg = GradientBoostingRegressor(loss='lad',random_state=42)
gbreg.fit(X_full_train_sc,y_full_train)
print(gbreg.score(X_full_train_sc,y_full_train))
print(gbreg.score(X_full_test_sc,y_full_test))

0.46471246469893757
0.4548242424453777


## loss function = 'quantile'

In [0]:
estimator = GradientBoostingRegressor(loss='quantile',random_state=42)
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_full_train_sc, y_full_train)
print(selector.ranking_)

[1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [0]:
selected_features = features.copy()
selected_features.remove('hlr_lag4')

X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

gbreg = GradientBoostingRegressor(loss='quantile',random_state=42)
gbreg.fit(X_train_sc,y_train)
print(gbreg.score(X_train_sc,y_train))
print(gbreg.score(X_test_sc,y_test))

(3260, 17) (1087, 17)
(3260,) (1087,)
0.12713059595099296
0.05252308879937795


#RFECV - Support Vector Regression

In [0]:
estmator = SVR()
selector = RFECV(estimator, step=1, cv=3) # 3 folds for CV, selector will check all num of features
selector = selector.fit(X_full_train_sc, y_full_train)
print(selector.ranking_)

[1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [0]:
selected_features = features.copy()
selected_features.remove('hlr_lag4')

X = df[selected_features]
y = df['auwgr']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X['classification'], random_state=42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

svreg = SVR()
svreg.fit(X_train_sc,y_train)
print(svreg.score(X_train_sc,y_train))
print(svreg.score(X_test_sc,y_test))

(3260, 17) (1087, 17)
(3260,) (1087,)
0.5224360586271621
0.4767820216624766


# CLASSIFICATION ====================

- logistic reg, et, sv, gb, dt, ab, rf, knn

In [0]:
df = pd.read_csv('df_model_trimmed.csv')
df['classification'] = [1 if i<0 else 0 for i in df['auwgr']]
features = [col for col in df._get_numeric_data().columns if (col != 'auwgr') and (col != 'year') and (col != 'classification')]

In [11]:
X = df[features]
y = df['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape,X_test.shape)

(3260, 30) (1087, 30)


## Logistic Regression Classifier

In [12]:
params = [{'fit_intercept': [True, False],
           'C': [0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4],
           'tol': [0.00001,0.0001,0.001,0.01,0.1],
           'penalty': ['l2'],
          #  'penalty': ['l1','l2'],
           'max_iter': [5000],
          #  'solver': ['newton-cg','lbfgs','liblinear','sag','saga'] no go
           'solver': ['lbfgs','liblinear','sag','saga']
           }]

gs = GridSearchCV(LogisticRegression(), param_grid=params, cv=5)
gs.fit(X_train, y_train)
gs.best_params_

{'C': 1.0,
 'fit_intercept': True,
 'max_iter': 5000,
 'penalty': 'l2',
 'solver': 'sag',
 'tol': 0.1}

In [13]:
lr = LogisticRegression(C=1.0,fit_intercept=True,max_iter=5000,penalty='l2',solver='sag',tol=0.1)
lr.fit(X_train,y_train)
print('Score(train/test):',lr.score(X_train, y_train),',',lr.score(X_test, y_test))
# print('Score (test):\t',lr.score(X_test, y_test))
print('\n=== Classification Report  ================================')
# predict & evaluate
predictions = lr.predict(X_test)
print(classification_report(y_test,predictions,target_names=['Positive UWG (0)','Negative UWG (1)']))

Score(train/test): 0.7260736196319019 , 0.7203311867525299

                  precision    recall  f1-score   support

Positive UWG (0)       0.76      0.86      0.81       740
Negative UWG (1)       0.59      0.41      0.49       347

        accuracy                           0.72      1087
       macro avg       0.67      0.64      0.65      1087
    weighted avg       0.70      0.72      0.71      1087



## Extra Trees Classifier
---



### GridSearchCV #1

In [0]:
params = [{'n_estimators':[100,200,300],
               'max_depth':range(2,10),
               'min_samples_split':range(2,20),
               'oob_score':[True],
               'bootstrap':[True]
              }]

gs = GridSearchCV(ExtraTreesClassifier(), param_grid=params, cv=5)
gs.fit(X_train, y_train)
gs.best_params_

{'bootstrap': True,
 'max_depth': 9,
 'min_samples_split': 8,
 'n_estimators': 100,
 'oob_score': True}

In [0]:
etcf = ExtraTreesClassifier(criterion='gini',bootstrap=True,max_depth=9,min_samples_split=8,n_estimators=100,oob_score=True)
etcf.fit(X_train,y_train)
print('Score (train):\t',etcf.score(X_train, y_train))
print('Score (test):\t',etcf.score(X_test, y_test))
# predict & evaluate
predictions = etcf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print('TP:',tp,'\tTN:',tn)
print('FP:',fp,'\tFN:',fn)
print('F1:',f1_score(y_test, predictions))
print('Acc:',1-(fp+fn)/len(predictions))

Score (train):	 0.7414110429447853
Score (test):	 0.7056117755289788
TP: 719 	TN: 48
FP: 299 	FN: 21
F1: 0.8179749715585893
Acc: 0.7056117755289788


### GridSearchCV #2

In [0]:
params = [{'n_estimators':[50,100,150],
               'max_depth':range(9,14),
               'min_samples_split':range(5,15),
               'oob_score':[True],
               'bootstrap':[True]
              }]

gs = GridSearchCV(ExtraTreesClassifier(), param_grid=params, cv=5)
gs.fit(X_train, y_train)
gs.best_params_

{'bootstrap': True,
 'max_depth': 13,
 'min_samples_split': 10,
 'n_estimators': 100,
 'oob_score': True}

In [0]:
etcf = ExtraTreesClassifier(bootstrap=True,max_depth=13,min_samples_split=10,n_estimators=100,oob_score=True)
etcf.fit(X_train,y_train)
print('Score (train):\t',etcf.score(X_train, y_train))
print('Score (test):\t',etcf.score(X_test, y_test))
# predict & evaluate
predictions = etcf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print('TP:',tp,'\tTN:',tn)
print('FP:',fp,'\tFN:',fn)
print('F1:',f1_score(y_test, predictions))
print('Acc:',1-(fp+fn)/len(predictions))

Score (train):	 0.7797546012269939
Score (test):	 0.7295308187672493
TP: 703 	TN: 90
FP: 257 	FN: 37
F1: 0.8270588235294118
Acc: 0.7295308187672493


### GridSearchCV #3

In [0]:
params = [{'n_estimators':[100,150,200],
               'max_depth':range(12,30),
               'min_samples_split':range(5,30),
               'oob_score':[True],
               'bootstrap':[True]
              }]

gs = GridSearchCV(ExtraTreesClassifier(), param_grid=params, cv=5)
gs.fit(X_train, y_train)
gs.best_params_

{'bootstrap': True,
 'max_depth': 24,
 'min_samples_split': 21,
 'n_estimators': 100,
 'oob_score': True}

In [0]:
etcf = ExtraTreesClassifier(bootstrap=True,max_depth=24,min_samples_split=21,n_estimators=100,oob_score=True)
etcf.fit(X_train,y_train)
print('Score (train):\t',etcf.score(X_train, y_train))
print('Score (test):\t',etcf.score(X_test, y_test))
# predict & evaluate
predictions = etcf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print('TP:',tp,'\tTN:',tn)
print('FP:',fp,'\tFN:',fn)
print('F1:',f1_score(y_test, predictions))
print('Acc:',1-(fp+fn)/len(predictions))

Score (train):	 0.811963190184049
Score (test):	 0.7442502299908004
TP: 694 	TN: 115
FP: 232 	FN: 46
F1: 0.8331332533013205
Acc: 0.7442502299908004


### GridSearchCV #4

In [0]:
params = [{'n_estimators':[120,150],
               'max_depth':range(24,35),
               'min_samples_split':range(15,30),
               'oob_score':[True],
               'bootstrap':[True]
              }]

gs = GridSearchCV(ExtraTreesClassifier(), param_grid=params, cv=5)
gs.fit(X_train, y_train)
gs.best_params_

{'bootstrap': True,
 'max_depth': 31,
 'min_samples_split': 27,
 'n_estimators': 150,
 'oob_score': True}

In [0]:
etcf = ExtraTreesClassifier(bootstrap=True,max_depth=31,min_samples_split=27,n_estimators=150,oob_score=True)
etcf.fit(X_train,y_train)
print('Score (train):\t',etcf.score(X_train, y_train))
print('Score (test):\t',etcf.score(X_test, y_test))
# predict & evaluate
predictions = etcf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print('TP:',tp,'\tTN:',tn)
print('FP:',fp,'\tFN:',fn)
print('F1:',f1_score(y_test, predictions))
print('Acc:',1-(fp+fn)/len(predictions))

Score (train):	 0.8024539877300614
Score (test):	 0.7359705611775529
TP: 689 	TN: 111
FP: 236 	FN: 51
F1: 0.8276276276276276
Acc: 0.7359705611775529


In [0]:
score_list=[]
for i in np.arange(0,100,1):
  if i%50==0:
    print(i)
  else:
    print(i,end=' ')
  etcf = ExtraTreesClassifier(bootstrap=True,max_depth=13,min_samples_split=10,n_estimators=100,oob_score=True,random_state=i)
  etcf.fit(X_train,y_train)
  score_list.append(etcf.score(X_test, y_test))
print('\n\n',len(score_list),':',min(score_list),max(score_list))

0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 

 100 : 0.7240110395584176 0.7479300827966882


In [0]:
# predict & evaluate
predictions = etcf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print('TP:',tp,'\tTN:',tn)
print('FP:',fp,'\tFN:',fn)
print('F1:',f1_score(y_test, predictions))
print('Acc:',1-(fp+fn)/len(predictions))

TP: 707 	TN: 91
FP: 256 	FN: 33
F1: 0.8302994715208455
Acc: 0.734130634774609


## Support Vector Classifier


Kernels to evaluate:
- rbf
- poly (degree=2,3,4,5)
- sigmoid

In [0]:
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [0]:
params = [{'C':[0.3,0.5,0.9],
               'gamma':['auto'],
               'kernel':['rbf','poly','sigmoid'],
               'tol':[0.1,0.01,0.001],
               'max_iter':[10000]
              }]

gs = GridSearchCV(SVC(), param_grid=params, cv=5)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'C': 0.9, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': 10000, 'tol': 0.01}

In [53]:
svcf = SVC(C=0.9, kernel='rbf', gamma='auto', max_iter=10000, tol=0.01, random_state=42)
svcf.fit(X_train_sc,y_train)
# Evaluate Model
print(svcf.score(X_train_sc,y_train))
print(svcf.score(X_test_sc,y_test))

0.7662576687116565
0.7276908923643054


## Gradient Boosting Classifier

In [55]:
param_grid = {
    'n_estimators' : [100,150,200],
    'max_depth' : [2,3,4,5,6],
    'min_samples_leaf':[2,3,4],
    'learning_rate' : [0.03,0.05,0.07,0.1]
}
gs = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=3)
gs.fit(X_train_sc, y_train)
gs.best_params_

{'learning_rate': 0.05,
 'max_depth': 3,
 'min_samples_leaf': 2,
 'n_estimators': 100}

In [56]:
gbcf=GradientBoostingClassifier(n_estimators=100, max_depth=3, min_samples_leaf=2, learning_rate=0.05)
gbcf.fit(X_train_sc,y_train)
# Evaluate Model
print(gbcf.score(X_train_sc,y_train))
print(gbcf.score(X_test_sc,y_test))

0.7745398773006135
0.7194112235510579


## K Nearest Neighbors Classifier

In [67]:
param_grid = {
    'n_neighbors' : [19,20,21,22,23,24,25,26,27,28],
    'weights' : ['uniform','distance'],
    'algorithm':['ball_tree','kd_tree','brute']
}
gs = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
gs.fit(X_train, y_train)
gs.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 19, 'weights': 'uniform'}

In [69]:
knncf = KNeighborsClassifier(n_neighbors=19,weights='uniform',algorithm='ball_tree')
knncf.fit(X_train,y_train)
# Evaluate model.
print('Score(train/test):',knncf.score(X_train, y_train),',',knncf.score(X_test, y_test))

Score(train/test): 0.7420245398773007 , 0.7212511499540019
