# Model optimization lads

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV


# Read the glass dataset from disk.
data = pd.read_csv('./data/glass.csv')

# Every column except 'Type' is a predictor; 'Type' is the target.
features = data.columns.tolist()
features.remove('Type')
data_x = data.loc[:, features]
data_y = data.loc[:, 'Type']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.3,
    random_state=4,
)

## 1. Basic cross-validation with SVM

In [2]:
# Support-vector classifier evaluated by the cross-validation examples.
mod = svm.SVC(C=2.5)

# One of the three major CV approaches illustrated in this notebook: shuffled
# 5-fold splitting with a fixed seed. Scoring is accuracy or macro-averaged F1
# depending on the example; this one uses F1 macro.
k_fold = KFold(n_splits=5, shuffle=True, random_state=4)
k_fold_scores = cross_val_score(
    mod,
    data_x,
    data_y,
    scoring='f1_macro',
    cv=k_fold,
)
print(f'CV Scores (F1 Macro for K-Fold): {k_fold_scores}')

CV Scores (F1 Macro for K-Fold): [0.77591398 0.66248956 0.47710502 0.62471805 0.62903226]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [7]:
# Leave-one-out CV: every fold tests on a single held-out sample, so each
# per-fold accuracy is either 0 or 1 and only the mean is reported.
loo = LeaveOneOut()
# Accuracy is what a classifier's default scorer computes; it is spelled out
# here so the metric is explicit, as in the other CV examples.
loo_scores = cross_val_score(mod, data_x, data_y, cv=loo, scoring='accuracy')
print('CV Scores (Leave-one-out): ' + str(np.mean(loo_scores)))













CV Scores (Leave-one-out): 0.7102803738317757




In [9]:
# Shuffle-split CV: 10 independent random 80/20 train/test partitions, scored
# by accuracy. random_state is fixed so the printed scores are reproducible,
# matching the seed used for the K-fold splitter and the train/test split.
shuf = ShuffleSplit(n_splits=10, test_size=0.2, train_size=0.8, random_state=4)
ss_scores = cross_val_score(mod, data_x, data_y, cv=shuf, scoring='accuracy')
print('CV Scores (Shuffle Split): ' + str(ss_scores))

CV Scores (Shuffle Split): [0.65116279 0.74418605 0.76744186 0.62790698 0.60465116 0.74418605
 0.6744186  0.58139535 0.76744186 0.6744186 ]




In [17]:
# Grid search and cross-validation with a random forest.
# Candidate hyperparameters: number of trees and maximum tree depth
# (None leaves the depth unlimited).
param_grid = {'n_estimators': [5, 10, 50, 100], 'max_depth': [3, 6, None]}

# Construct the search object: each parameter combination is scored by
# 5-fold cross-validated accuracy. The forest is seeded so that the selected
# model and the reported score are reproducible across runs.
optimized_rf = GridSearchCV(
    ensemble.RandomForestClassifier(random_state=4),
    param_grid,
    cv=5,
    scoring='accuracy',
)

# Fit on the training split only to find the best model.
optimized_rf.fit(x_train, y_train)

# Evaluate the selected model on the held-out test split.
print('Grid search test score (Random Forest): ' + str(optimized_rf.score(x_test, y_test)))

Grid search test score (Random Forest): 0.8769230769230769


