# Machine Learning

## Candidate Number: 178621 & Kaggle Team Name: Kristina Harper

### Code File 1

This notebook was used to generate models and predictions, both with the original training data and with the additional training data, using confidence measures and proportional sample weighting. 

Grid search was run here to optimize hyperparameters. Because running grid search over a pipeline is expensive and time-consuming, hyperparameter optimization was carried out as multiple separate runs of the grid search function, keeping the hyperparameter space explored under cross-validation in each run to a minimum. Not all of the grid search outputs were saved, but an example can be found below. 

In [1]:
# Packages for analysis
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Packages for visuals
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

# Allows charts to appear in the notebook
%matplotlib inline

# Pickle package
import pickle
import collections
from collections import defaultdict,Counter
import csv
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import KFold
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import ShuffleSplit

In [28]:
# joblib is used at the end of the notebook to persist the fitted scaler, PCA and SVM.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23,
# so prefer the standalone package and only fall back to the legacy location on
# old installations where `joblib` is not importable on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import pickle

In [2]:
# Read the training set and the test set from their csv files into dataframes
trainData, testData = (
    pd.read_csv(csvName) for csvName in ('training.csv', 'testing.csv')
)

In [3]:
# Load the per-sample annotation confidence and the additional training data.
# NOTE(review): an earlier experiment sliced confidenceMeasures[0:456] for the original
# training rows, so the confidence file presumably lists the original training samples
# first, followed by the additional ones -- confirm against the data description.
confidenceMeasures = pd.read_csv('annotation_confidence.csv')
additionalTraining = pd.read_csv('additional_training.csv')

# Stack the additional samples underneath the original training samples.
# pd.concat replaces DataFrame.append (removed in pandas 2.0). ignore_index=True gives
# every row a unique 0..n-1 label; without it both frames keep their own 0-based labels,
# and any later label-aligned column assignment would hand the additional rows the
# values belonging to the first rows of the original training set.
allData = pd.concat([trainData, additionalTraining], ignore_index=True)
allData

Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
0,1,0.449180,0.000000,0.000000,0.00000,0.000000,0.110520,1.317000,0.000000,1.434300,...,0.005435,0.002507,0.037413,0.047131,0.004175,0.003744,0.037672,0.071448,0.013265,1
1,2,0.000000,0.000000,0.000000,0.00000,0.000000,0.631840,0.000000,1.838800,0.495900,...,0.033147,0.008112,0.004126,0.014677,0.048980,0.011394,0.012629,0.033668,0.048248,1
2,3,0.000000,0.000000,0.000000,0.28339,0.000000,0.749490,0.062996,1.600200,0.740900,...,0.023006,0.025229,0.043951,0.032144,0.018992,0.033419,0.030518,0.022346,0.014875,1
3,4,0.311980,0.244520,0.212100,0.97855,0.000000,1.319800,0.000000,0.000000,0.000000,...,0.033687,0.066910,0.036916,0.029357,0.017351,0.020543,0.015300,0.016477,0.019715,0
4,5,0.000000,0.285600,0.067485,0.00000,0.441400,0.000000,0.539600,0.000000,0.091620,...,0.014306,0.023978,0.019834,0.029528,0.029826,0.027222,0.032496,0.026370,0.028569,0
5,6,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,2.309600,0.812570,0.000000,...,0.006797,0.030276,0.037172,0.019828,0.010732,0.053016,0.041817,0.012208,0.007540,1
6,7,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.488860,0.826320,0.368780,...,0.027909,0.024642,0.011153,0.022865,0.016119,0.028122,0.017835,0.017970,0.016795,1
7,8,0.000000,0.000000,0.032417,0.00000,0.132620,0.531190,0.441750,0.000000,1.615700,...,0.008367,0.009989,0.036080,0.026765,0.009598,0.006808,0.033180,0.026256,0.017007,1
8,9,0.233590,0.000000,0.000000,0.00000,0.000000,0.000000,0.130800,0.000000,0.000000,...,0.041533,0.004420,0.053275,0.045695,0.015790,0.001562,0.025589,0.048539,0.027832,0
9,10,0.267510,0.000000,0.000000,0.61004,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.025733,0.025188,0.027871,0.027146,0.028338,0.021368,0.042870,0.038707,0.017392,1


### Imputation of Additional Data

In [4]:
# Column-wise means of the combined data, used as the fill value for missing entries
meanData = allData.mean(axis=0)

# Replace every missing value with the mean of its own column
allData = allData.fillna(value=meanData)

# Display the imputed dataframe
allData

Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
0,1,0.449180,0.000000,0.000000,0.000000,0.000000,0.110520,1.317000,0.000000,1.434300,...,0.005435,0.002507,0.037413,0.047131,0.004175,0.003744,0.037672,0.071448,0.013265,1
1,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.631840,0.000000,1.838800,0.495900,...,0.033147,0.008112,0.004126,0.014677,0.048980,0.011394,0.012629,0.033668,0.048248,1
2,3,0.000000,0.000000,0.000000,0.283390,0.000000,0.749490,0.062996,1.600200,0.740900,...,0.023006,0.025229,0.043951,0.032144,0.018992,0.033419,0.030518,0.022346,0.014875,1
3,4,0.311980,0.244520,0.212100,0.978550,0.000000,1.319800,0.000000,0.000000,0.000000,...,0.033687,0.066910,0.036916,0.029357,0.017351,0.020543,0.015300,0.016477,0.019715,0
4,5,0.000000,0.285600,0.067485,0.000000,0.441400,0.000000,0.539600,0.000000,0.091620,...,0.014306,0.023978,0.019834,0.029528,0.029826,0.027222,0.032496,0.026370,0.028569,0
5,6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.309600,0.812570,0.000000,...,0.006797,0.030276,0.037172,0.019828,0.010732,0.053016,0.041817,0.012208,0.007540,1
6,7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.488860,0.826320,0.368780,...,0.027909,0.024642,0.011153,0.022865,0.016119,0.028122,0.017835,0.017970,0.016795,1
7,8,0.000000,0.000000,0.032417,0.000000,0.132620,0.531190,0.441750,0.000000,1.615700,...,0.008367,0.009989,0.036080,0.026765,0.009598,0.006808,0.033180,0.026256,0.017007,1
8,9,0.233590,0.000000,0.000000,0.000000,0.000000,0.000000,0.130800,0.000000,0.000000,...,0.041533,0.004420,0.053275,0.045695,0.015790,0.001562,0.025589,0.048539,0.027832,0
9,10,0.267510,0.000000,0.000000,0.610040,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.025733,0.025188,0.027871,0.027146,0.028338,0.021368,0.042870,0.038707,0.017392,1


In [5]:
# Add the confidence label to the combined dataframe so that the subsequent
# train/validation split keeps each sample's confidence alongside its features.
#
# The values are attached by row position rather than by index label: a plain
# `allData['confidence'] = confidenceMeasures['confidence']` aligns on labels, which
# silently assigns the wrong confidence whenever allData's index is not a unique
# 0..n-1 range (e.g. after stacking two frames that each start at label 0).
# assumes annotation_confidence.csv is in the same row order as allData -- TODO confirm
if len(confidenceMeasures) != len(allData):
    raise ValueError(
        'confidenceMeasures has %d rows but allData has %d rows; '
        'cannot attach confidence values by position'
        % (len(confidenceMeasures), len(allData))
    )

allData['confidence'] = confidenceMeasures['confidence'].values
allData

Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction,confidence
0,1,0.449180,0.000000,0.000000,0.000000,0.000000,0.110520,1.317000,0.000000,1.434300,...,0.002507,0.037413,0.047131,0.004175,0.003744,0.037672,0.071448,0.013265,1,1.00
1,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.631840,0.000000,1.838800,0.495900,...,0.008112,0.004126,0.014677,0.048980,0.011394,0.012629,0.033668,0.048248,1,0.66
2,3,0.000000,0.000000,0.000000,0.283390,0.000000,0.749490,0.062996,1.600200,0.740900,...,0.025229,0.043951,0.032144,0.018992,0.033419,0.030518,0.022346,0.014875,1,1.00
3,4,0.311980,0.244520,0.212100,0.978550,0.000000,1.319800,0.000000,0.000000,0.000000,...,0.066910,0.036916,0.029357,0.017351,0.020543,0.015300,0.016477,0.019715,0,0.66
4,5,0.000000,0.285600,0.067485,0.000000,0.441400,0.000000,0.539600,0.000000,0.091620,...,0.023978,0.019834,0.029528,0.029826,0.027222,0.032496,0.026370,0.028569,0,1.00
5,6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.309600,0.812570,0.000000,...,0.030276,0.037172,0.019828,0.010732,0.053016,0.041817,0.012208,0.007540,1,0.66
6,7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.488860,0.826320,0.368780,...,0.024642,0.011153,0.022865,0.016119,0.028122,0.017835,0.017970,0.016795,1,0.66
7,8,0.000000,0.000000,0.032417,0.000000,0.132620,0.531190,0.441750,0.000000,1.615700,...,0.009989,0.036080,0.026765,0.009598,0.006808,0.033180,0.026256,0.017007,1,1.00
8,9,0.233590,0.000000,0.000000,0.000000,0.000000,0.000000,0.130800,0.000000,0.000000,...,0.004420,0.053275,0.045695,0.015790,0.001562,0.025589,0.048539,0.027832,0,0.66
9,10,0.267510,0.000000,0.000000,0.610040,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.025188,0.027871,0.027146,0.028338,0.021368,0.042870,0.038707,0.017392,1,1.00


In [6]:
# Pull the confidence values and the prediction labels out into their own series,
# then remove both columns so that only the feature columns remain.
# These are the inputs used for grid search and hyperparameter optimization.

allConf = confidenceMeasures['confidence']
allPred = allData['prediction']

# Drop both non-feature columns in a single call
allDataFeats = allData.drop(['prediction', 'confidence'], axis=1)
allDataFeats

Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,1,0.449180,0.000000,0.000000,0.000000,0.000000,0.110520,1.317000,0.000000,1.434300,...,0.024475,0.005435,0.002507,0.037413,0.047131,0.004175,0.003744,0.037672,0.071448,0.013265
1,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.631840,0.000000,1.838800,0.495900,...,0.027075,0.033147,0.008112,0.004126,0.014677,0.048980,0.011394,0.012629,0.033668,0.048248
2,3,0.000000,0.000000,0.000000,0.283390,0.000000,0.749490,0.062996,1.600200,0.740900,...,0.022123,0.023006,0.025229,0.043951,0.032144,0.018992,0.033419,0.030518,0.022346,0.014875
3,4,0.311980,0.244520,0.212100,0.978550,0.000000,1.319800,0.000000,0.000000,0.000000,...,0.064627,0.033687,0.066910,0.036916,0.029357,0.017351,0.020543,0.015300,0.016477,0.019715
4,5,0.000000,0.285600,0.067485,0.000000,0.441400,0.000000,0.539600,0.000000,0.091620,...,0.039439,0.014306,0.023978,0.019834,0.029528,0.029826,0.027222,0.032496,0.026370,0.028569
5,6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.309600,0.812570,0.000000,...,0.030265,0.006797,0.030276,0.037172,0.019828,0.010732,0.053016,0.041817,0.012208,0.007540
6,7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.488860,0.826320,0.368780,...,0.026708,0.027909,0.024642,0.011153,0.022865,0.016119,0.028122,0.017835,0.017970,0.016795
7,8,0.000000,0.000000,0.032417,0.000000,0.132620,0.531190,0.441750,0.000000,1.615700,...,0.020744,0.008367,0.009989,0.036080,0.026765,0.009598,0.006808,0.033180,0.026256,0.017007
8,9,0.233590,0.000000,0.000000,0.000000,0.000000,0.000000,0.130800,0.000000,0.000000,...,0.040261,0.041533,0.004420,0.053275,0.045695,0.015790,0.001562,0.025589,0.048539,0.027832
9,10,0.267510,0.000000,0.000000,0.610040,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.030685,0.025733,0.025188,0.027871,0.027146,0.028338,0.021368,0.042870,0.038707,0.017392


 ### Cross-validation set generation

In [7]:
# Create training and validation sets for the data - used for model training and validation.
# A fixed seed makes the split (and therefore every validation score computed from it)
# reproducible across kernel restarts; train_test_split shuffles randomly otherwise.
SPLIT_SEED = 42
Train, Val = train_test_split(allData, random_state=SPLIT_SEED)

# Isolate the target labels of each split
cvTrainPreds = Train.prediction
cvValPreds = Val.prediction

# Isolate the annotation confidence of each split (used as sample weights when fitting)
cvTrainConf = Train.confidence
cvValConf = Val.confidence

# Drop the label and confidence columns to generate feature-only train and validation frames
cvTrainFeatsOnly = Train.drop(['prediction', 'confidence'], axis=1)
cvValFeatsOnly = Val.drop(['prediction', 'confidence'], axis=1)

### Pre-processing

In [12]:
# Preprocessing steps: scaling, PCA.
# Both transformers are fitted on the training split only and then applied unchanged
# to the validation and test data.
scaler = StandardScaler()
scaler.fit(cvTrainFeatsOnly)

# Apply the fitted scaling to the training, validation and test sets
# assumes testData has exactly the same columns, in the same order, as
# cvTrainFeatsOnly -- TODO confirm
scaledTrainCV = scaler.transform(cvTrainFeatsOnly)
scaledValCV = scaler.transform(cvValFeatsOnly)
scaledTestData = scaler.transform(testData)

# Make a PCA model with the optimized number of components.
# With the default svd_solver='auto', PCA may select the randomized solver, whose
# result depends on the random state; fixing it keeps the projection (and the models
# trained on it) reproducible between runs.
PCA_SEED = 42
pca = PCA(n_components=200, random_state=PCA_SEED)

# Fit PCA on the scaled training set only
pca.fit(scaledTrainCV)

# The number of retained components is available as pca.n_components_.

# Apply the PCA mapping to the training, validation, and test sets
pcaTrainCV = pca.transform(scaledTrainCV)
pcaValCV = pca.transform(scaledValCV)
pcaTestCV = pca.transform(scaledTestData)

### Model Training, Validation, & Test Prediction generation

In [13]:
# Linear-kernel SVM: generates the highest Kaggle score, not the highest cv score.
# Samples are weighted by their annotation confidence during fitting.
# NOTE(review): per the scikit-learn docs gamma only applies to the 'rbf', 'poly' and
# 'sigmoid' kernels, so it should have no effect on this linear model.
model10 = svm.SVC(
    kernel='linear',
    C=10,
    gamma=0.0001,
    class_weight={0: 0.5714, 1: 0.4286},
    verbose=2,
)
model10.fit(pcaTrainCV, cvTrainPreds, sample_weight=cvTrainConf)

# Predict on the held-out validation split and report the mean accuracy
newPredictions = model10.predict(pcaValCV)
print('newPred: ', newPredictions)
print('Score: ', model10.score(pcaValCV, cvValPreds))

# Predict on the test set
testPredictions9 = model10.predict(pcaTestCV)
print(testPredictions9)

[LibSVM]newPred:  [0 1 1 ..., 1 0 0]
Score:  0.824561403509
[0 1 0 ..., 1 1 0]


In [23]:
# RBF-kernel SVM trained on all data: C=1, gamma=0.0001, on the 200-component PCA
# projection. Run this cell to replicate those results.
# Samples are weighted by their annotation confidence during fitting.
model11 = svm.SVC(
    kernel='rbf',
    C=1,
    gamma=0.0001,
    class_weight={0: 0.5714, 1: 0.4286},
    verbose=3,
)
model11.fit(pcaTrainCV, cvTrainPreds, sample_weight=cvTrainConf)

# Predict on the held-out validation split and report the mean accuracy
newPredictions = model11.predict(pcaValCV)
print('newPred: ', newPredictions)
print('Score: ', model11.score(pcaValCV, cvValPreds))

# Predict on the test set
testPredictions11 = model11.predict(pcaTestCV)
print(testPredictions11)

[LibSVM]newPred:  [0 1 1 ..., 1 0 0]
Score:  0.822807017544
[0 1 0 ..., 1 1 0]


In [24]:
# Export the RBF model's test-set predictions to a csv file for Kaggle upload.
predDF = pd.DataFrame(testPredictions11)
# Shift the row labels so that they start at 1 instead of 0
predDF.index += 1
predDF.to_csv('testPredictions11.csv')

### Grid Search using Pipeline for Hyper-parameter Optimization

In [9]:
# Chain preprocessing and the classifier so that scaling and PCA are re-fitted
# inside every cross-validation fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', svm.SVC()),
])

# Hyperparameter values to explore; keys follow the '<step>__<parameter>' convention
searchSpace = [
    dict(
        pca__n_components=[1, 2, 3, 100, 200],
        classifier__C=[1, 10],
        classifier__kernel=['linear', 'poly', 'sigmoid'],
        classifier__gamma=[0.0001, 0.001],
        classifier__class_weight=[{0: 0.5714, 1: 0.4286}],
    )
]

# Exhaustive grid search with 5-fold (stratified) cross-validation on all cores
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=searchSpace,
    cv=5,
    verbose=3,
    n_jobs=-1,
)

# Fit every candidate on the full feature set and keep the fitted search object
best_model = clf.fit(allDataFeats, allPred)
best_model

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 39.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'pca__n_components': [1, 2, 3, 100, 200], 'classifier__C': [1, 10], 'classifier__kernel': ['linear', 'poly', 'sigmoid'], 'classifier__gamma': [0.0001, 0.001], 'classifier__class_weight': [{0: 0.5714, 1: 0.4286}]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [10]:
# Get the best estimator found by the grid search: the scaler -> PCA -> SVC pipeline
# configured with the best-scoring hyperparameter combination.
best_model.best_estimator_

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=200, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classifier', SVC(C=10, cache_size=200, class_weight={0: 0.5714, 1: 0.4286}, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [11]:
# Get the hyperparameter combination selected by the grid search, keyed by the
# '<pipeline step>__<parameter>' names used in the search space.
best_model.best_params_

{'classifier__C': 10,
 'classifier__class_weight': {0: 0.5714, 1: 0.4286},
 'classifier__gamma': 0.0001,
 'classifier__kernel': 'linear',
 'pca__n_components': 200}

In [31]:
# Persist the fitted preprocessing objects and the trained RBF SVM so that the
# complete transformation + model chain can be restored for comprehensive testing later.
for fittedObject, pickleName in (
    (scaler, 'RBFModel11_scaler.pkl'),
    (pca, 'RBFModel11_pca.pkl'),
):
    joblib.dump(fittedObject, pickleName)

# The final dump is left as the cell's last expression so the written file name is displayed.
# To reload: joblib_model = joblib.load(joblib_file), then access parameters and run.
joblib.dump(model11, 'RBFModel11_SVMtrained.pkl')

['RBFModel11_SVMtrained.pkl']