In [320]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer

In [321]:
adult_df = pd.read_csv("adult.txt")
covtype_df = pd.read_csv("covtype.txt",names=list(range(0,55)))
letter_df = pd.read_csv("letter-recognition.txt",names=list(range(0,17))) 
#census_df = pd.read_csv("census-income.txt")

In [322]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [323]:
imp_mean = SimpleImputer(missing_values=" ?", strategy="most_frequent").fit(adult_df)
adult_df = pd.DataFrame(imp_mean.transform(adult_df), columns=adult_df.columns)

# convert prediction data into binary labels
adult_df['dat'] = np.where(adult_df['dat'] == ' >50K',1,0)
adult_df.sample(15)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,dat
19121,44,Self-emp-inc,56651,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,United-States,1
15723,36,Private,130200,HS-grad,9,Divorced,Machine-op-inspct,Unmarried,White,Male,0,0,40,United-States,0
18530,44,Private,310255,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,60,United-States,1
15057,23,Private,242375,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,30,United-States,0
27074,42,Local-gov,255847,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
17620,36,Private,129150,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,20,United-States,1
3340,24,Private,215443,HS-grad,9,Separated,Other-service,Not-in-family,White,Male,0,0,40,United-States,0
656,70,Private,167358,9th,5,Widowed,Prof-specialty,Unmarried,White,Female,1111,0,15,United-States,0
19686,27,Self-emp-not-inc,207948,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0
24438,33,Private,112900,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States,0


In [324]:
pred_column = covtype_df[54]
positive_label = pred_column.value_counts().index[0]

# convert prediction data into binary labels
pred_column = np.where(pred_column == positive_label, 1, 0)

covtype_df.drop(columns=[54],inplace=True)

In [325]:
scaler = MinMaxScaler().fit(covtype_df)
covtype_df = pd.DataFrame(scaler.transform(covtype_df),columns=covtype_df.columns)
covtype_df.insert(54,54,pred_column,True)
covtype_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
129222,0.36018,0.15,0.257576,0.150322,0.192506,0.168329,0.897638,0.799213,0.405512,0.054371,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
516196,0.596798,0.022222,0.287879,0.042949,0.25323,0.373612,0.76378,0.779528,0.53937,0.214415,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
528722,0.667334,0.519444,0.212121,0.5068,0.191214,0.119854,0.870079,0.980315,0.622047,0.291649,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
474551,0.622811,0.041667,0.106061,0.151754,0.251938,0.083041,0.84252,0.885827,0.582677,0.225986,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
387371,0.742371,0.469444,0.151515,0.125268,0.197674,0.414079,0.897638,0.96063,0.582677,0.086993,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
294730,0.588794,0.247222,0.333333,0.303508,0.135659,0.088942,0.972441,0.783465,0.279528,0.231842,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
143713,0.552276,0.263889,0.19697,0.173228,0.27907,0.517634,0.944882,0.870079,0.433071,0.288164,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
546575,0.730865,0.986111,0.121212,0.279885,0.31137,0.64929,0.814961,0.885827,0.61811,0.130211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
328923,0.262631,0.413889,0.378788,0.0,0.223514,0.073627,0.956693,0.905512,0.397638,0.179841,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
134430,0.522761,0.116667,0.287879,0.030064,0.242894,0.796965,0.866142,0.771654,0.413386,0.157396,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [326]:
letter_col,pred_column = letter_df[0], letter_df[16]
letter_df.drop(columns=[0,16],inplace=True)

In [327]:
scaler = MinMaxScaler().fit(letter_df)
letter_df = pd.DataFrame(scaler.transform(letter_df),columns=letter_df.columns)
letter_df.insert(0,0,letter_col,True)
letter_df.insert(16,16,pred_column,True)

# two ways to make binary classification labels
letter_df_p1 = np.where(letter_df[0] == 'O', 1, 0)
letter_df_p2 = np.where(letter_df[0].isin([chr(x) for x in range(ord('A'), ord('M') + 1)]), 1, 0)
letter_df.insert(17,17,letter_df_p1,True)
letter_df.insert(18,18,letter_df_p2,True)


letter_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
12578,Y,0.466667,0.6,0.466667,0.466667,0.266667,0.266667,0.6,0.133333,0.466667,0.666667,0.733333,0.4,0.133333,0.8,0.2,4,0,0
13621,C,0.2,0.333333,0.266667,0.2,0.133333,0.4,0.533333,0.466667,0.533333,0.533333,0.533333,0.866667,0.066667,0.6,0.266667,10,0,1
19586,E,0.133333,0.066667,0.133333,0.2,0.133333,0.466667,0.466667,0.333333,0.4,0.466667,0.4,0.6,0.133333,0.533333,0.333333,10,0,1
2500,A,0.133333,0.2,0.2,0.133333,0.066667,0.666667,0.133333,0.133333,0.066667,0.533333,0.133333,0.6,0.066667,0.4,0.066667,8,0,1
16327,N,0.2,0.4,0.266667,0.266667,0.133333,0.466667,0.466667,0.933333,0.133333,0.333333,0.4,0.533333,0.4,0.533333,0.0,8,0,0
14067,T,0.266667,0.466667,0.4,0.333333,0.4,0.466667,0.466667,0.266667,0.333333,0.466667,0.4,0.6,0.333333,0.533333,0.333333,7,0,0
19172,R,0.266667,0.533333,0.333333,0.4,0.4,0.466667,0.466667,0.466667,0.2,0.533333,0.333333,0.466667,0.266667,0.466667,0.466667,10,0,0
16323,X,0.333333,0.666667,0.4,0.533333,0.266667,0.466667,0.466667,0.266667,0.266667,0.466667,0.4,0.533333,0.2,0.533333,0.266667,8,0,0
14171,R,0.2,0.533333,0.266667,0.4,0.266667,0.333333,0.733333,0.466667,0.2,0.533333,0.266667,0.6,0.133333,0.4,0.333333,11,0,0
13800,N,0.266667,0.266667,0.333333,0.4,0.133333,0.466667,0.466667,0.933333,0.133333,0.266667,0.4,0.533333,0.4,0.533333,0.0,8,0,0


In [189]:
census_df.replace([' Not in universe', ' ?'], np.nan, inplace = True) 
census_df.dropna(axis = 1, inplace = True)

In [15]:
%%time
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


import warnings
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('classifier', RandomForestClassifier())])
search_space = [{'classifier': [LogisticRegression(solver='saga')],
                 'classifier__penalty': ['none','l1','l2'],
                 'classifier__C': np.logspace(-8, 4, 13)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1,2,6,12, 20]}]
# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=2), verbose=0)
# Fit grid search
best_model = clf.fit(covtype_df.iloc[:5000,:-1], covtype_df.iloc[:5000,-1])

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "


Wall time: 2min 6s


In [328]:
X, Y = covtype_df.iloc[:10000,:-1], covtype_df.iloc[:10000,-1]

In [329]:
%%time 
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# take all our penguin data, and reserve 50% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    train_size=0.5,
                                                    random_state=0,
                                                    stratify=Y)


# Initializing Classifiers
clf1 = LogisticRegression(solver='saga',
                          random_state=0)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = SVC(random_state=0)

clf4 = RandomForestClassifier(random_state=0)

clf5 = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)

# clf6 = OrthogonalMatchingPursuit()

# Building the pipelines
pipe1 = Pipeline([('classifier', clf1)])

pipe2 = Pipeline([('classifier', clf2)])

pipe3 = Pipeline([('classifier', clf3)])

pipe4 = Pipeline([('classifier', clf4)])

pipe5 = Pipeline([('classifier', clf5)])

# pipe6 = Pipeline([('classifier', clf6)])


# Setting up the parameter grids
param_grid1 = [{'classifier__penalty': ['none', 'l1', 'l2'],
                'classifier__C': np.logspace(-8, 4, 13)}]

param_grid2 = [{'classifier__n_neighbors': np.arange(1, 100, 10)}]
#                 'classifier__p': [1, 2]}]

param_grid3 = [{'classifier__kernel': ['rbf'],
                'classifier__C': np.power(10., np.arange(-4, 4)),
                'classifier__gamma': np.power(10., np.arange(-5, 0))},
               {'classifier__kernel': ['linear'],                
                'classifier__C': np.power(10., np.arange(-4, 4))}]

param_grid4 = [{'classifier__n_estimators': [1024],
                'classifier__max_features': [1,2,4,6,8,12,16,20]}]

param_grid5 = [{'classifier__C': np.logspace(-8,4,13),
               'classifier__loss': ['hinge', 'squared_hinge']}]

# param_grid6 = [{'classifier__n_nonzero_coefs': [.01, .05, .1, .2, .6]}]

# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                            (pipe1, pipe2, pipe3, pipe4, pipe5),
                            ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=2, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

Wall time: 23.9 ms


In [110]:
?accuracy_score

In [102]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you
warnings.filterwarnings('ignore')


cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

    #FYI: This code uses X_train.iloc[... ] instead of X_train[...] because the 
    # penguin data is in a Dataframe instead of a numpy matrix


outer fold 1/5 | tuning KNN      | inner ACC 88.85% | outer ACC 88.70%
outer fold 1/5 | tuning Logistic | inner ACC 87.78% | outer ACC 86.80%
outer fold 1/5 | tuning PAC      | inner ACC 87.15% | outer ACC 86.50%
outer fold 1/5 | tuning RF       | inner ACC 90.62% | outer ACC 90.70%
outer fold 1/5 | tuning SVM      | inner ACC 88.70% | outer ACC 88.20%
outer fold 2/5 | tuning KNN      | inner ACC 88.15% | outer ACC 88.90%
outer fold 2/5 | tuning Logistic | inner ACC 87.48% | outer ACC 87.40%
outer fold 2/5 | tuning PAC      | inner ACC 87.13% | outer ACC 87.30%
outer fold 2/5 | tuning RF       | inner ACC 90.72% | outer ACC 90.70%
outer fold 2/5 | tuning SVM      | inner ACC 88.95% | outer ACC 89.30%
outer fold 3/5 | tuning KNN      | inner ACC 88.22% | outer ACC 88.80%
outer fold 3/5 | tuning Logistic | inner ACC 87.28% | outer ACC 88.00%
outer fold 3/5 | tuning PAC      | inner ACC 87.05% | outer ACC 87.30%
outer fold 3/5 | tuning RF       | inner ACC 89.85% | outer ACC 92.10%
outer 

In [103]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

Logistic | outer CV acc. 87.40% +\- 0.400
KNN      | outer CV acc. 89.12% +\- 0.471
SVM      | outer CV acc. 88.94% +\- 0.441
RF       | outer CV acc. 91.00% +\- 0.559
PAC      | outer CV acc. 87.00% +\- 0.310

Logistic best parameters {'classifier__C': 100.0, 'classifier__penalty': 'l1'}
KNN best parameters {'classifier__n_neighbors': 11}
SVM best parameters {'classifier__C': 1000.0, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
RF best parameters {'classifier__max_features': 20, 'classifier__n_estimators': 1024}
PAC best parameters {'classifier__C': 0.1, 'classifier__loss': 'hinge'}


In [104]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Accuracy 88.62% (average over CV test folds)
Best Parameters: {'classifier__C': 1000.0, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
Training Accuracy: 100.00%
Test Accuracy: 89.94%


In [105]:
X, Y = letter_df.iloc[:10000,1:-2], letter_df.iloc[:10000,-2]

In [106]:
%%time 
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# take all our penguin data, and reserve 50% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    train_size=0.5,
                                                    random_state=0,
                                                    stratify=Y)


# Initializing Classifiers
clf1 = LogisticRegression(solver='saga',
                          random_state=0)

clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = SVC(random_state=0)

clf4 = RandomForestClassifier(random_state=0)

clf5 = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)

# clf6 = OrthogonalMatchingPursuit()

# Building the pipelines
pipe1 = Pipeline([('classifier', clf1)])

pipe2 = Pipeline([('classifier', clf2)])

pipe3 = Pipeline([('classifier', clf3)])

pipe4 = Pipeline([('classifier', clf4)])

pipe5 = Pipeline([('classifier', clf5)])

# pipe6 = Pipeline([('classifier', clf6)])


# Setting up the parameter grids
param_grid1 = [{'classifier__penalty': ['none', 'l1', 'l2'],
                'classifier__C': np.logspace(-8, 4, 13)}]

param_grid2 = [{'classifier__n_neighbors': np.arange(1, 100, 10)}]
#                 'classifier__p': [1, 2]}]

param_grid3 = [{'classifier__kernel': ['rbf'],
                'classifier__C': np.power(10., np.arange(-4, 4)),
                'classifier__gamma': np.power(10., np.arange(-5, 0))},
               {'classifier__kernel': ['linear'],                
                'classifier__C': np.power(10., np.arange(-4, 4))}]

param_grid4 = [{'classifier__n_estimators': [1024],
                'classifier__max_features': [1,2,4,6,8,12,16,20]}]

param_grid5 = [{'classifier__C': np.logspace(-8,4,13),
               'classifier__loss': ['hinge', 'squared_hinge']}]

# param_grid6 = [{'classifier__n_nonzero_coefs': [.01, .05, .1, .2, .6]}]

# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                            (pipe1, pipe2, pipe3, pipe4, pipe5),
                            ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=2, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

Wall time: 17.9 ms


In [107]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you
warnings.filterwarnings('ignore')


cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

    #FYI: This code uses X_train.iloc[... ] instead of X_train[...] because the 
    # penguin data is in a Dataframe instead of a numpy matrix


outer fold 1/5 | tuning KNN      | inner ACC 97.78% | outer ACC 98.60%
outer fold 1/5 | tuning Logistic | inner ACC 96.20% | outer ACC 96.20%
outer fold 1/5 | tuning PAC      | inner ACC 96.20% | outer ACC 96.20%
outer fold 1/5 | tuning RF       | inner ACC 98.38% | outer ACC 98.90%
outer fold 1/5 | tuning SVM      | inner ACC 97.97% | outer ACC 98.60%
outer fold 2/5 | tuning KNN      | inner ACC 98.23% | outer ACC 98.60%
outer fold 2/5 | tuning Logistic | inner ACC 96.20% | outer ACC 96.20%
outer fold 2/5 | tuning PAC      | inner ACC 96.20% | outer ACC 96.20%
outer fold 2/5 | tuning RF       | inner ACC 98.40% | outer ACC 98.50%
outer fold 2/5 | tuning SVM      | inner ACC 98.50% | outer ACC 98.10%
outer fold 3/5 | tuning KNN      | inner ACC 97.85% | outer ACC 98.30%
outer fold 3/5 | tuning Logistic | inner ACC 96.20% | outer ACC 96.20%
outer fold 3/5 | tuning PAC      | inner ACC 96.20% | outer ACC 96.20%
outer fold 3/5 | tuning RF       | inner ACC 98.38% | outer ACC 98.30%
outer 

In [108]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

Logistic | outer CV acc. 96.20% +\- 0.000
KNN      | outer CV acc. 98.68% +\- 0.264
SVM      | outer CV acc. 98.48% +\- 0.204
RF       | outer CV acc. 98.56% +\- 0.215
PAC      | outer CV acc. 96.20% +\- 0.000

Logistic best parameters {'classifier__C': 1e-08, 'classifier__penalty': 'none'}
KNN best parameters {'classifier__n_neighbors': 1}
SVM best parameters {'classifier__C': 1000.0, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
RF best parameters {'classifier__max_features': 6, 'classifier__n_estimators': 1024}
PAC best parameters {'classifier__C': 1e-08, 'classifier__loss': 'hinge'}


In [109]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Accuracy 98.14% (average over CV test folds)
Best Parameters: {'classifier__C': 1000.0, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
Training Accuracy: 100.00%
Test Accuracy: 98.58%


In [111]:
best_algo.cv_results_

{'mean_fit_time': array([0.00999248, 0.00897574, 0.00997722, 0.00846028, 0.01097262,
        0.01098752, 0.00797904, 0.00797808, 0.01346684, 0.00997305]),
 'std_fit_time': array([1.98352337e-03, 9.96589661e-04, 1.54972076e-06, 5.14507294e-04,
        1.99687481e-03, 9.82999802e-04, 9.96708870e-04, 1.19209290e-07,
        3.46350670e-03, 9.97543335e-04]),
 'mean_score_time': array([0.16227698, 0.26413703, 0.28124559, 0.25383055, 0.24334741,
        0.2228924 , 0.30595279, 0.27277231, 0.36454248, 0.28488588]),
 'std_score_time': array([0.0022912 , 0.00365424, 0.04188406, 0.01446545, 0.01396132,
        0.00147712, 0.05162477, 0.00349283, 0.03889847, 0.01658368]),
 'param_classifier__n_neighbors': masked_array(data=[1, 11, 21, 31, 41, 51, 61, 71, 81, 91],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__n_neighbors': 1},
  {'classifier__n_neighbors': 

In [22]:
len([covtype_df.iloc[5000:,:-1], covtype_df.iloc[5000:,-1]])

2

In [37]:
a = best_model.best_params_['classifier'].fit(covtype_df.iloc[:5000,:-1], covtype_df.iloc[:5000,-1]).predict(covtype_df.iloc[5000:,:-1])

In [35]:
np.mean(a)

0.0

In [16]:
best_model.best_params_

AttributeError: 'dict' object has no attribute 'predict'

In [9]:
# parameters = {'C':[1],'penalty':['l2']}
X = covtype_df.drop(columns=[0])
y = np.where(covtype_df[16] == 2,1,0)
log_reg = LogisticRegression()
clf = GridSearchCV(estimator=log_reg,param_grid=parameters)
clf.fit(X,y)
print(clf.score(X))

NameError: name 'parameters' is not defined

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

X = adult_df[['workclass', 'marital-status', 'occupation', 'relationship']]
y = np.where(adult_df.iloc[:,-1] == ' >50K',1,0)

column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),
                                        ['workclass', 'marital-status', 'occupation']),
                                      (OrdinalEncoder(), ['relationship']),
                                      remainder='drop')
logreg = LogisticRegression()
pipe = make_pipeline(column_trans, logreg)
cross_val_score(pipe, X, y, cv=10, scoring='f1')


In [None]:
for i in range 3: 
    run_classifiers()

In [None]:
def run_classifiers(data):
    iterations = 3
    for i in range(iterations):
        # draw 5k samples for training data, and set aside the rest for testing 
        X_train, Y_train, X_test, Y_test = draw_samples(data)
        # returns the gridsearchCV model list thing
        gridcvs = create_gridsearch()
        
        select_best_and_fit()
        
        get_statistics()
        
        send_brain_to_file()

In [330]:
def draw_samples(data, n = 5000):
    train_index = random.sample(range(0,len(data)), n)
    # assumes target column is last column
    X_train, Y_train = data.iloc[train_index, :-1], data.iloc[train_index, -1]
    
    test = data[~data.index.isin(train_index)]
    X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

    return X_train, Y_train, X_test, Y_test

In [333]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
170611,0.521761,0.022222,0.196970,0.173228,0.193798,0.169594,0.803150,0.838583,0.570866,0.226683,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
518303,0.738369,0.352778,0.333333,0.430208,0.397933,0.060419,0.984252,0.866142,0.338583,0.043636,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
60556,0.569785,0.650000,0.090909,0.464567,0.355297,0.840382,0.826772,0.968504,0.681102,0.890004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213381,0.699350,0.583333,0.151515,0.242663,0.262274,0.492483,0.834646,0.984252,0.677165,0.251080,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243759,0.577289,0.213889,0.136364,0.095920,0.266150,0.160461,0.913386,0.877953,0.480315,0.289000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194480,0.679840,0.625000,0.060606,0.088762,0.220930,0.844457,0.842520,0.956693,0.653543,0.213300,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
434244,0.556278,0.641667,0.348485,0.042949,0.249354,0.206688,0.700787,1.000000,0.807087,0.147358,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
464199,0.645823,0.086111,0.212121,0.413028,0.395349,0.238022,0.850394,0.822835,0.496063,0.363446,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
455444,0.735368,0.855556,0.287879,0.060845,0.255814,0.496979,0.645669,0.881890,0.783465,0.230029,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [331]:
X_train, Y_train, X_test, Y_test = draw_samples(covtype_df)

In [359]:
def create_gridsearch():
    scoring = {'accuracy' : 'accuracy',
               'f1' : 'f1', 
               'roc_auc' : 'roc_auc',
              }#            'MCC' : make_scorer(matthews_corrcoef)}
    
    # Initializing Classifiers
    clf1 = LogisticRegression(solver='saga',
                              random_state=0)

    clf2 = KNeighborsClassifier(algorithm='ball_tree',
                                leaf_size=50)
    clf3 = SVC(random_state=0)

    clf4 = RandomForestClassifier(random_state=0)

    clf5 = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)


    # Building the pipelines
    pipe1 = Pipeline([('classifier', clf1)])

    pipe2 = Pipeline([('classifier', clf2)])

    pipe3 = Pipeline([('classifier', clf3)])

    pipe4 = Pipeline([('classifier', clf4)])

    pipe5 = Pipeline([('classifier', clf5)])



    # Setting up the parameter grids
    param_grid1 = [{'classifier__penalty': ['none', 'l1', 'l2'],
                    'classifier__C': np.logspace(-8, 4, 13)}]

    param_grid2 = [{'classifier__n_neighbors': np.geomspace(1, 500, num=25, dtype=int),
                    'classifier__weights': ['uniform', 'distance']}]

    param_grid3 = [{'classifier__kernel': ['rbf'],
                    'classifier__C': np.power(10., np.arange(-7, 4)),
                    'classifier__gamma': [0.001,0.005,0.01,0.05,0.1,0.5,1,2]},
                   {'classifier__kernel': ['linear'],                
                    'classifier__C': np.power(10., np.arange(-7, 4))},
                   {'classifier__kernel': ['polynomial'],
                    'classifier__degree': [2,3],
                    'classifier__C': np.power(10., np.arange(-7, 4))}]

    param_grid4 = [{'classifier__n_estimators': [1024],
                    'classifier__max_features': [1,2,4,6,8,12,16, 20]}]

    param_grid5 = [{'classifier__C': np.logspace(-8,4,13),
                    'classifier__loss': ['hinge', 'squared_hinge']}]


    # Setting up multiple GridSearchCV objects, 1 for each algorithm
    gridcvs = {}

    for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                                (pipe1, pipe2, pipe3, pipe4, pipe5),
                                ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):
        gcv = GridSearchCV(estimator=est,
                           param_grid=pgrid,
                           scoring='accuracy', #scoring
                           n_jobs=1,
                           cv=5, 
                           verbose=0,
                           refit=True)
        gridcvs[name] = gcv
    return gridcvs

In [None]:
zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                                (pipe1, pipe2, pipe3, pipe4, pipe5),
                                ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):

In [360]:
gridcvs = create_gridsearch()

PAC


In [None]:
appapppoopoo

In [None]:
appapppoopoo = run_gridsearch(gridcvs, X_train, Y_train)

outer fold 1/5 | tuning PAC      | inner ACC 85.40% | outer ACC 85.30%

outer fold 2/5 | tuning PAC      | inner ACC 85.38% | outer ACC 85.40%

outer fold 3/5 | tuning PAC      | inner ACC 85.38% | outer ACC 85.40%

outer fold 4/5 | tuning PAC      | inner ACC 85.38% | outer ACC 85.40%

outer fold 5/5 | tuning PAC     

In [247]:
for name, gs_est in sorted(gridcvs.items()):
    print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

    # The inner loop for hyperparameter tuning
    gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
    y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
    acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
    print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
          (gs_est.best_score_ * 100, acc * 100))
    cv_scores[name].append(acc)
    print()

In [279]:
def run_gridsearch(gridcvs, X_train, Y_train):
    warnings.filterwarnings('ignore')
    
    cv_scores = {name: [] for name, gs_est in gridcvs.items()}

    skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

    # The outer loop for algorithm selection
    c = 1
    for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
        for name, gs_est in sorted(gridcvs.items()):
            print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

            # The inner loop for hyperparameter tuning
            gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
            y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
            acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
            print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
                  (gs_est.best_score_ * 100, acc * 100))
            cv_scores[name].append(acc)

        c += 1
    return cv_scores

In [289]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [291]:
gridcvs.items()

dict_items([('P', GridSearchCV(cv=5, estimator=PassiveAggressiveClassifier(random_state=0),
             n_jobs=1,
             param_grid={'classifier__C': array([1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01,
       1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04]),
                         'classifier__loss': ['hinge', 'squared_hinge']},
             refit=False, scoring='accuracy'))])

In [290]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you
warnings.filterwarnings('ignore')


cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

    #FYI: This code uses X_train.iloc[... ] instead of X_train[...] because the 
    # penguin data is in a Dataframe instead of a numpy matrix


outer fold 1/5 | tuning P       

ValueError: Invalid parameter classifier for estimator PassiveAggressiveClassifier(random_state=0). Check the list of available parameters with `estimator.get_params().keys()`.

In [215]:
?GridSearchCV