In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer

In [100]:
adult_df = pd.read_csv("adult.txt")
covtype_df = pd.read_csv("covtype.txt",names=list(range(0,55)))
letter_df = pd.read_csv("letter-recognition.txt",names=list(range(0,17))) 
#census_df = pd.read_csv("census-income.txt",names=list(range(0,42)))

In [101]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [102]:
imp_mean = SimpleImputer(missing_values=" ?", strategy="most_frequent").fit(adult_df)
adult_df = pd.DataFrame(imp_mean.transform(adult_df), columns=adult_df.columns)
adult_df = pd.get_dummies(adult_df, columns = ['workclass', 'education', 'marital-status', 'occupation', 
                                               'relationship', 'race', 'sex', 'native-country'])
# convert prediction data into binary labels
pred = np.where(adult_df['pred'] == ' >50K',1,0)
adult_df.drop(columns=['pred'], inplace=True)

# scale quantitative data
scaler = MinMaxScaler().fit(adult_df)
adult_df = pd.DataFrame(scaler.transform(adult_df),columns=adult_df.columns)
adult_df['pred'] = pred

adult_df.sample(10)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital-status_ Divorced,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,occupation_ Adm-clerical,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,native-country_ Cambodia,native-country_ Canada,native-country_ China,native-country_ Columbia,native-country_ Cuba,native-country_ Dominican-Republic,native-country_ Ecuador,native-country_ El-Salvador,native-country_ England,native-country_ France,native-country_ Germany,native-country_ Greece,native-country_ Guatemala,native-country_ Haiti,native-country_ Holand-Netherlands,native-country_ Honduras,native-country_ Hong,native-country_ Hungary,native-country_ India,native-country_ Iran,native-country_ Ireland,native-country_ Italy,native-country_ Jamaica,native-country_ Japan,native-country_ Laos,native-country_ Mexico,native-country_ Nicaragua,native-country_ Outlying-US(Guam-USVI-etc),native-country_ Peru,native-country_ Philippines,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,pred
22534,0.054795,0.074111,0.666667,0.0,0.0,0.357143,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
12020,0.068493,0.124961,0.6,0.0,0.0,0.193878,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
31928,0.383562,0.021871,0.533333,0.0,0.0,0.44898,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
29703,0.232877,0.126962,0.533333,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
22503,0.575342,0.054961,0.8,0.0,0.0,0.214286,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
17519,0.328767,0.125513,0.6,0.0,0.0,0.193878,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
480,0.219178,0.067549,0.733333,0.0,0.0,0.44898,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2678,0.191781,0.071475,0.6,0.0,0.0,0.44898,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
12767,0.041096,0.055744,0.6,0.0,0.0,0.193878,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
26717,0.09589,0.120525,0.4,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [78]:
pred_column = covtype_df[54]
positive_label = pred_column.value_counts().index[0]

# convert prediction data into binary labels
pred_column = np.where(pred_column == positive_label, 1, 0)

covtype_df.drop(columns=[54],inplace=True)

In [79]:
scaler = MinMaxScaler().fit(covtype_df)
covtype_df = pd.DataFrame(scaler.transform(covtype_df),columns=covtype_df.columns)
covtype_df.insert(54,'pred',pred_column,True)
covtype_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,pred
541925,0.617809,0.936111,0.287879,0.121689,0.22739,0.537586,0.688976,0.826772,0.677165,0.128259,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
481407,0.58029,0.772222,0.333333,0.021475,0.239018,0.307995,0.614173,0.937008,0.862205,0.091733,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
19999,0.550275,0.008333,0.19697,0.077309,0.241602,0.797808,0.795276,0.846457,0.586614,0.516381,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
405284,0.764382,0.533333,0.151515,0.375089,0.465116,0.354925,0.862205,0.976378,0.637795,0.107208,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
253277,0.70085,0.369444,0.409091,0.601288,0.360465,0.169875,0.988189,0.850394,0.295276,0.363167,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
71020,0.548274,0.141667,0.318182,0.319256,0.273902,0.542504,0.885827,0.744094,0.346457,0.813885,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
288718,0.275138,0.802778,0.439394,0.221188,0.390181,0.080511,0.496063,0.870079,0.901575,0.243134,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
345987,0.742871,0.363889,0.060606,0.307802,0.299742,0.238162,0.893701,0.933071,0.570866,0.276872,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
425954,0.629315,0.233333,0.106061,0.352183,0.196382,0.390614,0.905512,0.897638,0.515748,0.079883,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
230408,0.47924,0.238889,0.166667,0.0,0.223514,0.198117,0.929134,0.874016,0.46063,0.322738,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [81]:
letter_col = letter_df[0]
letter_df.drop(columns=[0],inplace=True)

In [82]:
scaler = MinMaxScaler().fit(letter_df)
letter_df_p1 = pd.DataFrame(scaler.transform(letter_df),columns=letter_df.columns)
letter_df_p2 = pd.DataFrame(scaler.transform(letter_df),columns=letter_df.columns)

# two ways to make binary classification labels
letter_df_p1_pred = np.where(letter_col == 'O', 1, 0)
letter_df_p2_pred = np.where(letter_col.isin([chr(x) for x in range(ord('A'), ord('M') + 1)]), 1, 0)
letter_df_p1.insert(16,'pred',letter_df_p1_pred,True)
letter_df_p2.insert(16,'pred',letter_df_p2_pred,True)


letter_df_p1.sample(10, random_state=0)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,pred
19134,0.2,0.2,0.266667,0.133333,0.066667,0.266667,0.733333,0.133333,0.466667,0.733333,0.666667,0.333333,0.066667,0.733333,0.133333,0.333333,0
4981,0.2,0.333333,0.4,0.266667,0.266667,0.6,0.4,0.2,0.4,0.666667,0.333333,0.466667,0.133333,0.533333,0.333333,0.6,0
16643,0.266667,0.533333,0.266667,0.333333,0.133333,0.2,0.533333,0.533333,0.133333,0.466667,0.333333,0.733333,0.266667,0.533333,0.2,0.666667,0
19117,0.333333,0.666667,0.466667,0.466667,0.266667,0.266667,0.666667,0.133333,0.533333,0.733333,0.8,0.6,0.2,0.6,0.133333,0.4,0
5306,0.266667,0.466667,0.266667,0.533333,0.333333,0.533333,0.466667,0.4,0.266667,0.533333,0.466667,0.6,0.2,0.533333,0.4,0.533333,0
230,0.466667,0.666667,0.666667,0.533333,0.333333,0.466667,0.533333,0.2,0.333333,0.666667,0.4,0.466667,0.466667,0.533333,0.066667,0.533333,0
3148,0.4,0.666667,0.4,0.466667,0.466667,0.266667,0.666667,0.133333,0.2,0.6,0.533333,0.533333,0.466667,0.733333,0.133333,0.466667,0
11525,0.4,0.733333,0.4,0.533333,0.466667,0.333333,0.666667,0.533333,0.2,0.466667,0.266667,0.533333,0.133333,0.466667,0.333333,0.733333,0
13672,0.333333,0.6,0.533333,0.533333,0.6,0.466667,0.533333,0.333333,0.266667,0.533333,0.4,0.533333,0.466667,0.6,0.533333,0.533333,0
1624,0.266667,0.333333,0.333333,0.533333,0.133333,0.466667,0.466667,0.266667,0.933333,0.6,0.4,0.533333,0.0,0.533333,0.533333,0.533333,0


In [83]:
letter_df_p2.sample(10, random_state=0)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,pred
19134,0.2,0.2,0.266667,0.133333,0.066667,0.266667,0.733333,0.133333,0.466667,0.733333,0.666667,0.333333,0.066667,0.733333,0.133333,0.333333,0
4981,0.2,0.333333,0.4,0.266667,0.266667,0.6,0.4,0.2,0.4,0.666667,0.333333,0.466667,0.133333,0.533333,0.333333,0.6,1
16643,0.266667,0.533333,0.266667,0.333333,0.133333,0.2,0.533333,0.533333,0.133333,0.466667,0.333333,0.733333,0.266667,0.533333,0.2,0.666667,1
19117,0.333333,0.666667,0.466667,0.466667,0.266667,0.266667,0.666667,0.133333,0.533333,0.733333,0.8,0.6,0.2,0.6,0.133333,0.4,0
5306,0.266667,0.466667,0.266667,0.533333,0.333333,0.533333,0.466667,0.4,0.266667,0.533333,0.466667,0.6,0.2,0.533333,0.4,0.533333,0
230,0.466667,0.666667,0.666667,0.533333,0.333333,0.466667,0.533333,0.2,0.333333,0.666667,0.4,0.466667,0.466667,0.533333,0.066667,0.533333,0
3148,0.4,0.666667,0.4,0.466667,0.466667,0.266667,0.666667,0.133333,0.2,0.6,0.533333,0.533333,0.466667,0.733333,0.133333,0.466667,0
11525,0.4,0.733333,0.4,0.533333,0.466667,0.333333,0.666667,0.533333,0.2,0.466667,0.266667,0.533333,0.133333,0.466667,0.333333,0.733333,0
13672,0.333333,0.6,0.533333,0.533333,0.6,0.466667,0.533333,0.333333,0.266667,0.533333,0.4,0.533333,0.466667,0.6,0.533333,0.533333,1
1624,0.266667,0.333333,0.333333,0.533333,0.133333,0.466667,0.466667,0.266667,0.933333,0.6,0.4,0.533333,0.0,0.533333,0.533333,0.533333,0


In [9]:
%%time

# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('classifier', RandomForestClassifier())])
search_space = [{'classifier': [LogisticRegression(solver='saga')],
                 'classifier__penalty': ['none','l1','l2'],
                 'classifier__C': np.logspace(-8, 4, 13)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1,2,6,12, 20]}]
# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=2), verbose=0)
# Fit grid search
best_model = clf.fit(covtype_df.iloc[:5000,:-1], covtype_df.iloc[:5000,-1])

  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "


KeyboardInterrupt: 

In [18]:
X, Y = covtype_df.iloc[:10000,:-1], covtype_df.iloc[:10000,-1]

In [19]:
%%time 

# take all our penguin data, and reserve 50% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    train_size=0.5,
                                                    random_state=0,
                                                    stratify=Y)


# Initializing Classifiers
clf1 = LogisticRegression(solver='saga',
                          random_state=0)
clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = SVC(random_state=0)

clf4 = RandomForestClassifier(random_state=0)

clf5 = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)

# clf6 = OrthogonalMatchingPursuit()

# Building the pipelines
pipe1 = Pipeline([('classifier', clf1)])

pipe2 = Pipeline([('classifier', clf2)])

pipe3 = Pipeline([('classifier', clf3)])

pipe4 = Pipeline([('classifier', clf4)])

pipe5 = Pipeline([('classifier', clf5)])

# pipe6 = Pipeline([('classifier', clf6)])


# Setting up the parameter grids
param_grid1 = [{'classifier__penalty': ['none', 'l1', 'l2'],
                'classifier__C': np.logspace(-8, 4, 13)}]

param_grid2 = [{'classifier__n_neighbors': np.arange(1, 100, 10)}]
#                 'classifier__p': [1, 2]}]

param_grid3 = [{'classifier__kernel': ['rbf'],
                'classifier__C': np.power(10., np.arange(-4, 4)),
                'classifier__gamma': np.power(10., np.arange(-5, 0))},
               {'classifier__kernel': ['linear'],                
                'classifier__C': np.power(10., np.arange(-4, 4))}]

param_grid4 = [{'classifier__n_estimators': [1024],
                'classifier__max_features': [1,2,4,6,8,12,16,20]}]

param_grid5 = [{'classifier__C': np.logspace(-8,4,13),
               'classifier__loss': ['hinge', 'squared_hinge']}]

# param_grid6 = [{'classifier__n_nonzero_coefs': [.01, .05, .1, .2, .6]}]

# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                            (pipe1, pipe2, pipe3, pipe4, pipe5),
                            ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=2, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

Wall time: 7.95 ms


In [20]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you
warnings.filterwarnings('ignore')


cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

    #FYI: This code uses X_train.iloc[... ] instead of X_train[...] because the 
    # penguin data is in a Dataframe instead of a numpy matrix


outer fold 1/5 | tuning KNN      | inner ACC 87.70% | outer ACC 88.40%
outer fold 1/5 | tuning Logistic | inner ACC 87.35% | outer ACC 85.00%
outer fold 1/5 | tuning PAC      | inner ACC 86.20% | outer ACC 85.00%
outer fold 1/5 | tuning RF       | inner ACC 90.05% | outer ACC 90.60%
outer fold 1/5 | tuning SVM      | inner ACC 88.10% | outer ACC 87.20%
outer fold 2/5 | tuning KNN      | inner ACC 88.10% | outer ACC 89.60%
outer fold 2/5 | tuning Logistic | inner ACC 86.85% | outer ACC 87.00%
outer fold 2/5 | tuning PAC      | inner ACC 85.80% | outer ACC 84.80%
outer fold 2/5 | tuning RF       | inner ACC 89.65% | outer ACC 90.80%
outer fold 2/5 | tuning SVM      | inner ACC 87.90% | outer ACC 90.40%
outer fold 3/5 | tuning KNN      | inner ACC 87.60% | outer ACC 90.60%
outer fold 3/5 | tuning Logistic | inner ACC 86.20% | outer ACC 88.60%
outer fold 3/5 | tuning PAC      | inner ACC 85.90% | outer ACC 84.60%
outer fold 3/5 | tuning RF      

KeyboardInterrupt: 

In [None]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

In [None]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

In [None]:
X, Y = letter_df.iloc[:10000,1:-2], letter_df.iloc[:10000,-2]

In [None]:
%%time 


# take all our penguin data, and reserve 50% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    train_size=0.5,
                                                    random_state=0,
                                                    stratify=Y)


# Initializing Classifiers
clf1 = LogisticRegression(solver='saga',
                          random_state=0)

clf2 = KNeighborsClassifier(algorithm='ball_tree',
                            leaf_size=50)
clf3 = SVC(random_state=0)

clf4 = RandomForestClassifier(random_state=0)

clf5 = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)

# clf6 = OrthogonalMatchingPursuit()

# Building the pipelines
pipe1 = Pipeline([('classifier', clf1)])

pipe2 = Pipeline([('classifier', clf2)])

pipe3 = Pipeline([('classifier', clf3)])

pipe4 = Pipeline([('classifier', clf4)])

pipe5 = Pipeline([('classifier', clf5)])

# pipe6 = Pipeline([('classifier', clf6)])


# Setting up the parameter grids
param_grid1 = [{'classifier__penalty': ['none', 'l1', 'l2'],
                'classifier__C': np.logspace(-8, 4, 13)}]

param_grid2 = [{'classifier__n_neighbors': np.arange(1, 100, 10)}]
#                 'classifier__p': [1, 2]}]

param_grid3 = [{'classifier__kernel': ['rbf'],
                'classifier__C': np.power(10., np.arange(-4, 4)),
                'classifier__gamma': np.power(10., np.arange(-5, 0))},
               {'classifier__kernel': ['linear'],                
                'classifier__C': np.power(10., np.arange(-4, 4))}]

param_grid4 = [{'classifier__n_estimators': [1024],
                'classifier__max_features': [1,2,4,6,8,12,16,20]}]

param_grid5 = [{'classifier__C': np.logspace(-8,4,13),
               'classifier__loss': ['hinge', 'squared_hinge']}]

# param_grid6 = [{'classifier__n_nonzero_coefs': [.01, .05, .1, .2, .6]}]

# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                            (pipe1, pipe2, pipe3, pipe4, pipe5),
                            ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=2, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

In [None]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you
warnings.filterwarnings('ignore')


cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

    #FYI: This code uses X_train.iloc[... ] instead of X_train[...] because the 
    # penguin data is in a Dataframe instead of a numpy matrix


In [None]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

In [None]:
# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

In [None]:
best_algo.cv_results_

In [None]:
len([covtype_df.iloc[5000:,:-1], covtype_df.iloc[5000:,-1]])

In [None]:
a = best_model.best_params_['classifier'].fit(covtype_df.iloc[:5000,:-1], covtype_df.iloc[:5000,-1]).predict(covtype_df.iloc[5000:,:-1])

In [None]:
np.mean(a)

In [None]:
best_model.best_params_

In [None]:
# parameters = {'C':[1],'penalty':['l2']}
X = covtype_df.drop(columns=[0])
y = np.where(covtype_df[16] == 2,1,0)
log_reg = LogisticRegression()
clf = GridSearchCV(estimator=log_reg,param_grid=parameters)
clf.fit(X,y)
print(clf.score(X))

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

X = adult_df[['workclass', 'marital-status', 'occupation', 'relationship']]
y = np.where(adult_df.iloc[:,-1] == ' >50K',1,0)

column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),
                                        ['workclass', 'marital-status', 'occupation']),
                                      (OrdinalEncoder(), ['relationship']),
                                      remainder='drop')
logreg = LogisticRegression()
pipe = make_pipeline(column_trans, logreg)
cross_val_score(pipe, X, y, cv=10, scoring='f1')


<br>

In [None]:
for i in range 3: 
    run_classifiers()

In [None]:
def run_classifiers(data):
    iterations = 3
    for i in range(iterations):
        # draw 5k samples for training data, and set aside the rest for testing 
        X_train, Y_train, X_test, Y_test = draw_samples(data)
        # returns the gridsearchCV model list thing
        gridcvs = create_gridsearch()
        
        select_best_and_fit()
        
        get_statistics()
        
        send_brain_to_file()

In [22]:
def draw_samples(data, n = 5000):
    train_index = random.sample(range(0,len(data)), n)
    # assumes target column is last column
    X_train, Y_train = data.iloc[train_index, :-1], data.iloc[train_index, -1]
    
    test = data[~data.index.isin(train_index)]
    X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

    return X_train, Y_train, X_test, Y_test

In [28]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
0,0.368684,0.141667,0.045455,0.184681,0.223514,0.071659,0.870079,0.913386,0.582677,0.875366,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.365683,0.155556,0.030303,0.151754,0.215762,0.054798,0.866142,0.925197,0.594488,0.867838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.472736,0.386111,0.136364,0.191840,0.307494,0.446817,0.921260,0.937008,0.531496,0.853339,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.463232,0.430556,0.272727,0.173228,0.375969,0.434172,0.937008,0.937008,0.480315,0.865886,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.368184,0.125000,0.030303,0.109520,0.222222,0.054939,0.866142,0.921260,0.590551,0.860449,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,0.268634,0.425000,0.303030,0.060845,0.245478,0.015175,0.944882,0.933071,0.464567,0.116688,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581008,0.266133,0.422222,0.287879,0.047960,0.239018,0.013348,0.944882,0.933071,0.468504,0.117803,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581009,0.263632,0.441667,0.257576,0.042949,0.232558,0.012646,0.929134,0.948819,0.511811,0.119058,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581010,0.262631,0.472222,0.227273,0.042949,0.229974,0.012646,0.905512,0.964567,0.562992,0.120452,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
X_train, Y_train, X_test, Y_test = draw_samples(covtype_df)

In [13]:
def create_gridsearch():
    scoring = {'accuracy' : 'accuracy',
               'f1' : 'f1', 
               'roc_auc' : 'roc_auc',
              }#            'MCC' : make_scorer(matthews_corrcoef)}
    
    # Initializing Classifiers
    clf1 = LogisticRegression(solver='saga',
                              random_state=0)

    clf2 = KNeighborsClassifier(algorithm='ball_tree',
                                leaf_size=50)
    clf3 = SVC(random_state=0)

    clf4 = RandomForestClassifier(random_state=0)

    clf5 = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)


    # Building the pipelines
    pipe1 = Pipeline([('classifier', clf1)])

    pipe2 = Pipeline([('classifier', clf2)])

    pipe3 = Pipeline([('classifier', clf3)])

    pipe4 = Pipeline([('classifier', clf4)])

    pipe5 = Pipeline([('classifier', clf5)])



    # Setting up the parameter grids
    param_grid1 = [{'classifier__penalty': ['none', 'l1', 'l2'],
                    'classifier__C': np.logspace(-8, 4, 13)}]

    param_grid2 = [{'classifier__n_neighbors': np.geomspace(1, 500, num=25, dtype=int),
                    'classifier__weights': ['uniform', 'distance']}]

    param_grid3 = [{'classifier__kernel': ['rbf'],
                    'classifier__C': np.power(10., np.arange(-7, 4)),
                    'classifier__gamma': [0.001,0.005,0.01,0.05,0.1,0.5,1,2]},
                   {'classifier__kernel': ['linear'],                
                    'classifier__C': np.power(10., np.arange(-7, 4))},
                   {'classifier__kernel': ['polynomial'],
                    'classifier__degree': [2,3],
                    'classifier__C': np.power(10., np.arange(-7, 4))}]

    param_grid4 = [{'classifier__n_estimators': [1024],
                    'classifier__max_features': [1,2,4,6,8,12,16, 20]}]

    param_grid5 = [{'classifier__C': np.logspace(-8,4,13),
                    'classifier__loss': ['hinge', 'squared_hinge']}]


    # Setting up multiple GridSearchCV objects, 1 for each algorithm
    gridcvs = {}

    for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3, param_grid4,param_grid5),
                                (pipe1, pipe2, pipe3, pipe4, pipe5),
                                ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')):
        gcv = GridSearchCV(estimator=est,
                           param_grid=pgrid,
                           scoring='accuracy', #scoring
                           n_jobs=1,
                           cv=5, 
                           verbose=0,
                           refit=True)
        gridcvs[name] = gcv
    return gridcvs

In [14]:
gridcvs = create_gridsearch()

In [19]:
%%time
appapppoopoo = run_gridsearch(gridcvs, X_train, Y_train)

outer fold 1/5 | tuning KNN      | inner ACC 79.12% | outer ACC 79.10%
outer fold 1/5 | tuning Logistic | inner ACC 76.33% | outer ACC 75.90%
outer fold 1/5 | tuning PAC      | inner ACC 74.90% | outer ACC 73.70%
outer fold 1/5 | tuning RF       | inner ACC 80.92% | outer ACC 82.30%
outer fold 1/5 | tuning SVM      | inner ACC 79.47% | outer ACC 80.00%
outer fold 2/5 | tuning KNN      | inner ACC 77.65% | outer ACC 79.70%
outer fold 2/5 | tuning Logistic | inner ACC 76.20% | outer ACC 73.40%
outer fold 2/5 | tuning PAC      | inner ACC 75.28% | outer ACC 74.10%
outer fold 2/5 | tuning RF       | inner ACC 81.05% | outer ACC 79.90%
outer fold 2/5 | tuning SVM      | inner ACC 79.38% | outer ACC 80.40%
outer fold 3/5 | tuning KNN      | inner ACC 78.85% | outer ACC 78.90%
outer fold 3/5 | tuning Logistic | inner ACC 76.42% | outer ACC 75.40%
outer fold 3/5 | tuning PAC      | inner ACC 74.90% | outer ACC 72.50%
outer fold 3/5 | tuning RF       | inner ACC 81.35% | outer ACC 81.80%
outer 

In [None]:
gridcvs

In [None]:
for name, gs_est in sorted(gridcvs.items()):
    print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

    # The inner loop for hyperparameter tuning
    gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
    y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
    acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
    print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
          (gs_est.best_score_ * 100, acc * 100))
    cv_scores[name].append(acc)


In [18]:
def run_gridsearch(gridcvs, X_train, Y_train):
    warnings.filterwarnings('ignore')
    
    cv_scores = {name: [] for name, gs_est in gridcvs.items()}

    skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

    # The outer loop for algorithm selection
    c = 1
    for outer_train_idx, outer_valid_idx in skfold.split(X_train,Y_train):
        for name, gs_est in sorted(gridcvs.items()):
            print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

            # The inner loop for hyperparameter tuning
            gs_est.fit(X_train.iloc[outer_train_idx], Y_train.iloc[outer_train_idx])
            y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
            acc = accuracy_score(y_true=Y_train.iloc[outer_valid_idx], y_pred=y_pred)
            print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
                  (gs_est.best_score_ * 100, acc * 100))
            cv_scores[name].append(acc)

        c += 1
    return cv_scores

In [None]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

In [None]:
gridcvs.items()

In [None]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you
warnings.filterwarnings('ignore')


cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

    #FYI: This code uses X_train.iloc[... ] instead of X_train[...] because the 
    # penguin data is in a Dataframe instead of a numpy matrix


In [None]:
?GridSearchCV