In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer

In [2]:
# Load the raw datasets. adult.txt is read with pandas' default header
# handling; the other two files are given integer column names
# (0..54 for covtype, 0..16 for letter-recognition).
adult_df = pd.read_csv("adult.txt")
covtype_df = pd.read_csv("covtype.txt", names=list(range(55)))
letter_df = pd.read_csv("letter-recognition.txt", names=list(range(17)))
#census_df = pd.read_csv("census-income.txt",names=list(range(0,42)))

In [3]:
# Raise pandas' display limits: up to 500 rows and 500 columns, 1000 characters wide.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000


In [4]:
# Replace the " ?" missing-value marker with each column's most frequent value.
# (The imputer keeps the name `imp_mean` even though it imputes the mode.)
imp_mean = SimpleImputer(missing_values=" ?", strategy="most_frequent")
adult_df = pd.DataFrame(imp_mean.fit_transform(adult_df), columns=adult_df.columns)

# One-hot encode the categorical columns.
adult_df = pd.get_dummies(
    adult_df,
    columns=['workclass', 'education', 'marital-status', 'occupation',
             'relationship', 'race', 'sex', 'native-country'],
)

# Turn the target into binary labels (1 for ' >50K', 0 otherwise) and set it
# aside so it is not passed through the scaler.
pred = np.where(adult_df['pred'] == ' >50K', 1, 0)
adult_df = adult_df.drop(columns=['pred'])

# Min-max scale every remaining column, then re-attach the target as the last column.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before
# draw_samples() separates train from test rows.
scaler = MinMaxScaler()
adult_df = pd.DataFrame(scaler.fit_transform(adult_df), columns=adult_df.columns)
adult_df = adult_df.assign(pred=pred)

adult_df.sample(10)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,education_ 7th-8th,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital-status_ Divorced,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,occupation_ Adm-clerical,occupation_ Armed-Forces,occupation_ Craft-repair,occupation_ Exec-managerial,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,native-country_ Cambodia,native-country_ Canada,native-country_ China,native-country_ Columbia,native-country_ Cuba,native-country_ Dominican-Republic,native-country_ Ecuador,native-country_ El-Salvador,native-country_ England,native-country_ France,native-country_ Germany,native-country_ Greece,native-country_ Guatemala,native-country_ Haiti,native-country_ Holand-Netherlands,native-country_ Honduras,native-country_ Hong,native-country_ Hungary,native-country_ India,native-country_ Iran,native-country_ Ireland,native-country_ Italy,native-country_ Jamaica,native-country_ 
Japan,native-country_ Laos,native-country_ Mexico,native-country_ Nicaragua,native-country_ Outlying-US(Guam-USVI-etc),native-country_ Peru,native-country_ Philippines,native-country_ Poland,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,pred
24216,0.054795,0.059028,0.866667,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
7120,0.410959,0.064483,1.0,0.0,0.0,0.346939,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
24466,0.123288,0.137428,0.533333,0.0,0.0,0.346939,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4104,0.178082,0.038063,0.6,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
20887,0.09589,0.074626,0.6,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
31717,0.273973,0.165263,0.533333,0.0,0.0,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
31364,0.60274,0.068455,0.266667,0.0,0.0,0.581633,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
23761,0.410959,0.015059,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2492,0.123288,0.115036,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
296,0.273973,0.030656,0.8,0.0,0.0,0.602041,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [5]:
# Pull the target (column 54) out of covtype_df, leaving only feature columns.
pred_column = covtype_df.pop(54)

# The most frequent class becomes the positive label.
positive_label = pred_column.value_counts().index[0]

# convert prediction data into binary labels (1 = positive class, 0 = any other)
pred_column = np.where(pred_column == positive_label, 1, 0)

In [6]:
# Min-max scale the covtype features, then append the binary target as the
# final column named 'pred'.
scaler = MinMaxScaler()
covtype_df = pd.DataFrame(scaler.fit_transform(covtype_df), columns=covtype_df.columns)
covtype_df.insert(loc=54, column='pred', value=pred_column, allow_duplicates=True)
covtype_df.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,pred
237116,0.553277,0.516667,0.075758,0.167502,0.25323,0.256007,0.870079,0.956693,0.622047,0.299456,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
212504,0.678339,0.291667,0.151515,0.021475,0.229974,0.332022,0.933071,0.897638,0.472441,0.410428,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
435023,0.577289,0.811111,0.272727,0.322119,0.502584,0.366306,0.649606,0.917323,0.814961,0.219992,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
508010,0.544772,0.919444,0.606061,0.198282,0.361757,0.219896,0.397638,0.633858,0.712598,0.151122,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
192085,0.698349,0.847222,0.287879,0.136006,0.301034,0.707602,0.645669,0.889764,0.791339,0.196849,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
91969,0.513757,0.088889,0.393939,0.202577,0.286822,0.522833,0.80315,0.681102,0.362205,0.3792,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
483907,0.613807,0.736111,0.212121,0.021475,0.22739,0.426303,0.720472,0.964567,0.0,0.194061,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
296740,0.573287,0.413889,0.19697,0.303508,0.342377,0.356049,0.933071,0.940945,0.507874,0.240206,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
466629,0.689345,0.058333,0.121212,0.077309,0.223514,0.431362,0.846457,0.877953,0.570866,0.38394,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
56701,0.55978,0.605556,0.121212,0.27917,0.26615,0.626669,0.830709,0.976378,0.681102,0.781681,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [7]:
# Move the letter column (column 0) out of letter_df; the remaining columns are the features.
letter_col = letter_df.pop(0)

In [8]:
# Min-max scale the letter features once; both binary problems share the same features.
scaler = MinMaxScaler()
letter_df_p1 = pd.DataFrame(scaler.fit_transform(letter_df), columns=letter_df.columns)
letter_df_p2 = letter_df_p1.copy()

# two ways to make binary classification labels:
#   p1 -> 1 for the letter 'O', 0 for everything else
#   p2 -> 1 for the letters 'A' through 'M', 0 for everything else
letter_df_p1_pred = np.where(letter_col == 'O', 1, 0)
letter_df_p2_pred = np.where(letter_col.isin(list('ABCDEFGHIJKLM')), 1, 0)

# Append each label vector as the last column, named 'pred'.
letter_df_p1.insert(loc=16, column='pred', value=letter_df_p1_pred, allow_duplicates=True)
letter_df_p2.insert(loc=16, column='pred', value=letter_df_p2_pred, allow_duplicates=True)


letter_df_p1.sample(10, random_state=0)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,pred
19134,0.2,0.2,0.266667,0.133333,0.066667,0.266667,0.733333,0.133333,0.466667,0.733333,0.666667,0.333333,0.066667,0.733333,0.133333,0.333333,0
4981,0.2,0.333333,0.4,0.266667,0.266667,0.6,0.4,0.2,0.4,0.666667,0.333333,0.466667,0.133333,0.533333,0.333333,0.6,0
16643,0.266667,0.533333,0.266667,0.333333,0.133333,0.2,0.533333,0.533333,0.133333,0.466667,0.333333,0.733333,0.266667,0.533333,0.2,0.666667,0
19117,0.333333,0.666667,0.466667,0.466667,0.266667,0.266667,0.666667,0.133333,0.533333,0.733333,0.8,0.6,0.2,0.6,0.133333,0.4,0
5306,0.266667,0.466667,0.266667,0.533333,0.333333,0.533333,0.466667,0.4,0.266667,0.533333,0.466667,0.6,0.2,0.533333,0.4,0.533333,0
230,0.466667,0.666667,0.666667,0.533333,0.333333,0.466667,0.533333,0.2,0.333333,0.666667,0.4,0.466667,0.466667,0.533333,0.066667,0.533333,0
3148,0.4,0.666667,0.4,0.466667,0.466667,0.266667,0.666667,0.133333,0.2,0.6,0.533333,0.533333,0.466667,0.733333,0.133333,0.466667,0
11525,0.4,0.733333,0.4,0.533333,0.466667,0.333333,0.666667,0.533333,0.2,0.466667,0.266667,0.533333,0.133333,0.466667,0.333333,0.733333,0
13672,0.333333,0.6,0.533333,0.533333,0.6,0.466667,0.533333,0.333333,0.266667,0.533333,0.4,0.533333,0.466667,0.6,0.533333,0.533333,0
1624,0.266667,0.333333,0.333333,0.533333,0.133333,0.466667,0.466667,0.266667,0.933333,0.6,0.4,0.533333,0.0,0.533333,0.533333,0.533333,0


In [9]:
# Preview 10 rows of the second letter problem; random_state=0 makes the sample repeatable.
letter_df_p2.sample(10, random_state=0)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,pred
19134,0.2,0.2,0.266667,0.133333,0.066667,0.266667,0.733333,0.133333,0.466667,0.733333,0.666667,0.333333,0.066667,0.733333,0.133333,0.333333,0
4981,0.2,0.333333,0.4,0.266667,0.266667,0.6,0.4,0.2,0.4,0.666667,0.333333,0.466667,0.133333,0.533333,0.333333,0.6,1
16643,0.266667,0.533333,0.266667,0.333333,0.133333,0.2,0.533333,0.533333,0.133333,0.466667,0.333333,0.733333,0.266667,0.533333,0.2,0.666667,1
19117,0.333333,0.666667,0.466667,0.466667,0.266667,0.266667,0.666667,0.133333,0.533333,0.733333,0.8,0.6,0.2,0.6,0.133333,0.4,0
5306,0.266667,0.466667,0.266667,0.533333,0.333333,0.533333,0.466667,0.4,0.266667,0.533333,0.466667,0.6,0.2,0.533333,0.4,0.533333,0
230,0.466667,0.666667,0.666667,0.533333,0.333333,0.466667,0.533333,0.2,0.333333,0.666667,0.4,0.466667,0.466667,0.533333,0.066667,0.533333,0
3148,0.4,0.666667,0.4,0.466667,0.466667,0.266667,0.666667,0.133333,0.2,0.6,0.533333,0.533333,0.466667,0.733333,0.133333,0.466667,0
11525,0.4,0.733333,0.4,0.533333,0.466667,0.333333,0.666667,0.533333,0.2,0.466667,0.266667,0.533333,0.133333,0.466667,0.333333,0.733333,0
13672,0.333333,0.6,0.533333,0.533333,0.6,0.466667,0.533333,0.333333,0.266667,0.533333,0.4,0.533333,0.466667,0.6,0.533333,0.533333,1
1624,0.266667,0.333333,0.333333,0.533333,0.133333,0.466667,0.466667,0.266667,0.933333,0.6,0.4,0.533333,0.0,0.533333,0.533333,0.533333,0


<br>

In [71]:
def progressBar(current, total, barLength = 20):
    """Print a single-line text progress bar.

    The line ends with a carriage return instead of a newline, so the next
    call overwrites it.

    current   -- amount of work done so far
    total     -- amount of work that counts as 100 % (must be non-zero)
    barLength -- width of the bar between the brackets, in characters
    """
    percent = float(current) * 100 / total
    # The bar is a run of dashes ending in '>', padded with spaces to barLength.
    dash_count = int(percent/100 * barLength - 1)
    bar = ('-' * dash_count + '>').ljust(barLength)

    print('Progress: [{}] {:d} %'.format(bar, int(percent)), end='\r')

In [73]:
def run_classifiers(data, iterations=3):
    """Run the full model-selection experiment on one dataset, several times.

    Each iteration draws a fresh train/test split, builds new grid searches,
    runs the nested cross-validation, refits every search on the whole
    training sample and collects train/test metrics.

    data       -- DataFrame whose last column is the binary target
    iterations -- number of independent repetitions (default 3)

    Returns a list with one entry per iteration, each of the form
    [gridcvs, cvscores, best_algos, stats].
    """
    out = []

    for _ in range(iterations):
        # draw the training sample, and set aside the rest for testing
        X_train, Y_train, X_test, Y_test = draw_samples(data)

        # one unfitted GridSearchCV per algorithm, keyed by algorithm name
        gridcvs = create_gridsearch()

        cvscores = run_gridsearch(gridcvs, X_train, Y_train)

        # the same search objects, refitted on the whole training sample
        best_algos = best_model_selection(gridcvs, X_train, Y_train)

        stats = output_statistics(best_algos, X_train, Y_train, X_test, Y_test)

        out.append([gridcvs, cvscores, best_algos, stats])

    return out

In [17]:
def draw_samples(data, n = 5000):
    """Split `data` into a random training sample of `n` rows and a test set of the rest.

    data -- DataFrame; the last column is taken to be the target
    n    -- number of training rows to draw without replacement

    Returns (X_train, Y_train, X_test, Y_test).

    Raises ValueError (from random.sample) when n exceeds the number of rows.
    Sampling uses the module-level `random` generator, so call random.seed()
    beforehand for a repeatable split.
    """
    n_rows = len(data)
    train_index = random.sample(range(n_rows), n)

    # The sampled values are row POSITIONS, so the test rows must be excluded
    # by position as well. Filtering on index labels would leak training rows
    # into the test set for any frame without a default 0..n-1 index.
    is_train = np.zeros(n_rows, dtype=bool)
    is_train[train_index] = True

    # assumes target column is last column
    X_train, Y_train = data.iloc[train_index, :-1], data.iloc[train_index, -1]

    test = data.iloc[~is_train]
    X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

    return X_train, Y_train, X_test, Y_test

In [19]:
def create_gridsearch():
    """Build one unfitted GridSearchCV per candidate algorithm.

    Returns a dict mapping the algorithm name ('Logistic', 'KNN', 'SVM',
    'RF', 'PAC') to a GridSearchCV. Each search wraps a single-step Pipeline
    (step name 'classifier') and tunes it with 5-fold cross-validation on
    accuracy, refitting the best candidate afterwards.
    """
    # Initializing Classifiers (insertion order fixes the order of the result)
    classifiers = {
        'Logistic': LogisticRegression(solver='saga', random_state=0),
        'KNN': KNeighborsClassifier(algorithm='ball_tree', leaf_size=50),
        'SVM': SVC(random_state=0),
        'RF': RandomForestClassifier(random_state=0),
        'PAC': PassiveAggressiveClassifier(max_iter=5000, random_state=0, tol=1e-3),
    }

    svm_costs = np.power(10., np.arange(-7, 4))

    # Setting up the parameter grids
    param_grids = {
        # NOTE(review): newer scikit-learn releases expect None rather than the
        # string 'none' for an unpenalised model - confirm against the
        # installed version.
        'Logistic': [{'classifier__penalty': ['none', 'l1', 'l2'],
                      'classifier__C': np.logspace(-8, 4, 13)}],

        # Truncating the geometric spacing to integers repeats the small
        # values (1, 1, 1, 2, 2, ...); np.unique drops the repeats so that
        # identical candidates are not fitted more than once.
        'KNN': [{'classifier__n_neighbors': np.unique(np.geomspace(1, 500, num=25, dtype=int)),
                 'classifier__weights': ['uniform', 'distance']}],

        'SVM': [{'classifier__kernel': ['rbf'],
                 'classifier__C': svm_costs,
                 'classifier__gamma': [0.001,0.005,0.01,0.05,0.1,0.5,1,2]},
                {'classifier__kernel': ['linear'],
                 'classifier__C': svm_costs},
                # SVC names its polynomial kernel 'poly'; 'polynomial' is not
                # a valid kernel, so this sub-grid could never be fitted.
                {'classifier__kernel': ['poly'],
                 'classifier__degree': [2,3],
                 'classifier__C': svm_costs}],

        # NOTE(review): max_features=20 exceeds the 16 features of the letter
        # data - confirm how the installed scikit-learn handles that.
        'RF': [{'classifier__n_estimators': [1024],
                'classifier__max_features': [1,2,4,6,8,12,16, 20]}],

        'PAC': [{'classifier__C': np.logspace(-8,4,13),
                 'classifier__loss': ['hinge', 'squared_hinge']}],
    }

    # Setting up multiple GridSearchCV objects, 1 for each algorithm
    gridcvs = {}
    for name, clf in classifiers.items():
        gridcvs[name] = GridSearchCV(estimator=Pipeline([('classifier', clf)]),
                                     param_grid=param_grids[name],
                                     scoring='accuracy',
                                     n_jobs=1,
                                     cv=5,
                                     verbose=0,
                                     refit=True)
    return gridcvs

In [20]:
def run_gridsearch(gridcvs, X_train, Y_train):
    """Estimate each algorithm's accuracy with nested cross-validation.

    The outer loop is a shuffled, stratified 5-fold split of the training
    sample. On every outer fold each GridSearchCV in `gridcvs` is fitted on
    the outer-training part (the search's own CV is the inner loop) and
    scored on the outer-validation part.

    gridcvs -- dict mapping algorithm name to a GridSearchCV
    X_train -- feature DataFrame (indexed with .iloc)
    Y_train -- target Series (indexed with .iloc)

    Returns a dict mapping algorithm name to the list of outer-fold accuracies.
    Progress and a summary are printed along the way.
    """
    # NOTE(review): this silences every warning for the rest of the session,
    # not just for the duration of this call.
    warnings.filterwarnings('ignore')

    n_outer_folds = 5
    cv_scores = {name: [] for name in gridcvs}

    skfold = StratifiedKFold(n_splits=n_outer_folds, shuffle=True, random_state=1)

    # The outer loop for algorithm selection
    outer_splits = skfold.split(X_train, Y_train)
    for fold, (outer_train_idx, outer_valid_idx) in enumerate(outer_splits, start=1):
        X_fit, Y_fit = X_train.iloc[outer_train_idx], Y_train.iloc[outer_train_idx]
        X_valid, Y_valid = X_train.iloc[outer_valid_idx], Y_train.iloc[outer_valid_idx]

        for name, gs_est in sorted(gridcvs.items()):
            print('outer fold %d/%d | tuning %-8s' % (fold, n_outer_folds, name), end='')

            # The inner loop for hyperparameter tuning
            gs_est.fit(X_fit, Y_fit)
            y_pred = gs_est.predict(X_valid)
            acc = accuracy_score(y_true=Y_valid, y_pred=y_pred)
            print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
                  (gs_est.best_score_ * 100, acc * 100))
            cv_scores[name].append(acc)

    # Looking at the results
    for name in cv_scores:
        print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
              name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
    print()
    # Each search object is refitted on every outer fold, so the parameters
    # shown here are the ones selected on the last fold only.
    for name in cv_scores:
        print('{} best parameters'.format(name), gridcvs[name].best_params_)

    return cv_scores

In [44]:
def best_model_selection(gridcvs, X_train, Y_train):
    """Fit every search in `gridcvs` on the whole training sample.

    gridcvs -- dict holding a search object under each of the keys
               'Logistic', 'KNN', 'SVM', 'RF' and 'PAC'
    X_train -- training features
    Y_train -- training target

    Returns the fitted search objects as a list, in the key order above.
    """
    def _fit_on_full_sample(model_class):
        # Look up and fit one search at a time so the fits run in list order.
        search = gridcvs[model_class]
        search.fit(X_train, Y_train)  ## TODO: add timing
        return search

    return [_fit_on_full_sample(model_class)
            for model_class in ('Logistic', 'KNN', 'SVM', 'RF', 'PAC')]

In [53]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [68]:
def output_statistics(best_algos, X_train, Y_train, X_test, Y_test):
    """Score every fitted model on the training sample and on the test set.

    best_algos -- fitted models, ordered as 'Logistic', 'KNN', 'SVM', 'RF', 'PAC'
    X_train, Y_train -- training features and target
    X_test, Y_test   -- test features and target

    Returns a dict mapping the algorithm name to [train_metrics, test_metrics].
    Each of the two is a dict keyed by metric function name
    ('accuracy_score', 'matthews_corrcoef', 'f1_score').
    """
    metric_funcs = [accuracy_score, matthews_corrcoef, f1_score]

    # for each algorithm, get the train and test metrics
    algo_metrics = dict()
    for name, algo in zip(['Logistic', 'KNN', 'SVM', 'RF', 'PAC'], best_algos):

        # Predict once per split and reuse the predictions for every metric;
        # predicting again for each metric repeats the same work.
        train_pred = algo.predict(X_train)
        test_pred = algo.predict(X_test)

        train_metric, test_metric = dict(), dict()
        for metric in metric_funcs:
            train_metric[metric.__name__] = metric(y_true=Y_train, y_pred=train_pred)
            test_metric[metric.__name__] = metric(y_true=Y_test, y_pred=test_pred)

        algo_metrics[name] = [train_metric, test_metric]

    return algo_metrics

In [None]:
%%time
# Run the whole experiment on the first letter problem (target column built from letter == 'O').
data = run_classifiers(letter_df_p1)

outer fold 1/5 | tuning KNN      | inner ACC 98.62% | outer ACC 99.00%
outer fold 1/5 | tuning Logistic | inner ACC 96.40% | outer ACC 96.40%
outer fold 1/5 | tuning PAC      | inner ACC 96.40% | outer ACC 96.40%
outer fold 1/5 | tuning RF       | inner ACC 98.50% | outer ACC 98.10%
outer fold 1/5 | tuning SVM     

In [None]:
%%time
# Run the whole experiment on every prepared dataset. Results are appended as
# each dataset finishes; bigdata[i] is the run_classifiers() output for the
# i-th frame in the list below.
bigdata = []
for df in [adult_df, covtype_df, letter_df_p1, letter_df_p2]:
    data = run_classifiers(df)
    bigdata.append(data)

In [None]:
# Manual, single-pass walk through the same steps run_classifiers() performs,
# on the covtype data only.
X_train, Y_train, X_test, Y_test = draw_samples(covtype_df)
gridcvs = create_gridsearch()
# NOTE(review): placeholder name; this holds the dict of outer-fold accuracies
# returned by run_gridsearch() - consider renaming it (e.g. cv_scores).
appapppoopoo = run_gridsearch(gridcvs, X_train, Y_train)
best_algos = best_model_selection(gridcvs, X_train, Y_train)
stats = output_statistics(best_algos, X_train, Y_train, X_test, Y_test)