In [68]:
# import packages

import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.svm
import sklearn.tree
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics
import sklearn.neural_network

In [4]:
# read in all datasets

ADULT_df = pd.read_csv('./dataset/adult.data', names=['age', 'workclass', 
                                                      'fnlwgt', 'education', 'education-num','marital-status',
                                                     'occupation', 'relationship','race', 'sex',
                                                     'capital-gain', 'capital-loss','hoursperweek',
                                                     'country', 'label']) # ADULT dataset
AUG_df = pd.read_csv('./dataset/aug_train.csv') # data scientists dataset
BANK_df = pd.read_csv('./dataset/bank-full.csv', delimiter=';')
INS_df = pd.read_csv('./dataset/insurance_train.csv') #insurance dataset
AUS_df = pd.read_csv('./dataset/weatherAUS.csv') # australian weather dataset
LETTER_df = pd.read_csv('./dataset/letter.data', names=['lettr','xbox','ybox','width','high','onpix', 'xbar', 'ybar', 'x2bar', 'y2bar', 'xybar','x2ybr','xy2br','xege','xegvy','yege','yegvx'])

# Preprocess ADULT

In [5]:
ADULT_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hoursperweek,country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Encode Categorical value and label

In [5]:
labelenc = sklearn.preprocessing.OrdinalEncoder()
ADULT_df2 = ADULT_df.copy()
ADULT_df2[['workclass','marital-status','occupation','relationship','race','sex','country']] = labelenc.fit_transform(ADULT_df2[['workclass','marital-status','occupation','relationship','race','sex','country']])

In [6]:
ADULT_categorical = ADULT_df2[['workclass','marital-status','occupation','relationship','race','sex','country']]

In [7]:
ADULT_df2 = ADULT_df2.drop(['workclass', 'education','marital-status','occupation','relationship','race','sex','country'],axis=1)

In [8]:
ADULT_df['label'] = ADULT_df2['label'].apply(lambda x: 1 if x[1] == '>' else 0)

In [9]:
ADULT_df = ADULT_df.drop(['workclass','marital-status','occupation','relationship','race','sex','country'],axis=1)

In [10]:
ADULT_df = ADULT_df.join(ADULT_categorical)

In [54]:
ADULT_df.drop(['education'], axis=1).to_csv('./dataset/ADULT_processed.csv')

## ADULT - Train-test split

Train size 5000, test size 27561

In [12]:
adult_train, adult_test = sklearn.model_selection.train_test_split(ADULT_df, train_size=5000)

# Preprocess AUG

In [33]:
AUG_df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [34]:
categorical_cols = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline','experience','company_size','company_type','last_new_job']

In [35]:
AUG_categorical = AUG_df[categorical_cols]

In [36]:
AUG_df.drop(categorical_cols, axis=1, inplace=True)

In [37]:
AUG_df

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
0,8949,0.920,36,1.0
1,29725,0.776,47,0.0
2,11561,0.624,83,0.0
3,33241,0.789,52,1.0
4,666,0.767,8,0.0
...,...,...,...,...
19153,7386,0.878,42,1.0
19154,31398,0.920,52,1.0
19155,24576,0.920,44,0.0
19156,5756,0.802,97,0.0


In [38]:
AUG_categorical = AUG_categorical.fillna('Unknown')

In [39]:
AUG_categorical

Unnamed: 0,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Unknown,Unknown,1
1,city_40,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4
2,city_21,Unknown,No relevent experience,Full time course,Graduate,STEM,5,Unknown,Unknown,never
3,city_115,Unknown,No relevent experience,Unknown,Graduate,Business Degree,<1,Unknown,Pvt Ltd,never
4,city_162,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4
...,...,...,...,...,...,...,...,...,...,...
19153,city_173,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,Unknown,Unknown,1
19154,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,Unknown,Unknown,4
19155,city_103,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4
19156,city_65,Male,Has relevent experience,no_enrollment,High School,Unknown,<1,500-999,Pvt Ltd,2


In [43]:
labelenc = sklearn.preprocessing.OrdinalEncoder()
AUG_categorical[categorical_cols] = labelenc.fit_transform(AUG_categorical)

In [45]:
AUG_df = AUG_df.join(AUG_categorical)

In [46]:
AUG_df

Unnamed: 0,enrollee_id,city_development_index,training_hours,target,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,8949,0.920,36,1.0,5.0,1.0,0.0,3.0,0.0,5.0,21.0,8.0,6.0,0.0
1,29725,0.776,47,0.0,77.0,1.0,1.0,3.0,0.0,5.0,6.0,4.0,5.0,4.0
2,11561,0.624,83,0.0,64.0,3.0,1.0,0.0,0.0,5.0,15.0,8.0,6.0,6.0
3,33241,0.789,52,1.0,14.0,3.0,1.0,2.0,0.0,1.0,20.0,8.0,5.0,6.0
4,666,0.767,8,0.0,50.0,1.0,0.0,3.0,2.0,5.0,21.0,4.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,0.878,42,1.0,55.0,1.0,1.0,3.0,0.0,2.0,5.0,8.0,6.0,0.0
19154,31398,0.920,52,1.0,5.0,1.0,0.0,3.0,0.0,5.0,5.0,8.0,6.0,3.0
19155,24576,0.920,44,0.0,5.0,1.0,0.0,3.0,0.0,5.0,21.0,4.0,5.0,3.0
19156,5756,0.802,97,0.0,94.0,1.0,0.0,3.0,1.0,6.0,20.0,5.0,5.0,1.0


In [49]:
AUG_df.to_csv('./dataset/AUG_processed.csv') # save preprocessed to file

# Preprocess BANK

In [56]:
BANK_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [31]:
# binary columns use 0 and 1
BANK_df['default'] = BANK_df['default'].apply(lambda x : 1 if x == 'yes' else 0)
BANK_df['housing'] = BANK_df['housing'].apply(lambda x : 1 if x == 'yes' else 0)
BANK_df['loan'] = BANK_df['loan'].apply(lambda x : 1 if x == 'yes' else 0)
BANK_df['y'] = BANK_df['y'].apply(lambda x : 1 if x == 'yes' else 0)

In [32]:
BANK_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,0,825,0,0,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,0,1729,0,0,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,0,5715,0,0,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,0,668,0,0,telephone,17,nov,508,4,-1,0,unknown,0


In [34]:
BANK_categorical_cols = ['job', 'marital', 'education', 'contact', 'poutcome']
BANK_categorical = BANK_df[BANK_categorical_cols]
BANK_df.drop(BANK_categorical_cols, inplace=True, axis=1)

In [43]:
enc = sklearn.preprocessing.OneHotEncoder()
BANK_onehot = pd.DataFrame(enc.fit_transform(BANK_categorical).toarray(), dtype=int)

In [45]:
BANK_df = BANK_df.join(BANK_onehot)

In [47]:
# ignore dates
BANK_df.drop(['day', 'month'], axis=1, inplace=True)

In [48]:
BANK_df # processed bank dataset

Unnamed: 0,age,default,balance,housing,loan,duration,campaign,pdays,previous,y,...,16,17,18,19,20,21,22,23,24,25
0,58,0,2143,1,0,261,1,-1,0,0,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,151,1,-1,0,0,...,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,76,1,-1,0,0,...,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,92,1,-1,0,0,...,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,198,1,-1,0,0,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,977,3,-1,0,1,...,0,1,0,1,0,0,0,0,0,1
45207,71,0,1729,0,0,456,2,-1,0,1,...,0,0,0,1,0,0,0,0,0,1
45208,72,0,5715,0,0,1127,5,184,3,1,...,1,0,0,1,0,0,0,0,1,0
45209,57,0,668,0,0,508,4,-1,0,0,...,1,0,0,0,1,0,0,0,0,1


In [49]:
BANK_df.to_csv('./dataset/BANK_processed.csv')

# Process INS

In [52]:
INS_df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...
381104,381105,Male,74,1,26.0,1,1-2 Year,No,30170.0,26.0,88,0
381105,381106,Male,30,1,37.0,1,< 1 Year,No,40016.0,152.0,131,0
381106,381107,Male,21,1,30.0,1,< 1 Year,No,35118.0,160.0,161,0
381107,381108,Female,68,1,14.0,0,> 2 Years,Yes,44617.0,124.0,74,0


In [55]:
INS_df['Vehicle_Age'].unique()

array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object)

In [56]:
# encode binary 
INS_df['Gender'] = INS_df['Gender'].apply(lambda x : 1 if x[0] == 'M' else 0)
INS_df['Vehicle_Damage'] = INS_df['Vehicle_Damage'].apply(lambda x : 1 if x[0] == 'Y' else 0)

In [81]:
INS_onehotenc= sklearn.preprocessing.OneHotEncoder()

In [None]:
INS_onehot = INS_onehotenc.fit_transform(INS_df['Vehicle_Age'].to_numpy().reshape(-1, 1))

In [90]:
INS_onehot = pd.DataFrame(INS_onehot, dtype=int)

In [88]:
INS_df.drop('Vehicle_Age', inplace=True, axis=1)

In [91]:
INS_df = INS_df.join(INS_onehot)

In [96]:
# too large, randomly sample 20000 subsets
INS_df = INS_df.sample(20000).reset_index()

In [97]:
INS_df.to_csv('./dataset/INS_processed.csv')

# Process LETTER

In [105]:
LETTER_df

Unnamed: 0,lettr,xbox,ybox,width,high,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybr,xy2br,xege,xegvy,yege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [109]:
ord('M')

77

In [111]:
LETTER_df['label'] = LETTER_df['lettr'].apply(lambda x : 1 if ord(x) <= 77 else 0)

In [113]:
LETTER_df.to_csv('./dataset/LETTER_processed.csv')

# Read processed datasets

In [22]:
AUG_df = pd.read_csv("./dataset/AUG_processed.csv").drop('Unnamed: 0',axis=1)
ADULT_df = pd.read_csv("./dataset/ADULT_processed.csv").drop('Unnamed: 0', axis=1)
BANK_df = pd.read_csv("./dataset/BANK_processed.csv").drop('Unnamed: 0', axis=1)
INS_df = pd.read_csv("./dataset/INS_processed.csv").drop('Unnamed: 0', axis=1)
LETTER_df = pd.read_csv("./dataset/LETTER_processed.csv").drop('Unnamed: 0', axis=1)

In [89]:
# All estimators used
estimators = [sklearn.svm.SVC(class_weight='balanced', max_iter=10000), 
              sklearn.linear_model.LogisticRegression(class_weight='balanced', max_iter=10000),
              sklearn.tree.DecisionTreeClassifier(class_weight='balanced'), 
             sklearn.neighbors.KNeighborsClassifier(),
              sklearn.ensemble.RandomForestClassifier(class_weight='balanced', n_estimators=500), 
             sklearn.neural_network.MLPClassifier(max_iter=5000)]
params = [[{'C':[10e-5, 10e-4, 10e-3, 10e-2, 1, 10], 'kernel': ['rbf', 'linear', 'sigmoid']}, 
           {'C':[10e-5, 10e-4, 10e-3, 10e-2, 1, 10], 'degree':[2,3], 'kernel':['poly']}],
          [{'C':[10e-5, 10e-4, 10e-3, 10e-2, 1, 10], 'penalty':['l2'], 'solver':['newton-cg', 'sag','lbfgs', 'saga']},
            {'C':[10e-5, 10e-4, 10e-3, 10e-2, 1, 10], 'penalty': ['l1'], 'solver': ['saga']}],
         {'criterion': ['gini', 'entropy'], 'splitter':['best', 'random'], 'max_features':['sqrt', 'log2']},
         {'n_neighbors':np.arange(1, 103, 4), 'weights':['uniform', 'distance']},
         {'criterion': ['gini', 'entropy'], 'max_features': ['sqrt', 'log2']}, 
         {'hidden_layer_sizes': [(64,), (16,), (32,), (128,)], "momentum":[0.2, 0.5, 0.9], "activation":['tanh', 'relu']}]

# 5 Trials on AUG

In [82]:
from collections import defaultdict
AUG_res = defaultdict(list)
AUG_perf = defaultdict(defaultdict)

In [92]:
# Run this 5 times
for idx, c in enumerate(estimators):
    grid = params[idx]
    clf = sklearn.model_selection.GridSearchCV(estimator=c, param_grid=grid, n_jobs=-1, verbose=6)
    name = c.__class__.__name__
    AUG_train, AUG_test = sklearn.model_selection.train_test_split(AUG_df, train_size=5000)
    if name in ['SVC', 'LogisticRegression', 'KNeighborsClassifier']:
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(AUG_train.loc[:, AUG_train.columns != 'target'])
        clf.fit(scaler.transform(AUG_train.loc[:, AUG_train.columns != 'target']), AUG_train['target'])
    else:
        clf.fit(AUG_train.loc[:, AUG_train.columns != 'target'], AUG_train['target'])
    AUG_res[name].append(clf.cv_results_)
    test_pred = clf.best_estimator_.predict(AUG_test.loc[:, AUG_test.columns != 'target'])
    if not 'f1' in AUG_perf[name]:
        AUG_perf[name]['f1'] = 0
    if not 'accuracy' in AUG_perf[name]:
        AUG_perf[name]['accuracy'] = 0
    if not 'rocauc' in AUG_perf[name]:
        AUG_perf[name]['rocauc'] = 0
    AUG_perf[name]['f1'] += sklearn.metrics.f1_score(AUG_test['target'], test_pred)
    AUG_perf[name]['accuracy'] += sklearn.metrics.accuracy_score(AUG_test['target'], test_pred)
    AUG_perf[name]['rocauc'] += sklearn.metrics.roc_auc_score(AUG_test['target'], test_pred)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [93]:
for c in AUG_perf:
    for perf in AUG_perf[c]:
        AUG_perf[c][perf] /= 5

In [94]:
AUG_perf

defaultdict(collections.defaultdict,
            {'SVC': defaultdict(None,
                         {'f1': 0.3221461222459219,
                          'accuracy': 0.5328718745585534,
                          'rocauc': 0.5345231869617707}),
             'LogisticRegression': defaultdict(None,
                         {'f1': 0.08088164368524423,
                          'accuracy': 0.7476762254555729,
                          'rocauc': 0.5130997429582752}),
             'DecisionTreeClassifier': defaultdict(None,
                         {'f1': 0.42485221585512695,
                          'accuracy': 0.7155247916372368,
                          'rocauc': 0.6171055603861254}),
             'KNeighborsClassifier': defaultdict(None,
                         {'f1': 0.0,
                          'accuracy': 0.7518434807176155,
                          'rocauc': 0.5}),
             'RandomForestClassifier': defaultdict(None,
                         {'f1': 0.4984279966914954,
       

In [96]:
np.save('./result/AUG_res', AUG_res)

# 5 Trials on ADULT

In [97]:
ADULT_res = defaultdict(list)
ADULT_perf = defaultdict(defaultdict)

In [105]:
# Run this 5 times
for idx, c in enumerate(estimators):
    grid = params[idx]
    clf = sklearn.model_selection.GridSearchCV(estimator=c, param_grid=grid, n_jobs=-1, verbose=6)
    name = c.__class__.__name__
    ADULT_train, ADULT_test = sklearn.model_selection.train_test_split(ADULT_df, train_size=5000)
    if name in ['SVC', 'LogisticRegression', 'KNeighborsClassifier']:
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(ADULT_train.loc[:, ADULT_train.columns != 'label'])
        clf.fit(scaler.transform(ADULT_train.loc[:, ADULT_train.columns != 'label']), ADULT_train['label'])
    else:
        clf.fit(ADULT_train.loc[:, ADULT_train.columns != 'label'], ADULT_train['label'])
    ADULT_res[name].append(clf.cv_results_)
    test_pred = clf.best_estimator_.predict(ADULT_test.loc[:, ADULT_test.columns != 'label'])
    if not 'f1' in ADULT_perf[name]:
        ADULT_perf[name]['f1'] = 0
    if not 'accuracy' in ADULT_perf[name]:
        ADULT_perf[name]['accuracy'] = 0
    if not 'rocauc' in ADULT_perf[name]:
        ADULT_perf[name]['rocauc'] = 0
    ADULT_perf[name]['f1'] += sklearn.metrics.f1_score(ADULT_test['label'], test_pred)
    ADULT_perf[name]['accuracy'] += sklearn.metrics.accuracy_score(ADULT_test['label'], test_pred)
    ADULT_perf[name]['rocauc'] += sklearn.metrics.roc_auc_score(ADULT_test['label'], test_pred)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [106]:
for c in ADULT_perf:
    for perf in ADULT_perf[c]:
        ADULT_perf[c][perf] /= 5

In [107]:
ADULT_perf

defaultdict(collections.defaultdict,
            {'SVC': defaultdict(None,
                         {'f1': 0.06393663234712732,
                          'accuracy': 0.76696781684264,
                          'rocauc': 0.5163444741321268}),
             'LogisticRegression': defaultdict(None,
                         {'f1': 0.3875846184434898,
                          'accuracy': 0.2403758934726606,
                          'rocauc': 0.5}),
             'DecisionTreeClassifier': defaultdict(None,
                         {'f1': 0.5972109372973384,
                          'accuracy': 0.8048329160770654,
                          'rocauc': 0.735040193237056}),
             'KNeighborsClassifier': defaultdict(None,
                         {'f1': 0.04544507078235465,
                          'accuracy': 0.7647037480497805,
                          'rocauc': 0.5115960276127225}),
             'RandomForestClassifier': defaultdict(None,
                         {'f1': 0.6629424552048

In [108]:
np.save('./result/ADULT_res', ADULT_res)

# 5 Trials on Bank

In [109]:
BANK_res = defaultdict(list)
BANK_perf = defaultdict(defaultdict)

In [110]:
BANK_df

Unnamed: 0,age,default,balance,housing,loan,duration,campaign,pdays,previous,y,...,16,17,18,19,20,21,22,23,24,25
0,58,0,2143,1,0,261,1,-1,0,0,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,151,1,-1,0,0,...,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,76,1,-1,0,0,...,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,92,1,-1,0,0,...,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,198,1,-1,0,0,...,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,977,3,-1,0,1,...,0,1,0,1,0,0,0,0,0,1
45207,71,0,1729,0,0,456,2,-1,0,1,...,0,0,0,1,0,0,0,0,0,1
45208,72,0,5715,0,0,1127,5,184,3,1,...,1,0,0,1,0,0,0,0,1,0
45209,57,0,668,0,0,508,4,-1,0,0,...,1,0,0,0,1,0,0,0,0,1


In [113]:
for i in range(5):
    for idx, c in enumerate(estimators):
        grid = params[idx]
        clf = sklearn.model_selection.GridSearchCV(estimator=c, param_grid=grid, n_jobs=-1, verbose=6)
        name = c.__class__.__name__
        BANK_train, BANK_test = sklearn.model_selection.train_test_split(BANK_df, train_size=5000)
        if name in ['SVC', 'LogisticRegression', 'KNeighborsClassifier']:
            scaler = sklearn.preprocessing.StandardScaler()
            scaler.fit(BANK_train.loc[:, BANK_train.columns != 'y'])
            clf.fit(scaler.transform(BANK_train.loc[:, BANK_train.columns != 'y']), BANK_train['y'])
        else:
            clf.fit(BANK_train.loc[:, BANK_train.columns != 'y'], BANK_train['y'])
        BANK_res[name].append(clf.cv_results_)
        test_pred = clf.best_estimator_.predict(BANK_test.loc[:, BANK_test.columns != 'y'])
        if not 'f1' in BANK_perf[name]:
            BANK_perf[name]['f1'] = 0
        if not 'accuracy' in BANK_perf[name]:
            BANK_perf[name]['accuracy'] = 0
        if not 'rocauc' in BANK_perf[name]:
            BANK_perf[name]['rocauc'] = 0
        BANK_perf[name]['f1'] += sklearn.metrics.f1_score(BANK_test['y'], test_pred)
        BANK_perf[name]['accuracy'] += sklearn.metrics.accuracy_score(BANK_test['y'], test_pred)
        BANK_perf[name]['rocauc'] += sklearn.metrics.roc_auc_score(BANK_test['y'], test_pred)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 52 candidates, totalling 260 fits
Fitting 5 folds fo

In [114]:
for c in BANK_perf:
    for perf in BANK_perf[c]:
        BANK_perf[c][perf] /= 5

In [115]:
BANK_perf

defaultdict(collections.defaultdict,
            {'SVC': defaultdict(None,
                         {'f1': 0.21423960016964175,
                          'accuracy': 0.17205242346621571,
                          'rocauc': 0.5142499287498472}),
             'LogisticRegression': defaultdict(None,
                         {'f1': 0.2090879168455416,
                          'accuracy': 0.11705254781030068,
                          'rocauc': 0.5001942633415711}),
             'DecisionTreeClassifier': defaultdict(None,
                         {'f1': 0.36744745678715646,
                          'accuracy': 0.8547760563030018,
                          'rocauc': 0.6406246503609238}),
             'KNeighborsClassifier': defaultdict(None,
                         {'f1': 0.1491760388611002,
                          'accuracy': 0.7366839919425033,
                          'rocauc': 0.5122460383213963}),
             'RandomForestClassifier': defaultdict(None,
                         {'

In [116]:
np.save('./result/BANK_res', BANK_res)