In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import naive_bayes
from sklearn import svm


import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None


import pickle
import df_trimmer


In [5]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
df.shape

(24150, 91)

# function for testing

In [6]:
# for key, value in df_dict.items(model_function):
    
#     df = value
#     X = df[0]
#     y = df[1]

def model_trainer(model_function, X, y):
    roc_auc_list = []
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state=42)

    kf = StratifiedKFold(n_splits=5, random_state=42)

    for train_ind, val_ind in kf.split(X_train, y_train):

        X_ktrain, X_kval = X.iloc[train_ind], X.iloc[val_ind]
        y_ktrain, y_kval = y.iloc[train_ind], y.iloc[val_ind]

        ros = RandomOverSampler(random_state=42)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_ktrain, y_ktrain)

        reg = model_function
        reg.fit(X_train_resampled,y_train_resampled)
        y_pred = reg.predict_proba(X_kval)[:,1]
        score = roc_auc_score(y_kval, y_pred)

        roc_auc_list.append(score)

    mean_roc_auc = sum(roc_auc_list)/len(roc_auc_list)

    #print(sum(roc_auc_list)/len(roc_auc_list))
    print(mean_roc_auc)
    return mean_roc_auc

# Logistic

In [7]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]
df_dict = {'all_features':df_trimmer.all_features(df),
           'no_targ_pop':df_trimmer.no_targ_pop(df),
           'no_years':df_trimmer.no_years(df), 
           'no_years_no_target': df_trimmer.no_years_no_target(df),
           'subtype':df_trimmer.conv_subt(df), 'subtype_no_targ':df_trimmer.subt_no_target(df), 
           'subt_no_tar_no_year':df_trimmer.subt_no_targ_no_years(df),
           'class':df_trimmer.conv_class(df), 'class_no_target':df_trimmer.class_no_target(df),
           'type':df_trimmer.conv_type(df), 'type_no_target':df_trimmer.type_no_target(df)}
best_mean_roc_auc = {}
roc_auc_list = []


In [8]:
best_mean_roc = {}

for key, value in df_dict.items():
    
    df = value
    X = df[0]
    y = df[1]
    best_mean_roc[key] = model_trainer(LogisticRegression(), X, y)

0.6259167683387536
0.6248704534789425
0.6379582801257285
0.6443278160165826
0.6002769169748345
0.5725938557208643
0.5933486277198468
0.6018820260385105
0.5739736503552244
0.6059579542159873
0.5865689904118204


In [9]:
best_mean_roc

{'all_features': 0.6259167683387536,
 'no_targ_pop': 0.6248704534789425,
 'no_years': 0.6379582801257285,
 'no_years_no_target': 0.6443278160165826,
 'subtype': 0.6002769169748345,
 'subtype_no_targ': 0.5725938557208643,
 'subt_no_tar_no_year': 0.5933486277198468,
 'class': 0.6018820260385105,
 'class_no_target': 0.5739736503552244,
 'type': 0.6059579542159873,
 'type_no_target': 0.5865689904118204}

### with no years no target

In [10]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]

df = df_trimmer.no_years_no_target(df)
X = df[0]
y = df[1]

# C_range = np.linspace(1,20,50)
C_range = np.linspace(1,2,20)
best_C = 0
best_roc = 0
for C in C_range:
    model_roc = model_trainer(LogisticRegression(C=C), X, y)
    if model_roc > best_roc:
        best_C = C
        best_roc = model_roc
print(best_C, best_roc)

0.6443278160165826
0.6443236482259702
0.6443279691612015
0.6443358315845537
0.644342671034164
0.6443255680860364
0.6443315595413109
0.6443237212451052
0.64432148055841
0.6443263271461331
0.644332840938242
0.6443273325077964
0.6443256317647841
0.6443228751724331
0.6443223832139157
0.6443118389532796
0.6443021555757904
0.6442999363866675
0.6442930678663069
0.644294561083254
1.2105263157894737 0.644342671034164


In [11]:
print(best_C)

1.2105263157894737


# Knn

In [12]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]
df_dict = {'all_features':df_trimmer.all_features(df),
           'no_targ_pop':df_trimmer.no_targ_pop(df),
           'no_years':df_trimmer.no_years(df), 
           'no_years_no_target': df_trimmer.no_years_no_target(df),
           'subtype':df_trimmer.conv_subt(df), 'subtype_no_targ':df_trimmer.subt_no_target(df), 
           'subt_no_tar_no_year':df_trimmer.subt_no_targ_no_years(df),
           'class':df_trimmer.conv_class(df), 'class_no_target':df_trimmer.class_no_target(df),
           'type':df_trimmer.conv_type(df), 'type_no_target':df_trimmer.type_no_target(df)}
best_mean_roc_auc = {}
roc_auc_list = []

In [26]:
best_mean_roc = {}

for key, value in df_dict.items():
    
    df = value
    X = df[0]
    y = df[1]
    best_mean_roc[key] = model_trainer(KNeighborsClassifier(), X, y)

KeyboardInterrupt: 

In [15]:
print(best_mean_roc)

{'all_features': 0.5354161341668107, 'no_targ_pop': 0.5284160872234279, 'no_years': 0.5522638525262046, 'no_years_no_target': 0.5561281759461746, 'subtype': 0.5294825870060366, 'subtype_no_targ': 0.5283930533722117, 'subt_no_tar_no_year': 0.5331259389853924, 'class': 0.5410128334160916, 'class_no_target': 0.5321189301405483, 'type': 0.539497805602971, 'type_no_target': 0.5377030937025655}


In [25]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]

df = df_trimmer.no_years_no_target(df)
X = df[0]
y = df[1]

# C_range = np.linspace(1,20,50)
K_range = np.linspace(2,12,10)
best_K = 0
best_roc = 0
for K in range(12,16):
    model_roc = model_trainer(KNeighborsClassifier(n_neighbors=K), X, y)
    if model_roc > best_roc:
        best_K = K
        best_roc = model_roc
    print(K)
print(best_K, best_roc)

0.5691029524588626
12
0.5718331510788159
13
0.5732292842549969
14
0.5755092751995405
15
15 0.5755092751995405


# Naive-Bayes Bernouli

In [29]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]
df_dict = {'all_features':df_trimmer.all_features(df),
           'no_targ_pop':df_trimmer.no_targ_pop(df),
           'no_years':df_trimmer.no_years(df), 
           'no_years_no_target': df_trimmer.no_years_no_target(df),
           'subtype':df_trimmer.conv_subt(df), 'subtype_no_targ':df_trimmer.subt_no_target(df), 
           'subt_no_tar_no_year':df_trimmer.subt_no_targ_no_years(df),
           'class':df_trimmer.conv_class(df), 'class_no_target':df_trimmer.class_no_target(df),
           'type':df_trimmer.conv_type(df), 'type_no_target':df_trimmer.type_no_target(df)}
best_mean_roc_auc = {}
roc_auc_list = []

In [30]:
best_mean_roc = {}

for key, value in df_dict.items():
    
    df = value
    X = df[0]
    y = df[1]
    best_mean_roc[key] = model_trainer(naive_bayes.BernoulliNB(), X, y)

0.6056066876918941
0.605280110219917
0.6254983141718499
0.6266309993864152
0.5933795318582685
0.5699823964756435
0.5917707345050085
0.5988526098424409
0.5724899123976942
0.60524009066357
0.5854191628758054


In [31]:
print(best_mean_roc)

{'all_features': 0.6056066876918941, 'no_targ_pop': 0.605280110219917, 'no_years': 0.6254983141718499, 'no_years_no_target': 0.6266309993864152, 'subtype': 0.5933795318582685, 'subtype_no_targ': 0.5699823964756435, 'subt_no_tar_no_year': 0.5917707345050085, 'class': 0.5988526098424409, 'class_no_target': 0.5724899123976942, 'type': 0.60524009066357, 'type_no_target': 0.5854191628758054}


# Random Forest

In [32]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]
df_dict = {'all_features':df_trimmer.all_features(df),
           'no_targ_pop':df_trimmer.no_targ_pop(df),
           'no_years':df_trimmer.no_years(df), 
           'no_years_no_target': df_trimmer.no_years_no_target(df),
           'subtype':df_trimmer.conv_subt(df), 'subtype_no_targ':df_trimmer.subt_no_target(df), 
           'subt_no_tar_no_year':df_trimmer.subt_no_targ_no_years(df),
           'class':df_trimmer.conv_class(df), 'class_no_target':df_trimmer.class_no_target(df),
           'type':df_trimmer.conv_type(df), 'type_no_target':df_trimmer.type_no_target(df)}
best_mean_roc_auc = {}
roc_auc_list = []

In [41]:
best_mean_roc = {}
for depth in range(2,10):
    for key, value in df_dict.items():

        df = value
        X = df[0]
        y = df[1]
        dict_str = str(key)+str(depth)
        best_mean_roc[dict_str] = [model_trainer(RandomForestClassifier(max_depth=depth), X, y)]

0.5966010126956296
0.6015327858834154
0.6045550967607431
0.5995798013870897
0.5971718028835198
0.5500250765312408
0.5753479634812726
0.5935199238270876
0.5642603569418706
0.5998086858506093
0.57779522038374
0.6039068199273888
0.6043522445381673
0.6130081790817146
0.6100228425156236
0.6005477844922369
0.5611847189111394
0.5809431970277409
0.5990630240075212
0.571076397493634
0.6046760088811979
0.5848430272036687
0.6128204912183601
0.6090729762729976
0.6130541303283235
0.6128600336290108
0.6007288374226545
0.5674435927571201
0.5847148813312997
0.5984442738824575
0.5727707893022772
0.603676727764973
0.5891841275753886
0.6077474444046146
0.6137424218460538
0.6112186556272787
0.6175793584549873
0.591658281929837
0.5632007048051879
0.5894793434924857
0.596993605393153
0.5738205831793506
0.6036389751436106
0.5889233704491584
0.6186334131408536
0.6146041015955326
0.6165266037118019
0.6213273679485929
0.6018979517751786
0.5619975371864883
0.5914563501031935
0.5975668028867339
0.5721585316852453

In [42]:
print(best_mean_roc)

{'all_features2': [0.5966010126956296], 'no_targ_pop2': [0.6015327858834154], 'no_years2': [0.6045550967607431], 'no_years_no_target2': [0.5995798013870897], 'subtype2': [0.5971718028835198], 'subtype_no_targ2': [0.5500250765312408], 'subt_no_tar_no_year2': [0.5753479634812726], 'class2': [0.5935199238270876], 'class_no_target2': [0.5642603569418706], 'type2': [0.5998086858506093], 'type_no_target2': [0.57779522038374], 'all_features3': [0.6039068199273888], 'no_targ_pop3': [0.6043522445381673], 'no_years3': [0.6130081790817146], 'no_years_no_target3': [0.6100228425156236], 'subtype3': [0.6005477844922369], 'subtype_no_targ3': [0.5611847189111394], 'subt_no_tar_no_year3': [0.5809431970277409], 'class3': [0.5990630240075212], 'class_no_target3': [0.571076397493634], 'type3': [0.6046760088811979], 'type_no_target3': [0.5848430272036687], 'all_features4': [0.6128204912183601], 'no_targ_pop4': [0.6090729762729976], 'no_years4': [0.6130541303283235], 'no_years_no_target4': [0.61286003362901

In [45]:
max_rf = 0
max_dep_comb = 0
for key, value in best_mean_roc.items():
    if best_mean_roc[key][0] > max_rf:
        max_rf = value
        max_dep_comb = key
print(max_rf, max_dep_comb)

[0.6275627952126596] no_years_no_target8


# SVM

In [51]:
def model_trainer_for_svm(model_function, X, y):
    roc_auc_list = []
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state=42)

    kf = StratifiedKFold(n_splits=5, random_state=42)

    for train_ind, val_ind in kf.split(X_train, y_train):

        X_ktrain, X_kval = X.iloc[train_ind], X.iloc[val_ind]
        y_ktrain, y_kval = y.iloc[train_ind], y.iloc[val_ind]

        ros = RandomOverSampler(random_state=42)
        X_train_resampled, y_train_resampled = ros.fit_resample(X_ktrain, y_ktrain)

        reg = model_function
        reg.fit(X_train_resampled,y_train_resampled)
        y_pred = reg.predict(X_kval)
        score = roc_auc_score(y_kval, y_pred)

        roc_auc_list.append(score)

    mean_roc_auc = sum(roc_auc_list)/len(roc_auc_list)

    #print(sum(roc_auc_list)/len(roc_auc_list))
    print(mean_roc_auc)
    return mean_roc_auc

In [52]:
df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df[mask]
df_dict = {'all_features':df_trimmer.all_features(df),
           'no_targ_pop':df_trimmer.no_targ_pop(df),
           'no_years':df_trimmer.no_years(df), 
           'no_years_no_target': df_trimmer.no_years_no_target(df),
           'subtype':df_trimmer.conv_subt(df), 'subtype_no_targ':df_trimmer.subt_no_target(df), 
           'subt_no_tar_no_year':df_trimmer.subt_no_targ_no_years(df),
           'class':df_trimmer.conv_class(df), 'class_no_target':df_trimmer.class_no_target(df),
           'type':df_trimmer.conv_type(df), 'type_no_target':df_trimmer.type_no_target(df)}
best_mean_roc_auc = {}
roc_auc_list = []

In [54]:
best_mean_roc = {}

df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df_trimmer.no_years_no_target(df)
X = df[0]
y = df[1]
best_mean_roc[key] = model_trainer_for_svm(svm.SVC(kernel='linear'), X, y)

0.5810982612300724


In [55]:
best_mean_roc = {}

df = pickle.load(open('../data/pickles/df_full.pickle', 'rb'))
mask =[feature  for feature in list(df) if df[feature].sum() > 50]
df = df_trimmer.no_years_no_target(df)
X = df[0]
y = df[1]
best_mean_roc[key] = model_trainer_for_svm(svm.SVC(kernel='rbf', gamma=.001), X, y)

0.5669331200423507
