In [14]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

In [2]:
data_directory = os.path.join(os.getcwd(), 'dataset_diabetes')
diabetes = pd.read_csv(os.path.join(data_directory, 'diabetic_data.csv'))


In [3]:
diabetes.head(20)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),?,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),?,3,1,2,4,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),?,1,1,7,5,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),?,2,1,4,13,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),?,3,3,4,12,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
diabetes1 = diabetes.copy()
diabetes1.head(20)
diabetes1.readmitted = diabetes1.readmitted.map({ 'NO' : 0, '<30' : 1, '>30': 0})
diabetes1.age = diabetes1.age.map({ '[0-10)' : "0-30",'[10-20)': '0-30', "[20-30)": '0-30',"[30-40)":'30-60', "[40-50)": '30-60', "[50-60)": '30-60', "[60-70)": '60-100', "[70-80)": '60-100', "[80-90)": '60-100', "[90-100)": '60-100'})


In [5]:
#removing 'examide' and 'citoglipton' because they are not informative
diabetes_2 = diabetes1.drop(columns=['examide', 'citoglipton', 'medical_specialty', 'number_outpatient','number_inpatient','payer_code','weight', 'encounter_id', 'patient_nbr'])

In [6]:
admission_dict = {1: 'Emergency', 2: 'Urgent', 3: 'Elective', 4: 'Newborn', 7: 'Trauma Center'}
def map_admission(x):
    if x in [5,6,8]:
        return 'Not Available'
    else:
        return admission_dict[x]

    
diabetes_2['admission_type_processed'] = diabetes_2.apply(lambda x: map_admission(x['admission_type_id']), axis=1)

def discharged_home(x):
    if x == 1:
        return 1 # Discharged home
    else:
        return 0 # Other

diabetes_2['discharged_processed'] = diabetes_2.apply(lambda x: discharged_home(x['discharge_disposition_id']), axis=1)

def map_source(x):
    if x in [1,2,3]:
        return 'Referral' 
    elif x == 7:
        return 'Emergency Room'
    else:
        return 'Other'
    
diabetes_2['source_processed'] = diabetes_2.apply(lambda x: map_source(x['admission_source_id']), axis=1)


def map_diag(x):
    if x[0].isdigit() == False:
        return 'other'
    
    float_x = float(x)
    if (float_x >= 390 and float_x <= 459) or float_x == 785: # 390–459, 785
        return 'circulatory'
    elif (float_x >= 460 and float_x <= 519) or float_x == 786: # 460–519, 786
        return 'respiratory'
    elif (float_x >= 520 and float_x <= 579) or float_x == 787: # 520–579, 787
        return 'digestive'
    elif float_x >= 250 and float_x < 251: # 250.xx
        return 'diabetes'
    elif float_x >= 800 and float_x <= 999: # 800–999
        return 'injury'
    elif float_x >= 710 and float_x <= 739: # 710–739
        return 'musculoskeletal'
    elif (float_x >= 580 and float_x <= 629) or float_x == 788: # 580–629, 788
        return 'genitourinary'
    elif float_x >= 140 and float_x <= 239: # 140–239
        return 'neoplasms'
    else:
        return 'other'

diabetes_2['diag1_processed'] = diabetes_2.apply(lambda x: map_diag(x['diag_1']), axis=1)
diabetes_2['diag2_processed'] = diabetes_2.apply(lambda x: map_diag(x['diag_2']), axis=1)
diabetes_2['diag3_processed'] = diabetes_2.apply(lambda x: map_diag(x['diag_3']), axis = 1)
                                            



In [7]:
def binarize_yn(x):
    if x in ['Yes', 'Ch']: 
        return 1 
    else:
        return 0
diabetes_2['change_processed'] = diabetes_2.apply(lambda x: binarize_yn(x['change']), axis=1)

diabetes_2['diabetesMed_processed'] = diabetes_2.apply(lambda x: binarize_yn(x['diabetesMed']), axis=1)

In [8]:
diabetes_2.head()
for col in diabetes_2.columns: 
    print(col)
#df_cat = diabetes_2[['race', 'gender', 'age', 'admission_type_id','discharged_processed', 'change_processed', 'diag1_processed''diag2_processed'

df_cat = diabetes_2[['admission_type_processed', 'source_processed',
            'diag1_processed', 'diag2_processed', 'diag3_processed', 'race', 'gender', 'age', 
            'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 
            'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 
            'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 
            'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 
            'glimepiride-pioglitazone', 'metformin-rosiglitazone','metformin-pioglitazone', 'readmitted']]                     

race
gender
age
admission_type_id
discharge_disposition_id
admission_source_id
time_in_hospital
num_lab_procedures
num_procedures
num_medications
number_emergency
diag_1
diag_2
diag_3
number_diagnoses
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
acetohexamide
glipizide
glyburide
tolbutamide
pioglitazone
rosiglitazone
acarbose
miglitol
troglitazone
tolazamide
insulin
glyburide-metformin
glipizide-metformin
glimepiride-pioglitazone
metformin-rosiglitazone
metformin-pioglitazone
change
diabetesMed
readmitted
admission_type_processed
discharged_processed
source_processed
diag1_processed
diag2_processed
diag3_processed
change_processed
diabetesMed_processed


In [9]:
for col in df_cat.columns:
    print(col)

admission_type_processed
source_processed
diag1_processed
diag2_processed
diag3_processed
race
gender
age
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
acetohexamide
glipizide
glyburide
tolbutamide
pioglitazone
rosiglitazone
acarbose
miglitol
troglitazone
tolazamide
insulin
glyburide-metformin
glipizide-metformin
glimepiride-pioglitazone
metformin-rosiglitazone
metformin-pioglitazone
readmitted


In [10]:
#diabetes_2["age"] = pd.to_numeric(diabetes_2["age"])
diabetes_1H = pd.get_dummies(df_cat, drop_first=False)
for col in diabetes_1H.columns: 
    print(col) 
pd.set_option('max_columns', None)
diabetes_1H.head()

readmitted
admission_type_processed_Elective
admission_type_processed_Emergency
admission_type_processed_Newborn
admission_type_processed_Not Available
admission_type_processed_Trauma Center
admission_type_processed_Urgent
source_processed_Emergency Room
source_processed_Other
source_processed_Referral
diag1_processed_circulatory
diag1_processed_diabetes
diag1_processed_digestive
diag1_processed_genitourinary
diag1_processed_injury
diag1_processed_musculoskeletal
diag1_processed_neoplasms
diag1_processed_other
diag1_processed_respiratory
diag2_processed_circulatory
diag2_processed_diabetes
diag2_processed_digestive
diag2_processed_genitourinary
diag2_processed_injury
diag2_processed_musculoskeletal
diag2_processed_neoplasms
diag2_processed_other
diag2_processed_respiratory
diag3_processed_circulatory
diag3_processed_diabetes
diag3_processed_digestive
diag3_processed_genitourinary
diag3_processed_injury
diag3_processed_musculoskeletal
diag3_processed_neoplasms
diag3_processed_other
diag

Unnamed: 0,readmitted,admission_type_processed_Elective,admission_type_processed_Emergency,admission_type_processed_Newborn,admission_type_processed_Not Available,admission_type_processed_Trauma Center,admission_type_processed_Urgent,source_processed_Emergency Room,source_processed_Other,source_processed_Referral,diag1_processed_circulatory,diag1_processed_diabetes,diag1_processed_digestive,diag1_processed_genitourinary,diag1_processed_injury,diag1_processed_musculoskeletal,diag1_processed_neoplasms,diag1_processed_other,diag1_processed_respiratory,diag2_processed_circulatory,diag2_processed_diabetes,diag2_processed_digestive,diag2_processed_genitourinary,diag2_processed_injury,diag2_processed_musculoskeletal,diag2_processed_neoplasms,diag2_processed_other,diag2_processed_respiratory,diag3_processed_circulatory,diag3_processed_diabetes,diag3_processed_digestive,diag3_processed_genitourinary,diag3_processed_injury,diag3_processed_musculoskeletal,diag3_processed_neoplasms,diag3_processed_other,diag3_processed_respiratory,race_?,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_0-30,age_30-60,age_60-100,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,metformin_Down,metformin_No,metformin_Steady,metformin_Up,repaglinide_Down,repaglinide_No,repaglinide_Steady,repaglinide_Up,nateglinide_Down,nateglinide_No,nateglinide_Steady,nateglinide_Up,chlorpropamide_Down,chlorpropamide_No,chlorpropamide_Steady,chlorpropamide_Up,glimepiride_Down,glimepiride_No,glimepiride_Steady,glimepiride_Up,acetohexamide_No,acetohexamide_Steady,glipizide_Down,glipizide_No,glipizide_Steady,glipizide_Up,glyburide_Down,glyburide_No,glyburide_Steady,glyburide_Up,tolbutamide_No,tolbutamide_Steady,pioglitazone_Down,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_Down,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,acarbose_Down,acarbose_No,acarbose_Steady,acarbose_Up,miglitol_Down,miglitol_No,miglitol_Steady,miglitol_Up,troglitazone_No,troglitazone_Steady,tolazamide_No,tolazamide_Steady,tolazamide_Up,insulin_Down,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady
0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0
1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0
2,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0
3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,1,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0


In [11]:
train, test = np.split(diabetes_1H.sample(frac=1), [int(8.*len(diabetes_1H))])

In [12]:
Xtr = train.drop(['readmitted'], axis=1)
Ytr = train['readmitted']
Xte = test.drop(['readmitted'], axis=1)
Yte = test['readmitted']
#Xval= validate.drop(['readmitted'], axis=1)
#Yval = validate['readmitted']
Xtr.head()

Unnamed: 0,admission_type_processed_Elective,admission_type_processed_Emergency,admission_type_processed_Newborn,admission_type_processed_Not Available,admission_type_processed_Trauma Center,admission_type_processed_Urgent,source_processed_Emergency Room,source_processed_Other,source_processed_Referral,diag1_processed_circulatory,diag1_processed_diabetes,diag1_processed_digestive,diag1_processed_genitourinary,diag1_processed_injury,diag1_processed_musculoskeletal,diag1_processed_neoplasms,diag1_processed_other,diag1_processed_respiratory,diag2_processed_circulatory,diag2_processed_diabetes,diag2_processed_digestive,diag2_processed_genitourinary,diag2_processed_injury,diag2_processed_musculoskeletal,diag2_processed_neoplasms,diag2_processed_other,diag2_processed_respiratory,diag3_processed_circulatory,diag3_processed_diabetes,diag3_processed_digestive,diag3_processed_genitourinary,diag3_processed_injury,diag3_processed_musculoskeletal,diag3_processed_neoplasms,diag3_processed_other,diag3_processed_respiratory,race_?,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_0-30,age_30-60,age_60-100,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,metformin_Down,metformin_No,metformin_Steady,metformin_Up,repaglinide_Down,repaglinide_No,repaglinide_Steady,repaglinide_Up,nateglinide_Down,nateglinide_No,nateglinide_Steady,nateglinide_Up,chlorpropamide_Down,chlorpropamide_No,chlorpropamide_Steady,chlorpropamide_Up,glimepiride_Down,glimepiride_No,glimepiride_Steady,glimepiride_Up,acetohexamide_No,acetohexamide_Steady,glipizide_Down,glipizide_No,glipizide_Steady,glipizide_Up,glyburide_Down,glyburide_No,glyburide_Steady,glyburide_Up,tolbutamide_No,tolbutamide_Steady,pioglitazone_Down,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_Down,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,acarbose_Down,acarbose_No,acarbose_Steady,acarbose_Up,miglitol_Down,miglitol_No,miglitol_Steady,miglitol_Up,troglitazone_No,troglitazone_Steady,tolazamide_No,tolazamide_Steady,tolazamide_Up,insulin_Down,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady,glimepiride-pioglitazone_No,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_No,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady
52993,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0
12762,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0
50391,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0
55756,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0
96884,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0


In [None]:
smote = SMOTE(random_state=0)
Xtr_smote, Ytr_smote = smote.fit_sample(Xtr, Ytr)

In [None]:
Ytr_smote.value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
#Balanced class weight, penalty = l1
reg = LogisticRegression(fit_intercept = True,class_weight= 'balanced', penalty = 'l1', solver='liblinear').fit(Xtr, Ytr)

In [None]:
print(cross_val_score(reg, Xtr, Ytr, cv = 4, scoring = 'roc_auc'))

In [None]:
#Balanced class weights, penalty = l2
reg_l2_balanced = LogisticRegression(fit_intercept = True,class_weight= 'balanced').fit(Xtr, Ytr)
print(cross_val_score(reg_l2_balanced, Xtr, Ytr, cv = 4, scoring = 'roc_auc'))

In [None]:
#Non-balanced classes, l1 penalty
reg_l1_nb = LogisticRegression(fit_intercept = True, penalty = 'l1', solver='liblinear').fit(Xtr, Ytr)
print(cross_val_score(reg_l1_nb, Xtr, Ytr, cv = 4, scoring = 'roc_auc'))

In [None]:
#Non-balanced classes, l1 penalty, using SMOTE
reg_l1_nb_smote = LogisticRegression(fit_intercept = True, penalty = 'l1', solver='liblinear').fit(Xtr_smote, Ytr_smote)
print(cross_val_score(reg_l1_nb_smote, Xtr, Ytr, cv = 4, scoring = 'roc_auc'))

In [None]:
#l2, non-balanced
reg_l2_nb = LogisticRegression(fit_intercept = True).fit(Xtr, Ytr)
print(cross_val_score(reg_l2_nb, Xtr, Ytr, cv = 4, scoring = 'roc_auc'))

In [None]:

model = Pipeline([
        ('sampling', SMOTE()),
        ('model', LogisticRegression())
    ])
model.get_params().keys()

In [None]:
#This is using oversampled data
penalty = ['l1', 'l2']
c = np.logspace(0, 3, 10)
print(c), 
#hyperp = dict(C = c, penalty = penalty) 
param_grid = [
  {'model__penalty': ['l1'], 'model__solver': ['liblinear'], 'model__C' : c, 'model__class_weight': ['balanced', None]},
  {'model__penalty': ['l2'], 'model__solver': ['lbfgs'], 'model__class_weight': ['balanced', None], 'model__C' : c},]
print(hyperp)
clf = GridSearchCV(model, param_grid, cv = 3, verbose = 5, scoring= 'roc_auc')
best_model = clf.fit(Xtr, Ytr)
best_model.best_estimator_.get_params()['penalty']

In [20]:
#Multi-layer perceptron
from sklearn.neural_network import MLPClassifier
model = Pipeline([                              #('sampling', SMOTE()),
        ('model', MLPClassifier())
    ])
param_grid={
'model__learning_rate': ["constant"],
'model__hidden_layer_sizes': [(40, 20), (25, 20), (30, 25, 15)], #, (30, 10), (20,20,20), (25, 20, 15), (40, 25, 10)
'model__alpha': [.015], #between .01 and .5 is best. .1 seemed best after trying .01 to 1, started getting worse at .5 
'model__activation': ["tanh"]
}

clf = GridSearchCV(model, param_grid, cv = 3, verbose = 5, scoring= 'roc_auc')
best_model = clf.fit(Xtr, Ytr)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(40, 20), model__learning_rate=constant 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(40, 20), model__learning_rate=constant, score=0.549, total= 2.2min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(40, 20), model__learning_rate=constant 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s


[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(40, 20), model__learning_rate=constant, score=0.541, total= 1.8min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(40, 20), model__learning_rate=constant 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.0min remaining:    0.0s


[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(40, 20), model__learning_rate=constant, score=0.549, total= 2.5min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(25, 20), model__learning_rate=constant 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  6.5min remaining:    0.0s


[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(25, 20), model__learning_rate=constant, score=0.544, total= 1.3min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(25, 20), model__learning_rate=constant 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.8min remaining:    0.0s


[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(25, 20), model__learning_rate=constant, score=0.546, total= 1.0min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(25, 20), model__learning_rate=constant 




[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(25, 20), model__learning_rate=constant, score=0.559, total=  24.5s
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(30, 25, 15), model__learning_rate=constant 




[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(30, 25, 15), model__learning_rate=constant, score=0.540, total= 3.0min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(30, 25, 15), model__learning_rate=constant 
[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(30, 25, 15), model__learning_rate=constant, score=0.545, total= 2.8min
[CV] model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(30, 25, 15), model__learning_rate=constant 
[CV]  model__activation=tanh, model__alpha=0.015, model__hidden_layer_sizes=(30, 25, 15), model__learning_rate=constant, score=0.551, total= 2.4min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 17.4min finished


In [None]:
clf = MLPClassifier(random_state=1, max_iter=300).fit(Xtr, Ytr)
print(cross_val_score(clf, Xtr, Ytr, cv = 4, scoring = 'roc_auc'))