In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder

from collections import Counter

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
random_state = 42

In [4]:
directory = os.path.join(os.getcwd(), 'dataset_diabetes')
training_set = pd.read_csv(os.path.join(directory, 'diabetic_data_train.csv'))
df = training_set.copy()

In [5]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,81844290,94788,Caucasian,Female,[70-80),?,1,1,7,4,?,InternalMedicine,48,0,11,0,0,0,276,402,428,9,,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,396159158,135023315,Caucasian,Male,[50-60),?,1,1,7,1,BC,?,42,0,5,0,0,0,427,250,278,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,>30
2,31258956,18397782,Caucasian,Male,[80-90),?,1,1,7,4,?,?,44,0,10,0,0,0,599,788,599,7,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,210691074,67509558,Caucasian,Male,[80-90),?,1,3,7,3,MC,?,54,0,8,0,0,0,331,309,331,8,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
4,104902980,23272362,AfricanAmerican,Female,[70-80),?,1,11,7,11,MC,Nephrology,35,3,23,0,0,1,38,486,403,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO


In [7]:
df.drop(['encounter_id', 'patient_nbr', 'payer_code', 'weight'], axis=1, inplace=True)

In [8]:
for col in df.columns:
  print(df.groupby(col)[col].value_counts())

race             race           
?                ?                   1824
AfricanAmerican  AfricanAmerican    15325
Asian            Asian                517
Caucasian        Caucasian          60871
Hispanic         Hispanic            1650
Other            Other               1225
Name: race, dtype: int64
gender           gender         
Female           Female             43740
Male             Male               37670
Unknown/Invalid  Unknown/Invalid        2
Name: gender, dtype: int64
age       age     
[0-10)    [0-10)        131
[10-20)   [10-20)       551
[20-30)   [20-30)      1335
[30-40)   [30-40)      3030
[40-50)   [40-50)      7810
[50-60)   [50-60)     13741
[60-70)   [60-70)     17959
[70-80)   [70-80)     20879
[80-90)   [80-90)     13737
[90-100)  [90-100)     2239
Name: age, dtype: int64
admission_type_id  admission_type_id
1                  1                    43218
2                  2                    14845
3                  3                    15052
4     

In [9]:
df.drop(['examide', 'citoglipton'], axis=1, inplace=True)

In [10]:
def binarize_readmitted(x):
    if x == '<30': # readmitted in less than 30 days is positive class
        return 1 
    else:
        return 0 

In [11]:
y = df.apply(lambda x: binarize_readmitted(x['readmitted']), axis=1).values

In [12]:
def map_age(x):
    if x == '[0-10)':
        return 5
    elif x == '[10-20)':
        return 15
    elif x == '[20-30)':
        return 25
    elif x == '[30-40)':
        return 35
    elif x == '[40-50)':
        return 45
    elif x == '[50-60)':
        return 55
    elif x == '[60-70)':
        return 65
    elif x == '[70-80)':
        return 75
    elif x == '[80-90)':
        return 85
    else:
        return 95

In [13]:
df['age_processed'] = df.apply(lambda x: map_age(x['age']), axis=1)

In [14]:
admission_dict = {1: 'Emergency', 2: 'Urgent', 3: 'Elective', 4: 'Newborn', 7: 'Trauma Center'}
def map_admission(x):
    if x in [5,6,8]:
        return 'Not Available'
    else:
        return admission_dict[x]

In [15]:
df['admission_type_processed'] = df.apply(lambda x: map_admission(x['admission_type_id']), axis=1)

In [16]:
def discharged_home(x):
    if x == 1:
        return 1 # Discharged home
    else:
        return 0 # Other 

In [17]:
df['discharged_processed'] = df.apply(lambda x: discharged_home(x['discharge_disposition_id']), axis=1)

In [18]:
def map_source(x):
    if x in [1,2,3]:
        return 'Referral' 
    elif x == 7:
        return 'Emergency Room'
    else:
        return 'Other'

In [19]:
df['source_processed'] = df.apply(lambda x: map_source(x['admission_source_id']), axis=1)

In [20]:
def map_diag(x):
    if x[0].isdigit() == False:
        return 'other'
    
    float_x = float(x)
    if (float_x >= 390 and float_x <= 459) or float_x == 785: # 390–459, 785
        return 'circulatory'
    elif (float_x >= 460 and float_x <= 519) or float_x == 786: # 460–519, 786
        return 'respiratory'
    elif (float_x >= 520 and float_x <= 579) or float_x == 787: # 520–579, 787
        return 'digestive'
    elif float_x >= 250 and float_x < 251: # 250.xx
        return 'diabetes'
    elif float_x >= 800 and float_x <= 999: # 800–999
        return 'injury'
    elif float_x >= 710 and float_x <= 739: # 710–739
        return 'musculoskeletal'
    elif (float_x >= 580 and float_x <= 629) or float_x == 788: # 580–629, 788
        return 'genitourinary'
    elif float_x >= 140 and float_x <= 239: # 140–239
        return 'neoplasms'
    else:
        return 'other'

In [21]:
df['diag1_processed'] = df.apply(lambda x: map_diag(x['diag_1']), axis=1)
df['diag2_processed'] = df.apply(lambda x: map_diag(x['diag_2']), axis=1)
df['diag3_processed'] = df.apply(lambda x: map_diag(x['diag_3']), axis=1)

In [22]:
df.groupby('medical_specialty')['medical_specialty'].value_counts()

medical_specialty                     medical_specialty                   
?                                     ?                                       39969
AllergyandImmunology                  AllergyandImmunology                        4
Anesthesiology                        Anesthesiology                              6
Anesthesiology-Pediatric              Anesthesiology-Pediatric                   13
Cardiology                            Cardiology                               4283
Cardiology-Pediatric                  Cardiology-Pediatric                        5
DCPTEAM                               DCPTEAM                                     4
Dentistry                             Dentistry                                   4
Dermatology                           Dermatology                                 1
Emergency/Trauma                      Emergency/Trauma                         6043
Endocrinology                         Endocrinology                              97
E

In [23]:
def map_medical_specialty(x):
  if 'Anesthesiology' in x:
    return 'Anesthesiology'
  elif 'Cardiology' in x:
    return 'Cardiology'
  elif 'Endocrinology' in x:
    return 'Endocrinology'
  elif 'Hematology' in x:
    return 'Hematology'
  elif 'Obstetrics' in x:
    return 'Obstetrics'
  elif 'Orthopedics' in x:
    return 'Orthopedics'
  elif 'Pediatrics' in x:
    return 'Pediatrics'
  elif 'Psychiatry' in x:
    return 'Psychiatry'
  elif 'Radiolog' in x:
    return 'Radiology'
  elif 'Surg' in x:
    return 'Surgery'
  else:
    return x

In [24]:
df['medical_specialty_processed'] = df.apply(lambda x: map_medical_specialty(x['medical_specialty']), axis=1)

In [25]:
def binarize_yn(x):
    if x in ['Yes', 'Ch']: 
        return 1 
    else:
        return 0 

In [26]:
df['change_processed'] = df.apply(lambda x: binarize_yn(x['change']), axis=1)

In [27]:
df['diabetesMed_processed'] = df.apply(lambda x: binarize_yn(x['diabetesMed']), axis=1)

In [28]:
ohe = OneHotEncoder(categories='auto')

In [29]:
df_cat = df[['admission_type_processed', 'source_processed', 'medical_specialty_processed',
            'diag1_processed', 'diag2_processed', 'diag3_processed', 'race', 'gender', 
            'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 
            'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 
            'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 
            'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 
            'glimepiride-pioglitazone', 'metformin-rosiglitazone','metformin-pioglitazone']]

In [30]:
ohe_arr = ohe.fit_transform(df_cat).toarray()

In [31]:
ohe_labels = ohe.categories_

In [32]:
feature_labels = []
for col, values in zip(df_cat.columns, ohe_labels):
    for val in values:
        feature_labels.append(col+'_'+val)

In [33]:
df_ohe = pd.DataFrame(ohe_arr, columns = feature_labels)

In [34]:
df_num = df[['age_processed', 'discharged_processed', 'change_processed', 'diabetesMed_processed', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']]

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_num_scaled = scaler.fit_transform(df_num.astype(np.float64))

In [36]:
df_num_scaled = pd.DataFrame(df_num_scaled, index=df_num.index, columns=df_num.columns)

In [37]:
X = pd.concat([df_ohe, df_num_scaled], axis=1)

In [38]:
X.shape, y.shape

((81412, 177), (81412,))

In [39]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier

In [40]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline



In [41]:
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.5)

In [42]:
Counter(y)

Counter({0: 72340, 1: 9072})

In [43]:
X_, y_ = over.fit_resample(X, y)



In [44]:
Counter(y_)

Counter({0: 72340, 1: 36170})

In [45]:
under = RandomUnderSampler(sampling_strategy=0.6)

In [46]:
_X, _y = under.fit_resample(X_, y_)



In [47]:
Counter(_y)

Counter({0: 60283, 1: 36170})

In [48]:
sgd_clf = SGDClassifier()

In [49]:
cross_val_score(sgd_clf, X, y, cv=3, scoring='accuracy') # original

array([0.88856953, 0.88856543, 0.88856543])

In [50]:
cross_val_score(sgd_clf, X_, y_, cv=3, scoring='accuracy') # oversampled minority class

array([0.67777163, 0.67475809, 0.67846281])

In [51]:
cross_val_score(sgd_clf, _X, _y, cv=3, scoring='accuracy') # undersampled majority + oversampled minority

array([0.64837797, 0.64495661, 0.64365028])

In [52]:
cross_val_score(sgd_clf, X, y, cv=3, scoring='roc_auc') # original 

array([0.5486495 , 0.52585486, 0.56378633])

In [53]:
cross_val_score(sgd_clf, X_, y_, cv=3, scoring='roc_auc') # oversampled minority class

array([0.64379412, 0.65526393, 0.65267967])

In [54]:
cross_val_score(sgd_clf, _X, _y, cv=3, scoring='roc_auc') # under major + over minor

array([0.63947867, 0.63983607, 0.65534625])

In [55]:
cross_val_score(sgd_clf, X, y, cv=3, scoring='recall') # original 

array([0.01455026, 0.        , 0.00033069])

In [56]:
cross_val_score(sgd_clf, X_, y_, cv=3, scoring='recall') # over minor 

array([0.04346384, 0.05291532, 0.07398192])

In [57]:
cross_val_score(sgd_clf, _X, _y, cv=3, scoring='recall') # under major + over minor 

array([0.1117286 , 0.11246579, 0.13751348])

In [58]:
cross_val_score(sgd_clf, X, y, cv=3, scoring='f1_weighted') # original 

array([0.83610492, 0.83613573, 0.83613573])

In [59]:
cross_val_score(sgd_clf, X_, y_, cv=3, scoring='f1_weighted') # over minor 

array([0.59118438, 0.57099142, 0.61106027])

In [60]:
cross_val_score(sgd_clf, _X, _y, cv=3, scoring='f1_weighted') # under major + over minor 

array([0.56847199, 0.57253644, 0.56139416])

In [61]:
over = SMOTE(sampling_strategy=0.75)

In [62]:
X__,  y__ = over.fit_resample(X, y)



In [63]:
Counter(y__)

Counter({0: 72340, 1: 54255})

In [64]:
cross_val_score(sgd_clf, X__, y__, cv=3, scoring='roc_auc')

array([0.64936189, 0.66974994, 0.66253392])

In [65]:
over = SMOTE(sampling_strategy=0.9)

In [66]:
_X__,  _y__ = over.fit_resample(X, y)



In [67]:
cross_val_score(sgd_clf, _X__, _y__, cv=3, scoring='roc_auc')

array([0.64693372, 0.66628139, 0.66517152])

In [68]:
cross_val_score(sgd_clf, _X__, _y__, cv=3, scoring='recall')

array([0.60459865, 0.44673302, 0.43272509])

In [69]:
cross_val_score(sgd_clf, _X__, _y__, cv=3, scoring='f1_weighted')

array([0.60281533, 0.61608688, 0.60654061])

In [70]:
cross_val_score(sgd_clf, X__, y__, cv=3, scoring='recall')

array([0.43787669, 0.45247443, 0.34553497])

In [71]:
cross_val_score(sgd_clf, X__, y__, cv=3, scoring='f1_weighted')

array([0.60746427, 0.57114531, 0.61885063])

In [72]:
# How about dramatically oversampling the minority class? 
over = SMOTE(sampling_strategy=1)

In [73]:
dramatic_X, dramatic_y = over.fit_resample(X, y)



In [74]:
Counter(dramatic_y)

Counter({0: 72340, 1: 72340})

In [75]:
cross_val_score(sgd_clf, dramatic_X, dramatic_y, cv=3, scoring='roc_auc')

array([0.6553182 , 0.66897189, 0.66408651])

In [76]:
cross_val_score(sgd_clf, X__, y__, cv=3, scoring='recall')

array([0.38269284, 0.1532762 , 0.2656345 ])

In [77]:
cross_val_score(sgd_clf, X__, y__, cv=3, scoring='f1_weighted')

array([0.58341832, 0.61048563, 0.54614078])

In [78]:
from sklearn.linear_model import LogisticRegression

In [79]:
lg_clf = LogisticRegression()

In [80]:
cross_val_score(lg_clf, X, y, cv=3, scoring='accuracy')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.88868008, 0.88897078, 0.88782843])

In [81]:
cross_val_score(lg_clf, X_, y_, cv=3, scoring='accuracy')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.67846281, 0.67970694, 0.67696986])

In [82]:
cross_val_score(lg_clf, X, y, cv=3, scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.64623063, 0.64420123, 0.63615024])

In [83]:
cross_val_score(lg_clf, X_, y_, cv=3, scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.64259837, 0.67617849, 0.6680559 ])

In [84]:
cross_val_score(lg_clf, _X, _y, cv=3, scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.64185862, 0.67563223, 0.66936348])

In [85]:
cross_val_score(lg_clf, X, y, cv=3, scoring='recall')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.01851852, 0.01322751, 0.0135582 ])

In [86]:
cross_val_score(lg_clf, X_, y_, cv=3, scoring='recall')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.17626078, 0.15617484, 0.15965829])

In [87]:
cross_val_score(lg_clf, _X, _y, cv=3, scoring='recall')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.25705043, 0.2372066 , 0.23480136])

In [88]:
cross_val_score(lg_clf, X, y, cv=3, scoring='f1_weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.8400655 , 0.83913826, 0.83860684])

In [89]:
cross_val_score(lg_clf, X_, y_, cv=3, scoring='f1_weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.61856276, 0.6129137 , 0.61215445])

In [90]:
cross_val_score(lg_clf, _X, _y, cv=3, scoring='f1_weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.6062896 , 0.60308077, 0.60043932])

In [91]:
cross_val_score(lg_clf, dramatic_X, dramatic_y, cv=3, scoring='accuracy')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.61202231, 0.61983951, 0.62026708])

In [92]:
cross_val_score(lg_clf, dramatic_X, dramatic_y, cv=3, scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.65838065, 0.67202505, 0.6691857 ])

In [93]:
cross_val_score(lg_clf, dramatic_X, dramatic_y, cv=3, scoring='recall')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.58333679, 0.59961018, 0.5976859 ])

In [94]:
cross_val_score(lg_clf, dramatic_X, dramatic_y, cv=3, scoring='f1_weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.61170276, 0.61968391, 0.62007335])

In [101]:
from sklearn.neural_network import MLPClassifier

In [102]:
mlp_clf = MLPClassifier()

In [103]:
cross_val_score(mlp_clf, X, y, cv=3, scoring='accuracy')



array([0.86034343, 0.84220806, 0.86258614])

In [104]:
cross_val_score(mlp_clf, X_, y_, cv=3, scoring='accuracy')



array([0.75236384, 0.8835499 , 0.88426873])

In [105]:
cross_val_score(mlp_clf, dramatic_X, dramatic_y, cv=3, scoring='accuracy')



array([0.78642669, 0.89451967, 0.87799113])

In [106]:
cross_val_score(mlp_clf, X_, y_, cv=3, scoring='roc_auc')



array([0.75716185, 0.95687755, 0.95287017])

In [107]:
cross_val_score(mlp_clf, dramatic_X, dramatic_y, cv=3, scoring='roc_auc')



array([0.87619674, 0.97178704, 0.96948178])

In [108]:
cross_val_score(mlp_clf, X_, y_, cv=3, scoring='recall')



array([0.4225282 , 0.92435929, 0.87376628])

In [109]:
cross_val_score(mlp_clf, dramatic_X, dramatic_y, cv=3, scoring='recall')



array([0.64649774, 0.94579912, 0.94753867])

In [110]:
cross_val_score(mlp_clf, X_, y_, cv=3, scoring='f1_weighted')



array([0.72811649, 0.90058625, 0.88134408])

In [111]:
cross_val_score(mlp_clf, dramatic_X, dramatic_y, cv=3, scoring='f1_weighted')



array([0.79870031, 0.90731377, 0.90453759])