In [None]:
#cd /Users/akshitasingh/Downloads/273A_ML/1_MLProject

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
np.random.seed(0)

from collections import defaultdict

np.random.seed(100)

In [None]:
# sklearn imports
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# other stats/math imports
import math
from scipy.stats import chi2_contingency

In [None]:
# Load the encounter table from the notebook's working directory.
# read_csv already returns a DataFrame, so no further wrapping is needed.
diabetes = pd.read_csv("diabetic_data.csv")

In [None]:
diabetes.shape

In [None]:
diabetes.columns

# Data Preprocessing
### Numerical Features 
In this dataset the feature names make numerical value features self-evident. Each column with numerical features starts with "num_..."
### Categorical Features
Essentially every feature that is not numerical can be considered categorical, but it is not as simple as that. 
1) 2 features are patient ID features: ['encounter_id', 'patient_nbr']. It does not make sense to include them (unless we are considering a personalized Machine Learning model) <br>
2) It also does not make sense to include the target feature which is also categorical: ['readmitted'] <br>
3) Certain features have numerical values that represent categories, such as: ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']. This is something we will investigate further. 


In [None]:
# Columns holding numeric values (in this dataset they are recognisable by
# their "num_" / "number_" prefixes).
feat_num = [
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

# Columns treated as categorical. The order is kept as a single flat list;
# the sub-lists below only group related columns for readability.
feat_cat = (
    # patient attributes
    ['race', 'gender', 'age', 'weight']
    # encounter ids and administrative fields
    + ['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty']
    # diagnoses and lab results
    + ['diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult']
    # single medications
    + ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
       'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin']
    # combination medications
    + ['glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']
    # treatment summary flags
    + ['change', 'diabetesMed']
)

#### Count distinct values for Categorical Features

In [None]:
# Number of distinct observed (non-null) values for each categorical feature.
cat_count = defaultdict(int)
cat_count.update((name, diabetes[name].nunique()) for name in feat_cat)
cat_count

In [None]:
# "examide" and "citoglipton" take a single value throughout the data, so they
# are removed both from the frame and from the categorical feature list.
constant_meds = ('examide', 'citoglipton')
diabetes = diabetes.drop(columns=list(constant_meds))
feat_cat = [name for name in feat_cat if name not in constant_meds]

In [None]:
# All medication columns that remain after removing "examide" and
# "citoglipton". The block is located by its first and last column labels
# rather than by fixed positions, so it stays correct if columns are added or
# dropped upstream.
_all_columns = list(diabetes.columns)
medications = _all_columns[_all_columns.index('metformin'):
                           _all_columns.index('metformin-pioglitazone') + 1]

## Categorical features - Investigating Categories
We picked some features we thought would be relevant to look into further <br>

### Discharge Disposition ID: 
From the ID mapping that UCI ML Repository shared with us, some categories here relate to death or terminally ill facilities. Any patient that falls into these categories should possibly not be considered in our predictions because there is no way they can be readmitted. If we were to consider them, we would possibly be biasing our predictions towards "NO" readmission, which would be incorrect. Nonetheless, we might want to consider some patients who had multiple re-admissions and hence we will not completely eliminate all patients that fall in the death/hospice categories

In [None]:
# Discharge ids that the notebook treats as death / hospice.
death_or_hospice_ids = [11, 13, 14, 19, 20, 21]

## Option A (not used): drop rows where discharge_disposition_id indicates death or hospice
# diabetes = diabetes.drop(diabetes[diabetes.discharge_disposition_id.isin(death_or_hospice_ids)].index)

## Option B (used): flag those patients with a 0/1 column instead of dropping them
diabetes['disposition_boolean'] = (
    diabetes['discharge_disposition_id'].isin(death_or_hospice_ids).astype(int)
)

# Guard the append so that re-running this cell does not list the feature twice.
if 'disposition_boolean' not in feat_cat:
    feat_cat.append('disposition_boolean')

# Last expression of the cell, so the counts are actually displayed.
diabetes['discharge_disposition_id'].value_counts()

### Diagnosis Features - diag_1, diag_2, diag_3: 
- Each of the 3 features contains 700+ categories of type string <br> 
- Some of these categories are essentially numbers (floats) while others are hard strings <br>
- We convert all the strings that can be converted into floats, and coerce the others into 'nan' <br>
- Any unknowns (?) and non-float diagnoses (e.g. V50) are then categorized as "Other" 

In [None]:
def diag_cat(diag_feat, df=None):
    """Collapse the diagnosis codes of one column into a few broad groups.

    The column is overwritten in place with one of the labels "Diabetes",
    "Circulatory", "Respiratory", "Digestive", "Injury", "Musculoskeletel"
    or "Other".

    Parameters
    ----------
    diag_feat : str
        Name of the diagnosis column to recode (e.g. 'diag_1').
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``diabetes`` frame,
        which is what the original one-argument call operated on.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    * Codes that cannot be parsed as numbers ('?', 'V50', ...) become "Other".
    * Grouping uses the integer part of the code, so fractional codes such as
      401.9 fall into the same group as 401.
    * "Diabetes" is exactly 250.xx. The previous rounding rule also captured
      251.x and 249.5-249.99.
    * Values that already are group labels are left unchanged, so calling the
      function twice on the same column is harmless.
    """
    if df is None:
        df = diabetes

    # Spelling of the labels is kept as-is because later cells may key on it.
    group_names = ["Diabetes", "Circulatory", "Respiratory", "Digestive",
                   "Injury", "Musculoskeletel"]

    raw = df[diag_feat]
    already_grouped = raw.isin(group_names + ["Other"])

    # Unparseable codes become NaN; every comparison with NaN is False, so
    # those rows fall through to the "Other" default below.
    whole = np.floor(pd.to_numeric(raw, errors='coerce'))

    conditions = [
        whole == 250,
        whole.between(390, 459) | (whole == 785),
        whole.between(460, 519) | (whole == 786),
        whole.between(520, 579) | (whole == 787),
        whole.between(800, 999),
        whole.between(710, 739),
    ]
    grouped = pd.Series(np.select(conditions, group_names, default="Other"),
                        index=df.index, dtype=object)

    # Keep labels assigned by a previous call; take the new grouping elsewhere.
    df[diag_feat] = grouped.mask(already_grouped, raw)

In [None]:
# Recode each of the three diagnosis columns with diag_cat.
diag_feat = ['diag_%d' % position for position in (1, 2, 3)]

for f in diag_feat:
    diag_cat(f)

In [None]:
diabetes['diag_1'].value_counts()

### Gender, Age, Admissions Type, and Admissions Source
Age is a categorical feature, which we can turn into a numerical value by taking the midpoint of each range. This converts the feature into a numeric one, but we shall not include it in the PCA: we understand that age is an important factor in understanding readmissions, so it should not be put under the dimensionality-reduction bucket. <br>

In [None]:
# Gender: remove encounters whose gender is recorded as 'Unknown/Invalid'.
# .copy() gives an independent frame, so the column assignments below write to
# it directly rather than to a slice of the previous frame.
diabetes = diabetes[diabetes['gender'] != 'Unknown/Invalid'].copy()

# Age: replace each 10-year bracket by its midpoint.
age_dict = {'[0-10)' : 5,
'[10-20)' : 15,
'[20-30)' : 25, 
'[30-40)' : 35, 
'[40-50)' : 45, 
'[50-60)' : 55,
'[60-70)' : 65, 
'[70-80)' : 75,
'[80-90)' : 85,
'[90-100)' : 95}

# .get(bracket, bracket) passes through values that are already midpoints, so
# running the cell a second time no longer raises KeyError.
diabetes['age'] = diabetes['age'].map(lambda bracket: age_dict.get(bracket, bracket))


def _label_ids(ids, labels, default):
    """Map numeric ids to text labels.

    ids     : Series of integer ids (or of labels from a previous run).
    labels  : dict {id: label} for the ids that get their own label.
    default : label for every id that is not a key of `labels`.

    Non-numeric strings are returned unchanged so the cell can be re-run.
    """
    def _one(value):
        if isinstance(value, str) and not value.strip().isdigit():
            return value
        return labels.get(int(value), default)
    return ids.map(_one)


# Admission type. Ids 5, 6 and 8 (and anything unexpected) are 'Unavailable'.
# Previously every other id was labelled 'Elective', which mislabelled
# emergency, urgent, newborn and trauma admissions.
# NOTE(review): id meanings follow the dataset's IDs_mapping file - verify.
admission_type_labels = {1: 'Emergency', 2: 'Urgent', 3: 'Elective',
                         4: 'Newborn', 7: 'Trauma Center'}
diabetes['admission_type_id'] = _label_ids(
    diabetes['admission_type_id'], admission_type_labels, 'Unavailable')

# Admission source. Referrals are ids 1-3; the previous [5, 6, 8] list was
# copied from the admission-type rule and labelled transfers / court
# admissions as referrals.
# NOTE(review): id meanings follow the dataset's IDs_mapping file - verify.
admission_source_labels = {1: 'Referral', 2: 'Referral', 3: 'Referral',
                           7: 'Emergency Room'}
diabetes['admission_source_id'] = _label_ids(
    diabetes['admission_source_id'], admission_source_labels, 'Other')


## Investigating Multiple Readmissions
Some patients show up more than once in our dataset. They are a very small subset of the larger dataset, so we start by considering only one visit per patient - which, in our case, is the very last occurrence for that patient. 

In [None]:
## Count the number of multiple readmissions for a single patient
# data = diabetes[diabetes['readmitted'] != 'NO']
# unique_patients = data[['patient_nbr']]
# unique_patients = unique_patients['patient_nbr'].value_counts().to_frame()
# unique_patients["index"] = unique_patients.index
# len(unique_patients[unique_patients["patient_nbr"] > 1])

In [None]:
# Keep one encounter per patient: the last row seen for each patient_nbr.
is_earlier_visit = diabetes.duplicated(subset='patient_nbr', keep='last')
diabetes = diabetes[~is_earlier_visit]
diabetes.shape

## Investigating Missing Values (?) 
Features with missing values: <br>
**Weight** - replaced it with the mode <br>
Another way to impute the missing weights would have been to find the closest neighbors. For us, a "neighbor" would be another patient with similar comorbidities. These comorbidities could be represented in multiple ways, such as (1) diagnosis (diag_1, diag_2, and diag_3) and (2) number of medications. <br>
**Race** - replaced it with "UNK" <br>
**Medical Speciality** - replaced it with "UNK" <br>
**Payer Code** - replaced it with "UNK" <br>

(**diag_1, diag_2, diag_3** also had missing values but those have already been handled above)

In [None]:
# Print every text column that still contains the '?' missing-value marker,
# together with how many rows carry it.
text_columns = [name for name in diabetes.columns if diabetes[name].dtype == object]
for col in text_columns:
    count = (diabetes[col] == '?').sum()
    if count > 0:
        print(col, count)

In [None]:
## Weight: because most weights are missing, replace '?' with the most common weight bracket
diabetes['weight'] = diabetes['weight'].replace('?', '[75-100)')

## Race, medical specialty and payer code: replace '?' with "UNK".
## (payer_code was previously tested against the *race* column, which had
##  already been cleaned, so its '?' values were never replaced.)
for col in ('race', 'medical_specialty', 'payer_code'):
    diabetes[col] = diabetes[col].replace('?', 'UNK')

In [None]:
# payer_code: dropped because it doesn't seem to explain very much.
# medical_specialty: dropped because it has too many missing values.
# The column is spelled 'medical_specialty'; the earlier spelling
# 'medical speciality' matched nothing, so the feature was never removed.
dropped_features = ('payer_code', 'medical_specialty')
feat_cat = [f for f in feat_cat if f not in dropped_features]

### Data after Preprocessing

In [None]:
# Feature matrix: numeric columns first, then the categorical ones.
model_columns = feat_num + feat_cat
X = diabetes.loc[:, model_columns]

# Target: the raw 'readmitted' column (binarised later in the notebook).
y = diabetes.loc[:, 'readmitted']

# Train Test Split

In [None]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so
# the split is the same on every run.
Xtr, Xte, Ytr, Yte = train_test_split(X, y, test_size=0.3, random_state=1)

# Feature Selection

## Feature Scaling - should we scale before train_test_split???
As a first step, we will only normalize the numerical features. Later on, we will consider normalizing all features (for instance, if we use a multivariate feature selection model such as Lasso)

In [None]:
# Coerce the numeric features to numbers (unparseable entries become NaN).
# The source frame is still converted, as before, but the conversion is now
# also applied to the split frames: Xtr / Xte were cut from `diabetes` in an
# earlier cell, so converting `diabetes` alone never reached them.
for f in feat_num:
    diabetes[f] = pd.to_numeric(diabetes[f],errors= 'coerce')

Xtr_num = Xtr[feat_num].apply(pd.to_numeric, errors='coerce')
Xte_num = Xte[feat_num].apply(pd.to_numeric, errors='coerce')

scaler = StandardScaler()

# Learn mean / variance on the training data only and scale it
# (fit_transform already fits, so no separate fit call is needed).
Xtr_numSc = scaler.fit_transform(Xtr_num)

# Scale the test data with the statistics learned from the training data.
Xte_numSc = scaler.transform(Xte_num)

In [None]:
# Column labels for the rebuilt frames: a flat list of names.
# (Wrapping the list in another list, as before, makes pandas build a
#  one-level MultiIndex instead of plain string labels.)
scaled_columns = feat_num + feat_cat

# New training frame: scaled numeric block followed by the untouched
# categorical block. Mixing floats and strings in one array yields dtype object.
Xtr_cat = np.array(Xtr[feat_cat])
Xtr = pd.DataFrame(np.concatenate([Xtr_numSc, Xtr_cat], axis = 1),
                   columns = scaled_columns)

# New test frame, built the same way.
Xte_cat = np.array(Xte[feat_cat])
Xte = pd.DataFrame(np.concatenate([Xte_numSc, Xte_cat], axis = 1),
                   columns = scaled_columns)

## Feature Selection - Numerical Features  (Principal Component Analysis)

In [None]:
# Fit a full PCA on the scaled numeric training features and plot the
# cumulative explained-variance ratio against the number of components.
pca = PCA().fit(Xtr_numSc)

plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
Var = np.cumsum(pca.explained_variance_ratio_)
# One x position per fitted component (1-based), taken from the PCA itself
# instead of a hard-coded count.
xi = np.arange(1, len(Var) + 1)

ax.set_ylim(0.0, 1.1)
ax.plot(xi, Var, marker='o', linestyle='--', color='b')

ax.set_xlabel('Number of Components')
ax.set_xticks(np.arange(0, len(Var) + 1))
# The plotted values are ratios in [0, 1], not percentages.
ax.set_ylabel('Cumulative explained variance (fraction)')
ax.set_title('The number of components needed to explain variance')

ax.axhline(y=0.90, color='r', linestyle='-')
ax.text(0.5, 0.85, '90% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()

In [None]:
# Number of components kept (chosen to explain 90% of the variance).
n_kept = 5
pc_names = ['PC%d' % k for k in range(1, n_kept + 1)]

pca = PCA(n_components=n_kept)

# Fit the projection on the training data ...
XtrPCA_num = pd.DataFrame(pca.fit_transform(Xtr_numSc), columns=pc_names)

# ... and apply the same projection to the test data.
XtePCA_num = pd.DataFrame(pca.transform(Xte_numSc), columns=pc_names)

## Feature Selection - Categorical Features 

### Step 1: Investigate the value count for each medication
If we realize that hardly anyone was prescribed that medication, it is perhaps a good idea to not consider it in our analysis <br>

We run the risk of excluding patients that were specifically chosen for rare medications, which are hardly ever prescribed (and hence eliminated from our feature set). 

In [None]:
# A medication is disregarded when more than `threshold` encounters have the
# value "No" for it.
threshold = 70000

med_count = defaultdict(list)

for med in medications:
    # counts of each value (No / Up / Down / Steady) for this medication
    counts = diabetes[med].value_counts()

    # Look the "No" count up by label instead of assuming it is the first
    # (most frequent) entry of the sorted counts.
    if counts.get('No', 0) > threshold:
        continue
    med_count[med].append(list(counts))

meds_new = list(med_count)
meds_new

### Step 2: Chi Square

Lets first try to find any relations between the medication features <br>

#### Approach 1: Cross Tabulation
$D$ = Number of Medication features <br>
Null Hypothesis ($H_O$): Features are independent - there is no relationship between features $x^i$ and $x^j$ where $i, j$ $\in$ $(1,...,D)$ <br>
Alternate Hypothesis ($H_1$): Features are dependent - there is a relationship between features <br>
Let's consider a significance level of .05 for $H_O$ $\Rightarrow$ if the p-value for a relation is < .05, then we reject $H_O$ <br>

As we can see, none of our p-values are greater than the significance level, so we reject the null hypothesis for every pair, i.e. the medication features are not statistically independent of each other. We nonetheless continue to keep all of the medication features in the model. 

In [None]:
# Pairwise chi-square test of independence between the retained medication
# features. chi_p[i, j] is the p-value for meds_new[i] vs meds_new[j].
# The matrix is sized from meds_new, so it no longer assumes exactly 7
# medications survive the filter.
n_meds = len(meds_new)
chi_p = np.zeros((n_meds, n_meds))
for i, med1 in enumerate(meds_new):
    for j, med2 in enumerate(meds_new):
        contingency = pd.crosstab(diabetes[med1], diabetes[med2])
        chi_p[i, j] = chi2_contingency(contingency)[1]  # index 1 = p-value
# chi_p

#### Approach 2: Ordinal / One Hot Encoding
come back to it

In [None]:
# # encode some catagorical features in the input data
# def prepare_inputs(Xtr, Xte):
#     ordEnc = OrdinalEncoder()
#     ordEnc.fit(Xte)
#     XtrEnc = ordEnc.transform(Xtr)
#     XteEnc = ordEnc.transform(Xte)
#     return XtrEnc, XteEnc
 
# # encode the target feature (categorical)
# def prepare_targets(Ytr, Yte):
#     labEnc = LabelEncoder()
#     labEnc.fit(Ytr)
#     YtrEnc = labEnc.transform(Ytr)
#     YteEnc = le.transform(Yte)
#     return YtrEnc, YteEnc
 
# # feature selection
# ## concern - this can only work if we only have categorical features 
# def select_features(Xtr, Xte, Ytr):
#     featSel = SelectKBest(score_func=chi2, k='all')
#     featSel.fit(Xtr, Ytr)
#     XtrSel = featSel.transform(Xtr)
#     XteSel = featSel.transform(Xte)
#     return XtrSel, XteSel, featSel

### Final set of categorical features

In [None]:
# disposition id replaced by the booleans
feat_catN = ['gender', 'age', 'weight', 'admission_type_id', 'admission_source_id', 'disposition_boolean',
           'time_in_hospital'] + diag_feat + meds_new + ['change', 'diabetesMed']

In [None]:
# Restrict the train / test frames to the final categorical feature set.
# NOTE(review): this relies on Xtr / Xte having plain string column labels.
Xtr_cat = Xtr[feat_catN]
Xte_cat = Xte[feat_catN]

## Final Train and Test Data with Selected Catagorical Features and PCAs 

In [None]:
# Column order of the final design matrices: principal components first,
# then the selected categorical features.
features = list(XtrPCA_num.columns) + feat_catN

# final training data
Xtr = pd.DataFrame(
    np.hstack([np.array(XtrPCA_num), np.array(Xtr_cat)]),
    columns=features,
)

# final test data
Xte = pd.DataFrame(
    np.hstack([np.array(XtePCA_num), np.array(Xte_cat)]),
    columns=features,
)

## One Hot Encode the Final Set of Categorical Features (If needed)

In [None]:
# Features to One Hot Encode
feat_OHE = ['gender', 'weight', 'admission_type_id', 'admission_source_id', 'disposition_boolean'] + \
            diag_feat + meds_new + ['change', 'diabetesMed']
Xtr_OHE = Xtr[feat_OHE]


# fit OHE on to the training data
OHE = OneHotEncoder(categories='auto')
Xtr_OHE = OHE.fit_transform(Xtr_OHE).toarray()
Xtr_OHE = pd.DataFrame(Xtr_OHE)

# transform OHE fit into test data
Xte_OHE = Xte[feat_OHE]
Xte_OHE = OHE.transform(Xte_OHE).toarray()
Xte_OHE = pd.DataFrame(Xte_OHE)

In [None]:
# extract column names for OHE categories
OHE_cols = []
OHE_cols_ = OHE.categories_
for col, vals in zip(Xtr_OHE.columns, OHE_cols_):
    for val in vals:
        name = str(col) + '_' + str(val)
        OHE_cols.append(name)

Xtr_OHE.columns = OHE_cols
Xte_OHE.columns = OHE_cols

In [None]:
# Final design matrices: principal components side by side with the one-hot
# columns. Both inputs were built from plain arrays, so they share the same
# default 0..n-1 row index and pd.concat lines the rows up positionally.
Xtr = pd.concat([XtrPCA_num, Xtr_OHE], axis = 1)
Xte = pd.concat([XtePCA_num, Xte_OHE], axis = 1)

In [None]:
Xtr.shape

In [None]:
Xte.shape

# Train Models
We try to predict 3 things: <br>
1) Readmissions <br>
2) Readmissions <30 and >30 days <br>
3) Multiple Readmissions <br>

# Problem 1: Predict Readmissions

In [None]:
# Binary target: 1 for any value other than "NO", 0 for "NO". The index is
# reset to 0..n-1.
# NOTE: run once only - a second run would compare the 0/1 values with "NO"
# and turn every label into 1.
Ytr = Ytr.ne("NO").astype(int).reset_index(drop=True)
Yte = Yte.ne("NO").astype(int).reset_index(drop=True)

In [None]:
# oversampling
# smote = SMOTE(random_state=0)
# XtrSm, YtrSm = smote.fit_resample(Xtr, Ytr)

In [None]:
# Under-sample with ClusterCentroids.
# Resample the prepared training split only: the raw X / y used before still
# contain un-encoded string columns and the rows held out for testing.
cc = ClusterCentroids(random_state=0)
XtrSm2, YtrSm2 = cc.fit_resample(Xtr, Ytr)

## Model 1: Logistic Regression

We run 6 different logistic regression models: <br>
1) logReg_Bal_L1 <br>
>   a) Balanced dataset <br>
    b) L1 Regularization (Lasso) <br>

2) logReg_Bal_L2 <br>
>   a) Balanced dataset <br>
    b) L2 Regularization (Ridge) <br>

3) logReg_Unbal_L1 <br>
>   a) Unbalanced dataset <br>
    b) L1 Regularization (Lasso) <br>

4) logReg_Unbal_L2 <br>
>   a) Unbalanced dataset <br>
    b) L2 Regularization (Ridge) <br>

5) logReg_Smote_L1 <br>
>   a) Balanced dataset <br>
    b) L1 Regularization (Lasso) <br>

6) logReg_Smote_L2 <br>
>   a) Balanced dataset <br>
    b) L2 Regularization (Ridge) <br>

In [None]:
# Six logistic-regression configurations (unfitted). All use the liblinear
# solver, which supports both L1 and L2 penalties.

# class-weighted ("balanced"), L1 regularization (Lasso)
logReg_Bal_L1 = LogisticRegression(fit_intercept = True, class_weight= 'balanced', 
                                   penalty = 'l1', solver='liblinear')

# class-weighted ("balanced"), L2 regularization (Ridge)
# (was configured with penalty='l1', making it a duplicate of logReg_Bal_L1)
logReg_Bal_L2 = LogisticRegression(fit_intercept = True, class_weight= 'balanced', 
                                   penalty = 'l2', solver='liblinear')

# no class weighting, L1 regularization (Lasso)
logReg_Unbal_L1 = LogisticRegression(fit_intercept = True, 
                                   penalty = 'l1', solver='liblinear')

# no class weighting, L2 regularization (Ridge)
logReg_Unbal_L2 = LogisticRegression(fit_intercept = True, 
                                   penalty = 'l2', solver='liblinear')

# The two "Smote" models carry no class weighting of their own.
# NOTE(review): presumably they are meant to be fitted on SMOTE-resampled
# training data - confirm where they are fitted.

# SMOTE variant, L1 regularization (Lasso)
logReg_Smote_L1 = LogisticRegression(fit_intercept = True, 
                                   penalty = 'l1', solver='liblinear')

# SMOTE variant, L2 regularization (Ridge)
# (was configured with penalty='l1', making it a duplicate of logReg_Smote_L1)
logReg_Smote_L2 = LogisticRegression(fit_intercept = True, 
                                   penalty = 'l2', solver='liblinear')
                                                                                                                                                                                                                

### Cross Validation

In [None]:
# Mean score of logReg_Smote_L2 over 4 cross-validation folds of the training
# split. No `scoring` is passed, so the estimator's default metric is used -
# TODO confirm this is the intended metric for an imbalanced target.
cross_val_score(logReg_Smote_L2, Xtr, Ytr, cv = 4).mean()

In [None]:
# logReg_models = [logReg_Bal_L1, logReg_Bal_L2, logReg_Unbal_L1, logReg_Unbal_L2,
#                 logReg_Smote_L1, logReg_Smote_L2]

# def CV(model, K):
#     cv_list = []
#     for k in range(K):
#         cv_list.append(cross_val_score(logReg_Smote_L2, Xtr, Ytr, cv = 4).mean())
    
#     return cv_list


# # for model in logRed_models:
# #     log
    
# log1_cv = CV(logReg_Smote_L1, 5)

In [None]:
# Fit the class-weighted L1 model on the training split; the name is rebound
# to whatever fit() returns.
logReg_Bal_L1 = logReg_Bal_L1.fit(Xtr, Ytr)

In [None]:
# Predicted class labels for the test split.
Yte_hat = logReg_Bal_L1.predict(Xte)

In [None]:
# ROC needs a continuous score, not hard 0/1 predictions: with class labels
# the curve collapses to a single operating point. Use the predicted
# probability of the positive class (column 1 = class 1).
# The thresholds get their own name so the medication-count `threshold`
# defined earlier in the notebook is not overwritten.
Yte_score = logReg_Bal_L1.predict_proba(Xte)[:, 1]
fpr, tpr, roc_thresholds = metrics.roc_curve(Yte, Yte_score)

In [None]:
# Area under the ROC curve described by the fpr / tpr arrays computed above.
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# ROC curve (blue) against the chance diagonal (red, dashed).
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1],'r--')
ax.set_title('Receiver Operating Characteristic')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.legend(loc = 'lower right')
plt.show()

In [None]:
# ROC of logReg_Smote_L2 on the *training* split (fit, then plot).
logReg_Smote_L2.fit(Xtr, Ytr)

# metrics.plot_roc_curve was removed from newer scikit-learn releases in
# favour of RocCurveDisplay.from_estimator; use whichever this installation
# provides.
if hasattr(getattr(metrics, 'RocCurveDisplay', None), 'from_estimator'):
    roc_tr = metrics.RocCurveDisplay.from_estimator(logReg_Smote_L2, Xtr, Ytr)
else:
    roc_tr = metrics.plot_roc_curve(logReg_Smote_L2, Xtr, Ytr)
# For the test split, pass Xte, Yte to the same call instead of Xtr, Ytr.

In [None]:
roc_tr

## Model 2: SVMs

In [None]:
# Unfitted candidate models: three SVM kernels sharing one seed, plus a
# default logistic regression and a seeded decision tree as baselines.
svmLinear, svmRBF, svmPoly = (
    svm.SVC(kernel=kernel_name, random_state=100)
    for kernel_name in ('linear', 'rbf', 'poly')
)
logistic = LogisticRegression()
dTree = DecisionTreeClassifier(random_state=0)

In [None]:
# cross_val_score(svmLinear, Xtr, Ytr, cv=5)
# cross_val_score(svmRBF, Xtr, Ytr, cv=5)
# cross_val_score(svmPoly, Xtr, v, cv=5)
# cross_val_score(dTree, Xtr, Ytr, cv=5)

## Model 3: Ensembles

In [None]:
# Random forest with 100 trees, fitted on the training split.
# random_state is set explicitly, like the other estimators in this notebook,
# so re-running just this cell reproduces the same forest.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=100)
rf_clf.fit(Xtr, Ytr)

In [None]:
# Class-membership probabilities for the test split, one column per class
# (the ROC cell below selects a column by class position).
probas = rf_clf.predict_proba(Xte)


In [None]:
# get false and true positive rates
fpr, tpr, thresholds = roc_curve(Yte, probas[:,0], pos_label=0)

# get area under the curve
roc_auc = auc(fpr, tpr)

# PLOT ROC curve
plt.figure(dpi=150)
plt.plot(fpr, tpr, lw=1, color='green', label=f'AUC = {roc_auc:.3f}')
plt.title('ROC Curve for RF classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.legend()
plt.show()

## Model 4: Neural Networks

# Comparing Models