### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

%matplotlib inline
pd.set_option('max_rows',20000)
pd.set_option('max_columns',11)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score 
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, recall_score, precision_score
from sklearn.metrics import f1_score
from mlxtend.plotting import plot_confusion_matrix

ModuleNotFoundError: No module named 'mlxtend'

### Load data and explore

In [None]:
df = pd.read_csv("diabetes.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df['PatientID'].value_counts()

In [None]:
df['PatientID'].nunique()

In [None]:
#plt.figure(figsize=(10,10))
#sns.countplot(x=df.PatientID)
#plt.title("PatientID")
#plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.Pregnancies)
plt.title("Pregnancies")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.PlasmaGlucose)
plt.title("PlasmaGlucose")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.DiastolicBloodPressure)
plt.title("DiastolicBloodPressure")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.TricepsThickness)
plt.title("TricepsThickness")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.SerumInsulin)
plt.title("SerumInsulin")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.BMI)
plt.title("BMI")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.DiabetesPedigree)
plt.title("DiabetesPedigree")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
plt.hist(df.Age)
plt.title("Age")
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(df.Diabetic)
plt.title("Diabetic")
plt.show()

### Load doctors dataset

In [None]:
df2 = pd.read_csv("doctors.csv",encoding = "ISO-8859-1")

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=df2.Physician,y=df2.PatientID)
plt.title("PatientID per Physician")
plt.xticks(rotation=90)
plt.show()

In [None]:
df3 = pd.merge(left=df,right=df2,how='left',on='PatientID')

In [None]:
df3.head()

In [None]:
df3.shape

In [None]:
mask = df3['PatientID'] == 1321191

In [None]:
df3[mask] # Check to ensure physicians correct

In [None]:
df3.head()

In [None]:
df3.shape

In [None]:
#Save a copy of merged file
#df3.to_csv("final.csv",index=False)

### Load the final.csv file

In [None]:
df3 = pd.read_csv("final.csv")

### Create pairplots to look at relationships

In [None]:
df4 = df3.drop(['PatientID','Diabetic','Physician'],axis=1)

In [None]:
df4.head()

In [None]:
df4.describe()

In [None]:
sns.pairplot(df4.sample(1000),height=4)
plt.title("Distribution of Numerical Features")
plt.show()

### Create boxplots

In [None]:
plt.figure(figsize=(12,12))

plt.subplot(4,2,1)
plt.boxplot(df4.Pregnancies)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Pregnancies')

plt.subplot(4,2,2)
plt.boxplot(df4.PlasmaGlucose)
plt.xlabel('x')
plt.ylabel('y')
plt.title('PlasmaGlucose')

plt.subplot(4,2,3)
plt.boxplot(df4.DiastolicBloodPressure)
plt.xlabel('x')
plt.ylabel('y')
plt.title('DiastolicBloodPressure')

plt.subplot(4,2,4)
plt.boxplot(df4.TricepsThickness)
plt.xlabel('x')
plt.ylabel('y')
plt.title('TricepsThickness')

plt.subplot(4,2,5)
plt.boxplot(df4.SerumInsulin)
plt.xlabel('x')
plt.ylabel('y')
plt.title('SerumInsulin')

plt.subplot(4,2,6)
plt.boxplot(df4.BMI)
plt.xlabel('x')
plt.ylabel('y')
plt.title('BMI')

plt.subplot(4,2,7)
plt.boxplot(df4.DiabetesPedigree)
plt.xlabel('x')
plt.ylabel('y')
plt.title('DiabetesPedigree')

plt.subplot(4,2,8)
plt.boxplot(df4.Age)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Age')


plt.show()

### Preprocessing Data

In [None]:
df3.head()

In [None]:
df3['AgeLog'] = np.log(df3['Age'])

In [None]:
df3.head()

In [None]:
plt.hist(df3['AgeLog'])
plt.title("Age in Log form")
plt.show()

In [None]:
df3 = df3.drop(['Age'],axis=1)

In [None]:
df3.head()

In [None]:
df3.shape

In [None]:
df3.info()

In [None]:
df5 = df3[['PatientID','Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
          'SerumInsulin','BMI','DiabetesPedigree','AgeLog','Physician','Diabetic']]

In [None]:
df5.head()

In [None]:
df5.tail()

In [None]:
df5.shape

In [None]:
#Save a new csv for arranged dataset
#df5.to_csv("arranged.csv",index=False)

### Data Scaling

In [None]:
df5 = pd.read_csv("arranged.csv")

In [None]:
df5.head()

In [None]:
df5.info()

In [None]:
df5['PatientID'] = df5['PatientID'].astype('category')

In [None]:
df5['PatientID'].dtype

In [None]:
df5.info()

In [None]:
df5.head()

In [None]:
X = df5.iloc[:,0:10]
y = df5.iloc[:,10]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X_scalar = X[['PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI']]

In [None]:
X_scalar.shape

In [None]:
X_scalar.values

In [None]:
X_minmax = X[['Pregnancies','DiabetesPedigree','AgeLog']]

In [None]:
X_minmax.shape

In [None]:
X_minmax.values

### Normalize Data using Column Transformer

Each transformer is a three-element tuple that defines the name of the transformer, 
the transform to apply, and the column indices to apply it to. For example:
(Name, Object, Columns)


In [None]:
# define the data preparation for the columns

t1 = [('scale',StandardScaler(),X_scalar)]
t2 = [('ohe',MinMaxScaler(),X_minmax)]

In [None]:
col_transform = ColumnTransformer(transformers=t1, remainder='passthrough')

In [None]:
col_transform.fit_transform(X)

### Use make_column_transformer

In [None]:
make_column_transformer((StandardScaler(),[X_scalar]),remainder="passthrough")

In [None]:
ct1 = ColumnTransformer(transformers=('standardscaler', StandardScaler(),['X_scalar']),remainder="passthrough")

In [None]:
X_new = ct1.fit(X)

### Use StandardScaler method

In [None]:
scaler = StandardScaler()

In [None]:
X_scale_new = scaler.fit_transform(X_scalar)

In [None]:
X_scale_new

In [None]:
X_scale_new = pd.DataFrame(X_scale_new, columns=X_scalar.columns)

In [None]:
X_scale_new.head()

### Use MinMax method

In [None]:
minmax = MinMaxScaler()

In [None]:
X_mm_new = minmax.fit_transform(X_minmax)

In [None]:
X_mm_new

In [None]:
X_mm_new = pd.DataFrame(X_mm_new, columns=X_minmax.columns)

In [None]:
X_mm_new.head()

### Merge the transformed dataset to existing one

In [None]:
df5.head()

In [None]:
df6 = df5.drop(['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
               'SerumInsulin','BMI','DiabetesPedigree','AgeLog'],axis=1)

In [None]:
df6.head()

In [None]:
df6.shape

In [None]:
df7 = pd.concat([df6,X_scale_new,X_mm_new],axis=1)

In [None]:
df7.head()

In [None]:
df7.shape

In [None]:
df7.describe() #Check if all transformed correctly

The PatientID and Physician columns are unlikely to contribute any predictive information for diagnosing diabetes

In [None]:
df8 = df7.drop(['PatientID','Physician'],axis=1)

In [None]:
df8.head()

In [None]:
df8.shape

In [None]:
df8.info()

In [None]:
#Rearrange columns
df8 = df8[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
          'SerumInsulin','BMI','DiabetesPedigree','AgeLog','Diabetic']]

In [None]:
df8.head()

In [None]:
df8.tail()

In [None]:
df8.shape

In [None]:
#Save to a new csv file
#df8.to_csv("cleaned.csv",index=False)

### Train and Evaluate the Classification Model

In [None]:
df8 = pd.read_csv("cleaned.csv")

In [None]:
df8.shape

In [None]:
df8.info()

In [None]:
df8.head()

In [None]:
X = df8.iloc[:,0:8]
y = df8.iloc[:,8]

In [None]:
X.shape

In [None]:
y.shape

In [None]:
X.values, y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234,stratify=None)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
model = LogisticRegression(penalty='l2',tol=1e-7,random_state=1234,solver='liblinear')

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred

### Logistic Regression Model Evaluation

In [None]:
roc_auc_score(y_test,y_pred)  #ROC Score

In [None]:
cm = confusion_matrix(y_test,y_pred)

In [None]:
cm

In [None]:
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title("Confusion Matrix")
plt.show()

In [None]:
plot_roc_curve(model, X_test, y_test) 
plt.title("ROC Curve")
plt.show()

In [None]:
accuracy_score(y_test,y_pred) #Accuracy score

In [None]:
recall_score(y_test,y_pred) #Recall score

In [None]:
precision_score(y_test,y_pred) #Precision score

In [None]:
f1_score(y_test,y_pred) #F1-score

### Gradient Tree Boosting

In [None]:
model2 = GradientBoostingClassifier(max_leaf_nodes=20,min_samples_leaf=10,learning_rate=0.2,
                                   n_estimators=100, random_state=1234)

In [None]:
model2.fit(X_train,y_train)

In [None]:
y_pred = model2.predict(X_test)

In [None]:
y_pred

### Gradient Boosting Model Evaluation

In [None]:
roc_auc_score(y_test,y_pred)  #ROC Score

In [None]:
cm = confusion_matrix(y_test,y_pred)

In [None]:
cm

In [None]:
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title("Confusion Matrix")
plt.show()

In [None]:
plot_roc_curve(model2, X_test, y_test) 
plt.title("ROC Curve")
plt.show()

In [None]:
accuracy_score(y_test,y_pred) #Accuracy score

In [None]:
recall_score(y_test,y_pred) #Recall score

In [None]:
precision_score(y_test,y_pred) #Precision score

In [None]:
f1_score(y_test,y_pred) #F1-score

In [None]:
print(classification_report(y_test,y_pred))

**Conclusion: Gradient Boosting Classifier gives better result**

### Import cleaned test data for prediction

In [None]:
test = pd.read_csv("testcleaned.csv")

In [None]:
test

In [None]:
y_pred2 = model2.predict(test)

In [None]:
y_pred2

In [None]:
model2.predict_proba(test) #Predict class probabilities for test set

### Prediction is all 0, means no diabetes for these three patients