In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import  KNeighborsClassifier
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
from sklearn.svm import SVC

# Data Analysis 

##### 1. Number of times pregnant
##### 2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
##### 3. Diastolic blood pressure (mm Hg)
##### 4. Triceps skin fold thickness (mm)
##### 5. 2-Hour serum insulin (mu U/ml)
##### 6. Body mass index (weight in kg/(height in m)^2)
##### 7. Diabetes pedigree function
##### 8. Age (years)
##### 9. Class variable (0 or 1)

In [None]:
# Read the CSV into a DataFrame; the relative path is resolved against the kernel's working directory.
dataset=pd.read_csv("diabetes.csv")

In [None]:
# A bare name as the last expression of a cell makes the notebook render the DataFrame.
dataset

In [None]:
# Preview the first rows of the frame (pandas shows five by default).
dataset.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()

In [None]:
# Print column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Number of rows in each Outcome class.
dataset["Outcome"].value_counts()

# Maximum value of Insulin among patients with a high probability of diabetes

In [None]:
# Largest Insulin reading among rows whose Outcome is 1.
dataset.loc[dataset['Outcome'] == 1, 'Insulin'].max()

# Maximum value of Glucose among patients with a high probability of diabetes

In [None]:
# Largest Glucose reading among rows whose Outcome is 1.
dataset.loc[dataset['Outcome'] == 1, 'Glucose'].max()

In [None]:
# Count of distinct values in the Pregnancies column.
dataset['Pregnancies'].nunique()

# Number of pregnancies among patients with a high probability of diabetes

In [None]:
# Frequency of each Pregnancies value among rows whose Outcome is 1.
dataset.loc[dataset['Outcome'].eq(1), 'Pregnancies'].value_counts()

In [None]:
# Frequency of each Pregnancies value among rows whose Outcome is 0.
dataset.loc[dataset['Outcome'].eq(0), 'Pregnancies'].value_counts()

In [None]:
def age_count(x, healthy_ages=None):
    """Return ``x`` if that age never occurs among Outcome-0 patients, else ``False``.

    Parameters
    ----------
    x : scalar
        Age to look up.
    healthy_ages : iterable of ages, optional
        Ages of the Outcome-0 rows. When omitted, they are taken from the
        notebook-level ``dataset`` frame.

    Returns
    -------
    ``False`` when ``x`` is one of ``healthy_ages``; otherwise ``x`` unchanged.
    """
    if healthy_ages is None:
        healthy_ages = dataset.loc[dataset['Outcome'] == 0, 'Age']
    # `x in series` tests the index labels of a pandas Series, not its data,
    # so membership has to be checked against the values explicitly.
    if x in set(healthy_ages):
        return False
    return x

# Get the ages that appear only among patients with a high probability of diabetes

In [None]:
# Map every Outcome-1 age through age_count, then tally the resulting values.
dataset.loc[dataset['Outcome'] == 1, 'Age'].apply(age_count).value_counts()

#  Data Visualization

In [None]:
# Bar chart of the number of rows per Outcome class. The column is named via
# keywords (same form as the Pregnancies count plot further down) because
# passing the Series positionally is deprecated in seaborn.
sns.countplot(x="Outcome", data=dataset)

In [None]:
# Sum of Outcome per Pregnancies value, drawn as bars from largest to smallest.
(
    dataset.groupby("Pregnancies")["Outcome"]
    .sum()
    .sort_values(ascending=False)
    .plot.bar()
)
plt.show()

In [None]:
# Distribution of the raw Glucose column.
plt.figure(figsize=(7, 4))
sns.distplot(dataset["Glucose"])
# The caption belongs in plt.title(); plt.show() does not take a title.
plt.title("Glucose distribution plot")
plt.show()

In [None]:
# Distribution of Glucose after mean imputation, computed on a copy so that
# `dataset` itself is left untouched.
plt.figure(figsize=(7, 4))
# NOTE(review): assumes a Glucose value of 0 encodes a missing measurement,
# since the frame has no other missing-value marker in view - confirm.
glucose_observed = dataset["Glucose"].replace(0, np.nan)
glucose_imputed = glucose_observed.fillna(glucose_observed.mean())
sns.distplot(glucose_imputed)
# The caption belongs in plt.title(); plt.show() does not take a title.
plt.title("Glucose distribution plot after Imputing with mean")
plt.show()

In [None]:
# Distribution of the BloodPressure column on a 7x4 inch figure.
plt.figure(figsize=(7, 4))
sns.distplot(dataset.loc[:, "BloodPressure"])
plt.gca().set_title("BloodPressure Distribution Plot")
plt.show()

In [None]:
# Columns treated as continuous for the correlation heatmap.
continuous_data_cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Annotated pairwise correlations, with the colour scale centred on zero.
plt.figure(figsize=(11, 7))
sns.heatmap(dataset.loc[:, continuous_data_cols].corr(), annot=True, center=0)
plt.title("Correlation Plot")
plt.show()

In [None]:
# Missing-value count per column: printed first, then drawn as a bar chart.
print(dataset.isna().sum())
dataset.isna().sum().plot.bar()
plt.title("NaN values Plot")
plt.show()

In [None]:
# Pairwise scatter/distribution grid of all columns, coloured by Outcome.
sns.pairplot(dataset,hue='Outcome')

In [None]:
# Heatmap of the correlation matrix over every column of the frame.
sns.heatmap(dataset.corr(),cmap='coolwarm')

In [None]:
# Hex-binned joint distribution of Glucose (x) against Insulin (y).
sns.jointplot(data=dataset ,x='Glucose', y='Insulin', kind="hex")

In [None]:
def target_percent():
    """Draw an interactive pie chart of how the rows split between Outcome classes.

    Reads the notebook-level ``dataset`` frame and renders the figure with
    plotly's offline ``iplot``; nothing is returned.
    """
    # value_counts() is ordered by frequency, not by class, so look the counts
    # up by class label to keep them aligned with the labels below
    # (Outcome 0 -> 'healthy', Outcome 1 -> 'diabetic').
    outcome_counts = dataset['Outcome'].value_counts().reindex([0, 1], fill_value=0)

    trace = go.Pie(labels = ['healthy','diabetic'], values = outcome_counts,
                   textfont=dict(size=15), opacity = 0.8,
                   marker=dict(colors=['lightskyblue', 'gold'],
                               line=dict(color='#000000', width=1.5)))

    layout = dict(title =  'Distribution of Outcome variable')

    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)

In [None]:
# Render the Outcome pie chart.
target_percent()

In [None]:
# Switch matplotlib to the ggplot style sheet for the plots that follow.
plt.style.use('ggplot')

f, ax = plt.subplots(figsize=(11, 15))

# Fix the background, x-range and labels first, then draw one horizontal box
# per column of the frame onto the same axes.
ax.set_facecolor('#fafafa')
ax.set_xlim((-.05, 200))
ax.set_ylabel('Variables')
ax.set_title("Overview Data Set")
ax = sns.boxplot(data=dataset, orient='h', palette='Set2', ax=ax)

In [None]:
def correlation_plot():
    """Render an interactive plotly heatmap of the correlations between all columns.

    Reads the notebook-level ``dataset`` frame and displays the figure with
    plotly's offline ``iplot``; nothing is returned.
    """
    corr = dataset.corr()
    # The same column labels serve as tick labels on both axes.
    axis_labels = list(corr.columns)

    heatmap = go.Heatmap(
        z=corr.values,
        x=axis_labels,
        y=axis_labels,
        colorscale='Viridis',
        colorbar={},
    )

    # NOTE(review): the top margin is 0 although a title is set - confirm the
    # title is still visible when rendered.
    layout = go.Layout(
        title='Correlation Matrix for variables',
        margin=dict(r=0, l=100, t=0, b=100),
        yaxis=dict(tickfont=dict(size=9)),
        xaxis=dict(tickfont=dict(size=9)),
    )

    py.iplot(go.Figure(data=[heatmap], layout=layout))

In [None]:
# Render the interactive correlation heatmap.
correlation_plot()

In [None]:
def plot_feat1_feat2(feat1, feat2):
    """Scatter ``feat1`` (x) against ``feat2`` (y), coloured by Outcome.

    Rows with Outcome 0 are drawn as 'healthy', all other rows as 'diabetic'.
    Reads the notebook-level ``dataset`` frame and displays the figure with
    plotly's offline ``iplot``; nothing is returned.

    Parameters
    ----------
    feat1, feat2 : str
        Column names of ``dataset`` for the x and y axes.
    """
    is_healthy = dataset['Outcome'] == 0

    def _marker_trace(rows, label, colour):
        # One marker-only trace per patient group.
        return go.Scatter(
            x=rows[feat1],
            y=rows[feat2],
            name=label,
            mode='markers',
            marker=dict(color=colour, line=dict(width=1)),
        )

    # The diabetic trace comes first, so it is first in the legend.
    traces = [
        _marker_trace(dataset[~is_healthy], 'diabetic', '#FFD700'),
        _marker_trace(dataset[is_healthy], 'healthy', '#7EC0EE'),
    ]

    layout = dict(
        title=" ".join((feat1, "vs", feat2)),
        yaxis=dict(title=feat2, zeroline=False),
        xaxis=dict(title=feat1, zeroline=False),
    )

    py.iplot(dict(data=traces, layout=layout))

In [None]:
# Glucose on the x-axis against Age on the y-axis, split by Outcome.
plot_feat1_feat2('Glucose','Age')

In [None]:
# Pregnancies on the x-axis against Age on the y-axis, split by Outcome.
plot_feat1_feat2('Pregnancies','Age')

In [None]:
# Glucose on the x-axis against BloodPressure on the y-axis, split by Outcome.
plot_feat1_feat2('Glucose','BloodPressure')

In [None]:
# Row counts per Pregnancies value, with one bar per Outcome class.
sns.set_style('whitegrid')
sns.countplot(
    data=dataset,
    x='Pregnancies',
    hue='Outcome',
    palette='RdBu_r',
)

In [None]:
# Feature matrix: every column except the target.
x = dataset.drop(columns='Outcome')
# Target vector: the Outcome column.
y = dataset['Outcome']

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.3, random_state=101)

# 1. Building a Logistic Regression model

In [None]:
# The features are passed unscaled, so the solver gets a larger iteration
# budget than the default of 100 in order to reach convergence.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train,Y_train)

In [None]:
# Hard class predictions of the logistic regression model on the held-out rows.
predictions = logmodel.predict(X_test)

In [None]:
# Confusion matrix for the logistic regression predictions (rows: true class, columns: predicted class).
print(confusion_matrix(Y_test,predictions))

In [None]:
# Per-class precision, recall, F1 and support for the logistic regression predictions.
print(classification_report(Y_test,predictions))

In [None]:
# Precision-recall curve for the logistic regression model.
# The curve is traced by sweeping a threshold over continuous scores; hard 0/1
# predictions would collapse it to a single operating point, so the predicted
# probability of the positive class (column 1) is used instead.
log_scores = logmodel.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(Y_test, log_scores)
average_precision = average_precision_score(Y_test, log_scores)

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
          average_precision))
plt.show()

# 2. Building a Decision tree model

In [None]:
# Decision tree with Gini impurity; a node needs at least 30 samples to be split.
# random_state is pinned (same seed as the train/test split) so that the fitted
# tree and the metrics derived from it are reproducible across runs.
clf=tree.DecisionTreeClassifier(criterion='gini',min_samples_split=30,splitter="best",
                                random_state=101)
clf=clf.fit(X_train,Y_train)
y_pred=clf.predict(X_test)

In [None]:
# Test-set metrics for the decision tree predictions.
average_precision = average_precision_score(y_true=Y_test, y_score=y_pred)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)

# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=Y_test, y_pred=y_pred))


In [None]:
# Per-class precision, recall, F1 and support for the decision tree predictions.
print(classification_report(Y_test,y_pred))

In [None]:
# Precision-recall curve for the decision tree.
# Continuous scores (predicted probability of the positive class, column 1) are
# needed to sweep a threshold; hard 0/1 predictions give a single operating
# point. AP is computed here from the same scores so the title matches the curve.
tree_scores = clf.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(Y_test, tree_scores)
average_precision = average_precision_score(Y_test, tree_scores)

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
          average_precision))
plt.show()

In [None]:
# Accuracy on test data for the decision tree.
# This cell belongs to the decision-tree section, so it scores `y_pred` (the
# tree's predictions) rather than `predictions` (logistic regression).
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy on Test Data: {accuracy*100}%")
print(f"Precision Score: {precision_score(Y_test, y_pred)}")
print(f"Recall Score: {recall_score(Y_test, y_pred)}")
print(f"F1 Score: {f1_score(Y_test, y_pred)}")

# Draw the confusion matrix that the title announces
# (rows: true class, columns: predicted class).
sns.heatmap(confusion_matrix(Y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix for Test Data")
plt.show()


# 3. Building a KNN model

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier().fit(X_train, Y_train)
y_pred_knn = knn.predict(X_test)

In [None]:
# Test-set metrics for the KNN predictions.
average_precision = average_precision_score(y_true=Y_test, y_score=y_pred_knn)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred_knn)

# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=Y_test, y_pred=y_pred_knn))

In [None]:
# Per-class precision, recall, F1 and support for the KNN predictions.
print(classification_report(Y_test,y_pred_knn))

In [None]:
# Precision-recall curve for the KNN model.
# Continuous scores (predicted probability of the positive class, column 1) are
# needed to sweep a threshold; hard 0/1 predictions give a single operating
# point. AP is computed here from the same scores so the title matches the curve.
knn_scores = knn.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(Y_test, knn_scores)
average_precision = average_precision_score(Y_test, knn_scores)

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
          average_precision))
plt.show()

# 4. Building an SVM model

In [None]:
# Support vector classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and fitting are chained.
Svm_model = SVC().fit(X_train, Y_train)
y_pred_svm = Svm_model.predict(X_test)

In [None]:
# Test-set metrics for the SVM predictions.
average_precision = average_precision_score(y_true=Y_test, y_score=y_pred_svm)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred_svm)

# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=Y_test, y_pred=y_pred_svm))

In [None]:
# Precision-recall curve for the SVM.
# Continuous scores are needed to sweep a threshold; hard 0/1 predictions give
# a single operating point. The model is built without probability estimates,
# so the signed distance to the decision boundary is used as the score.
# AP is computed here from the same scores so the title matches the curve.
svm_scores = Svm_model.decision_function(X_test)
precision, recall, _ = precision_recall_curve(Y_test, svm_scores)
average_precision = average_precision_score(Y_test, svm_scores)

plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
          average_precision))
plt.show()

# 5. Building a Random Forest model

In [None]:
# Random forest with default hyper-parameters. random_state is pinned (same
# seed as the train/test split) because the forest uses bootstrap sampling and
# random feature subsets, which otherwise change the metrics from run to run.
Random_forest=RandomForestClassifier(random_state=101)
Random_forest.fit(X_train,Y_train)
y_pred_forest=Random_forest.predict(X_test)

In [None]:
# Test-set metrics for the random forest predictions.
average_precision = average_precision_score(y_true=Y_test, y_score=y_pred_forest)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred_forest)

# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=Y_test, y_pred=y_pred_forest))