In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import time

init_notebook_mode(connected=True)

In [3]:
def sigmoid(X, weight):
    z = np.dot(X, weight)
    return 1 / (1 + np.exp(-z))

In [4]:
# Using Loss minimizing
def loss(h, y):
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

def gradient_descent(X, h, y):
    return np.dot(X.T, (h - y)) / y.shape[0]
def update_weight_loss(weight, learning_rate, gradient):
    return weight - learning_rate * gradient

In [5]:
# Using Maximum likelihood estimation
def log_likelihood(x, y, weights):
    z = np.dot(x, weights)
    ll = np.sum(y*z - np.log(1 + np.exp(z)))
    return ll

def gradient_ascent(X, h, y):
    return np.dot(X.T, y - h)
def update_weight_mle(weight, learning_rate, gradient):
    return weight + learning_rate * gradient

In [6]:
data = pd.read_csv("diabetes.csv")
print("Dataset size")
print("Rows {} Columns {}".format(data.shape[0], data.shape[1]))

Dataset size
Rows 768 Columns 9


In [7]:
print("Columns and data types")
pd.DataFrame(data.dtypes).rename(columns = {0:'dtype'})

Columns and data types


Unnamed: 0,dtype
Pregnancies,int64
Glucose,int64
BloodPressure,int64
SkinThickness,int64
Insulin,int64
BMI,float64
DiabetesPedigreeFunction,float64
Age,int64
Outcome,int64


In [8]:
df = data.copy()

In [9]:
diabetic = [1, 0]
fig = {
    'data' : [
        {
            'x': df.loc[(df['Outcome']==outcome), 'Age'],
            'y': df.loc[(df['Outcome']==outcome), 'DiabetesPedigreeFunction'],
            'name': outcome, 'mode': 'markers',
        } for outcome in diabetic
    ],
    'layout': {
        'title' : 'Age vs Diabetes Pedigree Function 1 - Diabetes, 0 - Does not',
        'xaxis' : {'title' : 'Age'},
        'yaxis' : {'title' : 'Diabetes Pedigree Function'}
    }
}

py.offline.iplot(fig)

In [10]:
figs = []

for outcome in diabetic:
    figs.append(
        go.Box(
            y = df.loc[(df['Outcome']==outcome), 'Age'],
            name = outcome
        )
    )
layout = go.Layout(
    title = "Age",
    xaxis = {"title" : "Has diabetes? 1 - Yes, 0 - No"},
    yaxis = {"title" : "Age"},
    width=800,
    height=500
)

fig = go.Figure(data=figs, layout=layout)
py.offline.iplot(fig)

In [11]:
figs = []

for outcome in diabetic:
    figs.append(
        go.Box(
            y = df.loc[(df['Outcome']==outcome), 'DiabetesPedigreeFunction'],
            name = outcome
        )
    )
layout = go.Layout(
    title = "Diabetes Pedigree Function",
    xaxis = {"title" : "Has diabetes? 1 - Yes, 0 - No"},
    yaxis = {"title" : "Diabetes Pedigree Function"},
    width=800,
    height=500
)

fig = go.Figure(data=figs, layout=layout)
py.offline.iplot(fig)

In [12]:
_ = df.groupby('Outcome').size().reset_index()

data = [go.Bar(
    x = _['Outcome'].tolist(),
    y = _[0].tolist(),
    marker=dict(
        color=['rgba(255,190,134,1)', 'rgba(142,186,217,1)'])
)]
layout = go.Layout(
    title = "Outcome distribution",
    xaxis = {"title" : "Has diabetes? 1 - Yes, 0 - No"},
    width = 800,
    height = 500
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

In [13]:
# Assigning features to X and target to y
X = df[['Age','DiabetesPedigreeFunction']].copy()
X2 = df[['Age','DiabetesPedigreeFunction']].copy()
y = df['Outcome'].copy()

In [14]:
# Using Loss minimization
start_time = time.time()

num_iter = 100000

intercept = np.ones((X.shape[0], 1))
X = np.concatenate((intercept, X), axis=1)
theta = np.zeros(X.shape[1])

for i in range(num_iter):
    h = sigmoid(X, theta)
    gradient = gradient_descent(X, h, y)
    theta = update_weight_loss(theta, 0.1, gradient)
    
print("Training time (Log Reg using Gradient descent): " + str(time.time() - start_time) + " seconds")
print("Learing rate: {}\nIteration: {}".format(0.1, num_iter))

Training time (Log Reg using Gradient descent): 3.5176150798797607 seconds
Learing rate: 0.1
Iteration: 100000


In [15]:
result = sigmoid(X, theta)

In [16]:
f = pd.DataFrame(np.around(result, decimals=6)).join(y)
f['pred'] = f[0].apply(lambda x : 0 if x < 0.5 else 1)
print("Accuracy (Loss minimization): ")
f.loc[f['pred']==f['Outcome']].shape[0] / f.shape[0] * 100

Accuracy (Loss minimization): 


65.10416666666666

In [17]:
# Using Maximum Likelihood Estimation
start_time = time.time()
num_iter = 100000

intercept2 = np.ones((X2.shape[0], 1))
X2 = np.concatenate((intercept2, X2), axis=1)
theta2 = np.zeros(X2.shape[1])

for i in range(num_iter):
    h2 = sigmoid(X2, theta2)
    gradient2 = gradient_ascent(X2, h2, y) #np.dot(X.T, (h - y)) / y.size
    theta2 = update_weight_mle(theta2, 0.1, gradient2)
    
print("Training time (Log Reg using MLE):" + str(time.time() - start_time) + "seconds")
print("Learning rate: {}\nIteration: {}".format(0.1, num_iter))


overflow encountered in exp



Training time (Log Reg using MLE):3.1666371822357178seconds
Learning rate: 0.1
Iteration: 100000


In [18]:
result2 = sigmoid(X2, theta2)


overflow encountered in exp



In [18]:
print("Accuracy (Maximum Likelihood Estimation):")
f2 = pd.DataFrame(result2).join(y)
f2.loc[f2[0]==f2['Outcome']].shape[0] / f2.shape[0] * 100

Accuracy (Maximum Likelihood Estimation):


63.151041666666664

In [19]:
# Using sklearn LogisticRegression module
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(fit_intercept=True, max_iter=100000)
clf.fit(df[['Age', 'DiabetesPedigreeFunction']], y)
print("Training time (sklearn's LogisticRegression module):" + str(time.time() - start_time) + " seconds")
print("Learning rate: {}\nIteration: {}".format(0.1, num_iter))

Training time (sklearn's LogisticRegression module):9.11507511138916 seconds
Learning rate: 0.1
Iteration: 100000


In [20]:
result3 = clf.predict(df[['Age', 'DiabetesPedigreeFunction']])

In [21]:
print("Accuracy (sklearn's Logistic Regression):")
f3 = pd.DataFrame(result3).join(y)
f3.loc[f3[0]==f3['Outcome']].shape[0] / f3.shape[0] * 100

Accuracy (sklearn's Logistic Regression):


65.36458333333334

In [None]:
# Add more features or use combination of other 2 than Age and DiabetesPedigreeFunction to achieve better accuracy