In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
%matplotlib inline
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split 

In [2]:
# The file name has no directory component, so it is resolved against the
# notebook's current working directory.
DATA_PATH = "Pima_Indian_diabetes.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Summary statistics for every numeric column of the freshly loaded frame:
# `count` exposes columns with missing values, `min`/`max` expose
# out-of-range entries.
df.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,742.0,752.0,768.0,746.0,768.0,757.0,768.0,749.0,768.0
mean,3.866601,119.966097,68.886078,20.309879,79.799479,31.711151,0.471876,33.761336,0.348958
std,3.479971,32.367659,19.427448,15.974523,115.244002,8.544789,0.331329,12.297409,0.476951
min,-5.412815,0.0,-3.496455,-11.94552,0.0,-16.288921,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.1,0.24375,24.0,0.0
50%,3.0,116.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.25,36.5,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# SkinThickness is removed from the frame and takes no part in anything below.
df = df.drop('SkinThickness', axis='columns')

# Predictor columns used for imputation, clipping and modelling.
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

In [5]:
# Mean-impute missing values in every feature column.
#
# The filled columns are assigned back to `df` instead of calling
# `df[feature].fillna(..., inplace=True)`: the in-place form mutates an
# intermediate object returned by the column selection, which raises a
# FutureWarning on pandas 2.x and is a silent no-op under Copy-on-Write,
# leaving the NaNs in `df`.
#
# `DataFrame.mean()` skips NaN by default, so each column is filled with the
# mean of its observed values; `fillna` matches the means to columns by name.
df[features] = df[features].fillna(df[features].mean())

In [6]:
# Treat an Insulin reading of exactly 0 as missing and impute it from Glucose
# using a one-feature linear regression fitted on the rows with non-zero Insulin.

# The predictions are fractional, so make the column float up front instead of
# relying on pandas to upcast an integer column when the values are written.
df['Insulin'] = df['Insulin'].astype(float)

zero_insulin_mask = df['Insulin'] == 0
zero_insulin_data = df[zero_insulin_mask]
non_zero_insulin_data = df[~zero_insulin_mask]

# scikit-learn rejects empty arrays in both fit() and predict(), so only run
# the imputation when there is something to learn from and something to fill.
if len(zero_insulin_data) > 0 and len(non_zero_insulin_data) > 0:
    # Double brackets keep the 2-D (n_samples, 1) shape the estimator expects.
    train_X = non_zero_insulin_data[['Glucose']].to_numpy()
    train_y = non_zero_insulin_data[['Insulin']].to_numpy()
    val_X = zero_insulin_data[['Glucose']].to_numpy()

    model = LinearRegression()
    model.fit(train_X, train_y)
    predicted_y = model.predict(val_X)

    # One masked assignment replaces the row-by-row loop: predictions are in the
    # same row order as the masked rows, so they line up positionally.
    # NOTE(review): a straight-line fit is unbounded, so low-Glucose rows can
    # receive negative Insulin estimates — confirm this is acceptable downstream.
    df.loc[zero_insulin_mask, 'Insulin'] = predicted_y.ravel()


In [7]:
# Re-inspect the summary statistics after imputation; `count` shows whether any
# column still has missing values and `min` shows whether negatives remain.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.866601,119.966097,68.886078,152.778203,31.711151,0.471876,33.761336,0.348958
std,3.42048,32.028277,19.427448,98.66818,8.483295,0.331329,12.144139,0.476951
min,-5.412815,0.0,-3.496455,-109.89624,-16.288921,0.078,21.0,0.0
25%,1.0,99.0,62.0,90.0,27.275,0.24375,24.0,0.0
50%,3.0,117.0,72.0,135.0,32.0,0.3725,30.0,0.0
75%,6.0,139.0,80.0,190.075164,36.425,0.62625,41.0,1.0
max,17.0,199.0,122.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# Raise every negative entry in the feature columns to 0; values that are
# already non-negative (and any NaN) are left as they are.
df[features] = df[features].clip(lower=0)

In [9]:
# Summary statistics once more, to check the `min` row of the feature columns
# after negative values were replaced with 0.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.885689,119.966097,68.895389,153.368229,31.760002,0.471876,33.761336,0.348958
std,3.384891,32.028277,19.393614,97.423387,8.261584,0.331329,12.144139,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,90.0,27.275,0.24375,24.0,0.0
50%,3.0,117.0,72.0,135.0,32.0,0.3725,30.0,0.0
75%,6.0,139.0,80.0,190.075164,36.425,0.62625,41.0,1.0
max,17.0,199.0,122.0,846.0,67.1,2.42,81.0,1.0


In [10]:
# Cap Insulin at 250: anything above the ceiling is set to exactly 250,
# everything else is unchanged.
df['Insulin'] = df['Insulin'].clip(upper=250)

In [11]:
# Describe the boolean "is missing" mask: a column with a single unique value
# of False contains no missing entries.
df.isna().describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768,768,768,768,768,768,768,768
unique,1,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False,False
freq,768,768,768,768,768,768,768,768


In [12]:
# Estimate out-of-sample accuracy with repeated random 80/20 hold-out splits:
# a fresh logistic regression is fitted on each split and its validation
# accuracy is recorded as a percentage.
N_RUNS = 100    # number of independent train/validation splits
BASE_SEED = 0   # split i is seeded with BASE_SEED + i so every run is repeatable

# Select predictors and target by name rather than by column position, and do
# it once outside the loop so the notebook-level `df` is never rebound or
# reordered by this cell.
X = df[features]
y = df['Outcome']

accuracy = []

for i in range(N_RUNS):
    # train_test_split shuffles the rows itself (shuffle=True by default), so a
    # separate reshuffle of the frame is unnecessary; the explicit random_state
    # makes each split, and therefore the final average, reproducible.
    trainX, valX, trainY, valY = train_test_split(
        X, y, test_size=0.20, random_state=BASE_SEED + i
    )
    lr = LogisticRegression(max_iter=2000, solver='lbfgs')
    lr.fit(trainX, trainY)
    # score() returns the accuracy as a fraction in [0, 1]; store it as a percentage.
    accuracy.append(lr.score(valX, valY) * 100)

In [13]:
# Arithmetic mean of the recorded per-split accuracy percentages.
n_runs = len(accuracy)
average_accuracy = sum(accuracy) / n_runs

In [14]:
# Report the mean validation accuracy (in percent) across all recorded splits.
print(average_accuracy)

76.75974025974023
