In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
df = pd.read_csv('pima-indians-diabetes.data.csv', header = None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
columns = ['Pregnancies','Glucose','BP','Skin Thickness','Insulin','BMI',
          'Diabetes Pedigries','Age','Outcome']

In [5]:
df.columns = columns

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BP,Skin Thickness,Insulin,BMI,Diabetes Pedigries,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.shape

(768, 9)

In [8]:
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [9]:
x.shape

(768, 8)

In [10]:
y.shape

(768,)

In [11]:
x[0]

array([  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
        50.   ])

In [12]:
df.shape[0]

768

In [13]:
df.shape[1]

9

In [14]:
df.values

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [15]:
# Feature Scaling - Normalization


def minMaxScaler(df):
    n = df.shape[0]
    df = df.values
    for i in range(0,df.shape[1] - 1):
        min_val = min(df[:,i])
        max_val = max(df[:,i])
        for j in range(n):
            numer = df[j][i] - min_val
            denom = max_val - min_val
            df[j][i] = numer / denom
    return df


In [16]:
df = minMaxScaler(df)

In [17]:
df[0]

array([0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
       0.50074516, 0.23441503, 0.48333333, 1.        ])

In [18]:
# Divide data = Train_Test_Split using K-Fold Cross Validation Technique

def kfold(dataset,k=5):
    dataset_copy = list(dataset)
    fold_size = dataset.shape[0] // k
    folds = []
    for i in range(k):
        fold =[]
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        folds.append(fold)
    return folds

In [19]:
# Predict function to get the predictions

def predict(coef, row):
    x = coef[0]
    for i in range(len(row)):
        x += coef[i + 1] * row[i]
    return 1 / (1 + np.exp(-x))

In [20]:
# to find out accuracy of our model

def accuracy(y_test,y_pred):
    count = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_test[i]:
            count +=1
    return count / len(y_pred) * 100

In [26]:
# Gradient Descent
# - Batch
# - Stochastic
# - Mini Batch
def gradient_descent(x_train, y_train, epochs, alpha):
    coef = [0] * (x_train.shape[1] + 1)
    n = x_train.shape[0]
    # logic to calculate coef using stochastic gradient descent
    for epoch in range(epochs):
        for i in range(len(x_train)):
            output = predict(coef, x_train[i])
            loss = y_train[i] - output
            coef[0] = coef[0] - ((2/n) * loss) * alpha
            for j in range(x_train.shape[1]):
                coef[j + 1] = coef[j + 1] - ((2/n) * loss * x_train[i][j]) * alpha
    return coef

In [27]:
def logistic(x_train,y_train,x_test,y_test,epochs,alpha):
    coef = gradient_descent(x_train, y_train, epochs, alpha)
    predictions =[]
    for row in x_test:
        y_pred = predict(coef, row)
        predictions.append(np.round(y_pred))
    acc_score = accuracy(y_test,predictions)
    return acc_score

In [28]:
folds = kfold(df)

In [29]:
np.asarray(folds).shape

(5, 153, 9)

In [30]:
folds = np.asarray(folds)

In [31]:
folds[0]

array([[0.11764706, 0.61306533, 0.42622951, ..., 0.31511529, 0.11666667,
        0.        ],
       [0.17647059, 0.64824121, 0.52459016, ..., 0.06020495, 0.11666667,
        1.        ],
       [0.23529412, 0.72864322, 0.67213115, ..., 0.06703672, 0.81666667,
        1.        ],
       ...,
       [0.41176471, 0.59798995, 0.        , ..., 0.0559351 , 0.26666667,
        0.        ],
       [0.17647059, 0.44723618, 0.60655738, ..., 0.20196413, 0.28333333,
        0.        ],
       [0.11764706, 0.43718593, 0.47540984, ..., 0.03757472, 0.06666667,
        0.        ]])

In [32]:
def evaluate(dataset,epochs,alpha):
    folds = kfold(dataset)
    for i in range(len(folds)):
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        train = list(folds)
        train.pop(i)
        for train_fold in train:
            for data in train_fold:
                x_train.append(data[:-1])
                y_train.append(data[-1])
                
        for data in folds[i]:
            x_test.append(data[:-1])
            y_test.append(data[-1])
            
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        x_test = np.asarray(x_test)
        y_test = np.asarray(y_test)
        acc = logistic(x_train,y_train,x_test,y_test,epochs,alpha)
        print("Folds : {} Accuracy is {}".format(i,acc))
        

In [33]:
epochs = 100
alpha = 0.01
evaluate(df,epochs,alpha)


Folds : 0 Accuracy is 39.869281045751634
Folds : 1 Accuracy is 34.64052287581699
Folds : 2 Accuracy is 37.254901960784316
Folds : 3 Accuracy is 32.6797385620915
Folds : 4 Accuracy is 30.718954248366014
