In [15]:
#Required Libraries
import numpy as np  
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [16]:
data=pd.read_csv("diabetes2 (1).csv")

In [17]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [18]:
data.Outcome.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [19]:
#Grab features and label from dataframe
x = data[['Pregnancies','Glucose', 'BloodPressure', 'Age', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']].values
y = data['Outcome'].values

print(x.shape)
print(y.shape)

(768, 8)
(768,)


In [20]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
x_vaild,x_test,y_vaild,y_test=train_test_split(x_test,y_test,test_size=0.5,random_state=42)

print(len(x_train))
print(len(x_vaild))
print(len(x_test))

537
115
116


In [21]:
class LogisticRegression:
    
    def __init__(self, l_rate=0.01, iterations=1000):  
        self.l_rate = l_rate  
        self.iterations = iterations 
        
    def scale(self, x):  
        x_scaled = x - np.mean(x, axis=0)
        x_scaled = x_scaled / np.std(x_scaled, axis=0)
        return x_scaled
    
    def fit(self, x, y):  
        self.losses = []  
        self.theta = np.zeros((1 + x.shape[1])) 
        n = x.shape[0]
        
        x = self.scale(x)  
                
        for i in range(self.iterations):
            #Step1
            y_pred = self.theta[0] + np.dot(x, self.theta[1:])
            z = y_pred
            #Step2
            g_z =  1 / (1 + np.e**(-z))       
            
            #Step3
            cost = (1/n)*(-y * np.log(g_z) - (1 - y) * np.log(1 - g_z))
            self.losses.append(cost) 
            
            #Step4
            d_theta1 = (1/n) * np.dot(x.T, (g_z - y)) 
            d_theta0 = (1/n) * np.sum(g_z - y)  
            
            #Step5
            self.theta[1:] = self.theta[1:] - self.l_rate * d_theta1  
            self.theta[0] = self.theta[0] - self.l_rate * d_theta0       
        return self
    
    
    def predict(self, x):  
        x = self.scale(x)  
       
        y_pred = self.theta[0] + np.dot(x, self.theta[1:]) 
        z = y_pred
        g_z = 1 / (1 + np.e**(-z))
        return [1 if i > 0.4 else 0 for i in g_z] 

In [22]:
model=LogisticRegression()
model.fit(x_train,y_train)

<__main__.LogisticRegression at 0x1fc074d28f0>

In [23]:
print("theta",model.theta)

theta [-0.64304339  0.18750749  0.80107698 -0.03747003  0.34146402 -0.02341784
  0.0351371   0.5501931   0.15759423]


In [24]:
y_preed_train=model.predict(x_train)
y_preed_vaild=model.predict(x_vaild)

In [25]:
train_acc=metrics.accuracy_score(y_train, y_preed_train)
vaild_acc=metrics.accuracy_score(y_vaild, y_preed_vaild)
print("-------train accuracy-----")
print(train_acc)
print("------valid accuracy------")
print(vaild_acc)

-------train accuracy-----
0.7653631284916201
------valid accuracy------
0.6782608695652174


In [26]:
y_preed_test=model.predict(x_test)
test_acc=metrics.accuracy_score(y_test,y_preed_test)
print("------test accuracy-------")
print(test_acc)

------test accuracy-------
0.7155172413793104


In [27]:
num_spilts=5
kfold=StratifiedKFold(num_spilts,shuffle=True,random_state=1)
train_acc,test_acc=[],[]
for train_index,test_index in kfold.split(x,y):
    x_train,x_test=x[train_index],x[test_index]
    y_train,y_test=y[train_index],y[test_index]

    model.fit(x_train,y_train)
    y_pred_train=model.predict(x_train)
    y_pred_test=model.predict(x_test)
    train_acc.append(metrics.accuracy_score(y_train,y_pred_train)*100)
    test_acc.append(metrics.accuracy_score(y_test,y_pred_test)*100)
print("Train Accuracy: ",np.mean(train_acc), "+/-", np.std(train_acc))
print("Test Accuracy: ",np.mean(test_acc), "+/-", np.std(test_acc))

Train Accuracy:  75.94316887794284 +/- 1.2904658872109065
Test Accuracy:  75.2533740769035 +/- 3.187892596173201
