In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the diabetes dataset. The path is relative, so it is resolved against
# the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')

# Preview the first five rows as a quick sanity check on the columns.
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [2]:
# EDA is skipped here because it was already done in mini-project 2.
# Based on that earlier look at the data and its correlation heatmap, these are
# the predictors judged most important -- a candidate subset for feature selection.
imp_pred = [
    'Age',
    'BMI',
    'Pregnancies',
    'Glucose',
    'DiabetesPedigreeFunction',
    'Insulin',
]
# Note: the model seems to perform better when all predictors are kept, so this
# subset is recorded for reference rather than used to fit the model.

In [3]:
# Response: the 'Outcome' column.
y = data['Outcome']

# Predictors: every column except the response.
# (Alternative using only the hand-picked subset: X = data[imp_pred])
X = data.drop(columns=['Outcome'])

# Standardize ALL predictors so they share a common scale before fitting.
# The scaler is fit on the full dataset (there is no train/test split).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
# Fit the logistic regression on the full standardized dataset. The data is
# not split, so 100% of the rows are used for training.
model = LogisticRegression(max_iter=100).fit(X_scaled, y)

# Tabulate one row per predictor with its fitted coefficient so every
# coefficient can be inspected side by side.
coef_output = pd.DataFrame(
    {
        'Predictor': X.columns,
        # coef_ is indexed by row first; row 0 holds the coefficients used here.
        'Coefficient': model.coef_[0],
    }
)
coef_output

Unnamed: 0,Predictor,Coefficient
0,Pregnancies,0.408804
1,Glucose,1.107349
2,BloodPressure,-0.250794
3,SkinThickness,0.00901
4,Insulin,-0.130753
5,BMI,0.696354
6,DiabetesPedigreeFunction,0.308889
7,Age,0.176551


In [5]:
# Exponentiate each coefficient to obtain its odds ratio, which is what makes
# a coefficient interpretable: the multiplicative change in the odds of a
# positive outcome for a one-unit increase in that predictor.
# The model was fit on standardized predictors, so "one unit" here means one
# standard deviation of the original variable.
coef_output['odds ratio'] = coef_output['Coefficient'].pipe(np.exp)
coef_output

Unnamed: 0,Predictor,Coefficient,odds ratio
0,Pregnancies,0.408804,1.505017
1,Glucose,1.107349,3.026325
2,BloodPressure,-0.250794,0.778182
3,SkinThickness,0.00901,1.00905
4,Insulin,-0.130753,0.877434
5,BMI,0.696354,2.006424
6,DiabetesPedigreeFunction,0.308889,1.361911
7,Age,0.176551,1.193095


In [6]:
# Predict on the same standardized data the model was trained on.
y_pred = model.predict(X_scaled)

# Error rate is the complement of accuracy. Because predictions are made on
# the training data, this is a training error rate, not a held-out estimate.
accuracy = accuracy_score(y_true=y, y_pred=y_pred)
error_rate = 1 - accuracy

print(f'Training Error Rate: {error_rate}')

Training Error Rate: 0.21614583333333337


In [8]:
# Cross-tabulate true vs. predicted labels on the training data
# (scikit-learn convention: rows are true classes, columns are predicted classes).
conf_matrix = confusion_matrix(y_true=y, y_pred=y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

Confusion Matrix:
[[446  54]
 [112 156]]
