In [None]:
''' AIM: The aim of this project is to develop a predictive model using machine learning techniques to accurately classify patients as
either diabetic or non-diabetic based on relevant features '''

In [None]:
# dataset: kaggle
# reading the file
import pandas as pd
import numpy as np
# Location of the Kaggle diabetes CSV, kept in one place so it is easy to change.
DATA_PATH = r"/diabetes-data.csv"

# Load the raw dataset; `file` stays untouched as the reference copy.
file = pd.read_csv(DATA_PATH)
file

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
summary_stats = file.describe()
summary_stats

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Work on a copy so the original DataFrame (`file`) is never modified.
fcopy = file.copy()

# Data preprocessing: describe() shows a minimum of 0 for these measurements,
# so a zero here is treated as a missing reading and converted to NaN for
# later imputation.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
fcopy[zero_as_missing] = fcopy[zero_as_missing].replace(0, np.nan)

In [None]:
# Number of missing (NaN) entries in each column after the zero -> NaN conversion.
null_counts = fcopy.isnull().sum()
print(null_counts)

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [None]:
# Since we have null values, replace them with the mean/median of each column:
# mean for Glucose and BloodPressure, median for SkinThickness, Insulin and BMI.
#
# The fill values are computed first and applied with a single DataFrame.fillna
# call whose result is assigned back. The previous form,
# `fcopy[col].fillna(..., inplace=True)`, operates on a temporary Series: it
# raises a chained-assignment FutureWarning on pandas 2.x and leaves `fcopy`
# unchanged once copy-on-write is enabled.
fill_values = {
    'Glucose': fcopy['Glucose'].mean(),
    'BloodPressure': fcopy['BloodPressure'].mean(),
    'SkinThickness': fcopy['SkinThickness'].median(),
    'Insulin': fcopy['Insulin'].median(),
    'BMI': fcopy['BMI'].median(),
}
fcopy = fcopy.fillna(value=fill_values)

In [None]:
# Class balance of the target column.
# 0 = Non-Diabetics, 1 = Diabetics
outcome_counts = file["Outcome"].value_counts()
print(outcome_counts)

Outcome
0    500
1    268
Name: count, dtype: int64


In [None]:
# x = independent variables (the eight feature columns),
# y = dependent variable (Outcome).
# Columns are selected by name rather than position so the split does not
# depend on column order. y is taken as a 1-D Series: passing a one-column
# DataFrame to scikit-learn's fit() triggers a DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected").
x = fcopy.drop(columns='Outcome')
y = fcopy['Outcome']
y

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [None]:
# Hold out part of the data for evaluation: 80% of the rows go to training,
# the remainder to testing. random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, train_size=0.80, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Model training: k-nearest-neighbours classifier fitted on the training split.
# NOTE(review): the features are fed in unscaled; describe() shows very
# different ranges (e.g. Insulin up to 846 vs. DiabetesPedigreeFunction up to
# 2.42), so distances are likely dominated by the large-range columns.
# Consider standardising before KNN -- confirm the effect on accuracy.
from sklearn.neighbors import KNeighborsClassifier

N_NEIGHBORS = 7  # k = 7
clas = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
clas.fit(X_train, Y_train)

In [None]:
# Manual prediction for a single hand-written patient record.
# The sample is wrapped in a one-row DataFrame that reuses the training
# columns, so each value is tied to a named feature. Passing a bare nested
# list to a model fitted on a DataFrame makes scikit-learn warn that X has no
# valid feature names, and silently relies on positional order.
# Value order: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin,
# BMI, DiabetesPedigreeFunction, Age.
# NOTE(review): BMI=0 and DiabetesPedigreeFunction=0 lie outside the training
# data (BMI zeros were imputed; the DPF minimum is 0.078) -- confirm that this
# sample is intentional.
# Alternative sample to try: [1, 200, 75, 85, 120, 40.1, 0.6, 50]
manual_sample = pd.DataFrame(
    [[2, 99, 80, 23, 16, 0, 0, 20]],
    columns=X_train.columns,
)
y_pred = clas.predict(manual_sample)
y_pred



array([0])

In [None]:
# Predict labels for the held-out split and tabulate them against the true
# labels (rows = actual class, columns = predicted class).
from sklearn.metrics import confusion_matrix

y_pred = clas.predict(X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
cm

array([[89, 18],
       [22, 25]])

In [None]:
# Fraction of test-set predictions that match the true labels.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
accuracy

0.7402597402597403

In [None]:
# Feature columns whose linear relationship with the target is examined.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
features = fcopy[feature_columns]

# Pearson correlation of each feature with the Outcome column
# (corrwith uses Pearson by default).
target = fcopy['Outcome']
correlation_with_target = features.corrwith(target)

# Show the per-feature coefficients.
print(correlation_with_target)

Pregnancies                 0.221898
Glucose                     0.492928
BloodPressure               0.166074
SkinThickness               0.214873
Insulin                     0.203790
BMI                         0.312038
DiabetesPedigreeFunction    0.173844
Age                         0.238356
dtype: float64


In [None]:
'''Conclusion:
~  Model's performance: Accuracy=74%
~  The machine learning model demonstrated
   promising performance in predicting diabetes risk, with Glucose and BMI showing the strongest correlations with the outcome.'''