In [21]:
import math
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [22]:
# Location of the diabetes CSV. Set the DIABETES_CSV environment variable to
# run the notebook on another machine; when it is unset, the original local
# path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get("DIABETES_CSV", r"C:\Users\snapp\Desktop\AI\Dataset\diabetes.csv")
df = pd.read_csv(DATA_PATH)

In [23]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(768, 9)

In [24]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [25]:
# Preview the first rows of the raw data (before zero values are replaced below).
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [26]:
# These columns can not have zero values, so a 0 is treated as a missing value.
# Replace zeros with the mean of the column's non-zero entries.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in zero_not_accepted:
    # Mark zeros as missing. Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    df[column] = df[column].replace(0, np.nan)
    # NOTE(review): int() truncates the mean (this also affects the float BMI column);
    # kept as-is so the recorded results below stay reproducible.
    # NOTE(review): the mean is computed over the whole dataset, i.e. before the
    # train/test split in the next cell, so test rows influence the fill value.
    mean = int(df[column].mean(skipna=True))
    df[column] = df[column].fillna(mean)

In [27]:
# Split Data
# Features are the first eight columns; the target is the ninth column.
X, y = df.iloc[:, :8], df.iloc[:, 8]

# 80/20 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=101,
)

In [28]:
# Scale Data
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [29]:
# Choose number of neighbors - it should be odd
# (with two classes, an odd k avoids tied votes).
# Starting point: the square root of the sample count; `math` is imported in
# the import cell at the top of the notebook.
# NOTE(review): this uses the size of the test set; confirm that is intended
# rather than the size of the training set.
math.sqrt(len(y_test))

12.409673645990857

In [30]:
# Build an 11-neighbour classifier with Euclidean distance and fit it on the
# scaled training data.
knn = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)
knn.fit(X_train, y_train)

In [31]:
# Predict a class label for every row of the scaled test features.
y_pred = knn.predict(X_test)

In [32]:
# Evaluate Model
# Here the F1 score comes out lower than the accuracy score; with unbalanced
# classes F1 is usually the more telling of the two.
for label, metric in (
    ('f1 score:', f1_score),
    ('accuracy_score:', accuracy_score),
    ('precision_score:', precision_score),
    ('recall_score:', recall_score),
):
    print(label, round(metric(y_test, y_pred), 3))

# Raw confusion matrix (kept in `cm`), plus a labelled cross-tabulation with
# row/column totals as the cell's displayed result.
cm = confusion_matrix(y_test, y_pred)
pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

f1 score: 0.72
accuracy_score: 0.818
precision_score: 0.735
recall_score: 0.706


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90,13,103
1,15,36,51
All,105,49,154


F1 Score = 2 * (Precision * Recall) / (Precision + Recall)

Recall = number of actual positives that were predicted as 1 / total number of actual positives = TP / (TP + FN)

Precision = how many of the returned hits (predicted as 1) were true positives = TP / (TP + FP)

Accuracy = (TP + TN)/(TP + TN + FP + FN)

In [33]:
# Per-class precision, recall, F1 and support, plus the overall averages.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87       103
           1       0.73      0.71      0.72        51

    accuracy                           0.82       154
   macro avg       0.80      0.79      0.79       154
weighted avg       0.82      0.82      0.82       154



In [36]:
# Calculate With Formula
# Accuracy as a percentage: correctly predicted rows divided by all test rows.
n_correct = (y_pred == y_test).sum()
accuracy = n_correct / len(y_test) * 100
accuracy

81.81818181818183

In [48]:
# Put the true labels and the model's predictions side by side, one row per
# test sample (the frame takes its index from y_test).
results = pd.DataFrame({'Test': y_test, 'Predicted': y_pred})

In [92]:
# Tally the confusion-matrix cells by hand from the Test/Predicted frame.
# Every quantity is a row count, obtained by summing a boolean mask.
test_pos = results['Test'] == 1
test_neg = results['Test'] == 0
pred_pos = results['Predicted'] == 1
pred_neg = results['Predicted'] == 0

# Class totals. Count rows rather than summing label values: summing the
# labels of the negative class (all zeros) always yields 0.
positive = test_pos.sum()
negative = test_neg.sum()

true_positive = (test_pos & pred_pos).sum()
false_positive = (test_neg & pred_pos).sum()
false_negative = (test_pos & pred_neg).sum()
true_negative = (test_neg & pred_neg).sum()

In [98]:
# Recall: share of the actual positives that the model predicted as positive.
actual_positives = true_positive + false_negative
recall = true_positive / actual_positives
recall

0.7058823529411765

In [100]:
# Precision: share of the predicted positives that are actually positive.
predicted_positives = true_positive + false_positive
precision = true_positive / predicted_positives
precision

0.7346938775510204

In [102]:
# Accuracy as a fraction: correct predictions over all predictions.
# This rebinds `accuracy`, which the earlier formula cell set to a percentage.
correct_predictions = true_positive + true_negative
all_predictions = true_positive + true_negative + false_positive + false_negative
accuracy = correct_predictions / all_predictions
accuracy

0.8181818181818182

![MHodz.png](attachment:MHodz.png)