In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [4]:
# Read the diabetes CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="ML_dataset_diabetes.csv")

In [6]:
# Quick structural overview of `df`: dimensions, column names, dtypes and a
# few transposed row previews (transposing keeps wide frames readable).
print("----- INFORMATION OF DATASET -----")
print('Shape of Dataset (rows x columns):', df.shape)
print('Columns Name:', df.columns.tolist())
print('Total elements in dataset:', df.size)
print('\nDatatype of attributes:\n', df.dtypes)
print('\nFirst 5 rows:\n', df.head().T)
print('\nLast 5 rows:\n', df.tail().T)
# Seed the random preview so Restart & Run All shows the same rows every time,
# and cap the sample size so a frame with fewer than 5 rows does not raise.
print('\nAny 5 random rows:\n', df.sample(n=min(5, len(df)), random_state=42).T)

----- INFORMATION OF DATASET -----
Shape of Dataset (rows x columns): (768, 9)
Columns Name: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Pedigree', 'Age', 'Outcome']
Total elements in dataset: 6912

Datatype of attributes:
 Pregnancies        int64
Glucose            int64
BloodPressure      int64
SkinThickness      int64
Insulin            int64
BMI              float64
Pedigree         float64
Age                int64
Outcome            int64
dtype: object

First 5 rows:
                      0       1        2       3        4
Pregnancies      6.000   1.000    8.000   1.000    0.000
Glucose        148.000  85.000  183.000  89.000  137.000
BloodPressure   72.000  66.000   64.000  66.000   40.000
SkinThickness   35.000  29.000    0.000  23.000   35.000
Insulin          0.000   0.000    0.000  94.000  168.000
BMI             33.600  26.600   23.300  28.100   43.100
Pedigree         0.627   0.351    0.672   0.167    2.288
Age             50.000  31.00

In [8]:
# Descriptive statistics for every column of `df`, followed by a blank line.
print("\n----- STATISTICAL INFORMATION -----")
print(f"{df.describe()} \n")


----- STATISTICAL INFORMATION -----
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI    Pedigree         Age     Outcome  
count  768.000000  768.000000  768.000000  768.000000  
mean    31.992578    0.471876   33.240885    0.348958  
std      7.884160    0.331329   11.760232    0.476951  
min      0.000000    0.078000   21.000000    0.000000 

In [10]:
# Per-column count of null entries in `df`.
# NOTE(review): the describe() output in this notebook reports a minimum of 0
# for Glucose, BloodPressure, SkinThickness, Insulin and BMI. Those zeros may
# be placeholder "missing" values that a null check cannot see - confirm
# against the data source before trusting the counts below.
print("----- MISSING VALUES -----")
print(df.isna().sum(), end=" \n\n")

----- MISSING VALUES -----
Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
Pedigree         0
Age              0
Outcome          0
dtype: int64 



In [12]:
# Target vector: the 'Outcome' column.
y = df['Outcome']
# Feature matrix: every remaining column.
X = df.drop(columns='Outcome')

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on `y`; consider whether class
# proportions should be preserved across train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [16]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits so no test-set statistics leak into it.
# NOTE(review): X_train / X_test are overwritten with their scaled versions,
# so re-running this cell alone would fit the scaler on already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [18]:
# Fit a 5-nearest-neighbours classifier on the training split ...
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# ... and predict labels for the held-out split.
y_pred = knn.predict(X_test)

In [20]:
# Evaluate the held-out predictions against the true labels.
conf_matrix = confusion_matrix(y_test, y_pred)
precision, recall = (
    score(y_test, y_pred) for score in (precision_score, recall_score)
)
accuracy = accuracy_score(y_test, y_pred)
# Error rate is the complement of accuracy.
error_rate = 1 - accuracy

In [22]:
# Report the evaluation metrics, one per line, rounded to four decimals.
print(
    "----- K-NEAREST NEIGHBORS RESULTS -----",
    f"Confusion Matrix:\n {conf_matrix}",
    f"Accuracy: {accuracy:.4f}",
    f"Error Rate: {error_rate:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    sep="\n",
)

----- K-NEAREST NEIGHBORS RESULTS -----
Confusion Matrix:
 [[79 20]
 [27 28]]
Accuracy: 0.6948
Error Rate: 0.3052
Precision: 0.5833
Recall: 0.5091


In [4]:
# Show the DataFrame's dimensions as a (rows, columns) tuple.
# NOTE(review): the same shape is already printed by the dataset-information
# cell earlier in this notebook; this cell looks like a leftover scratch check.
df.shape

(768, 9)

In [5]:
# Show the DataFrame's dimensions as a (rows, columns) tuple.
# NOTE(review): exact repeat of the preceding df.shape cell - candidate for removal.
df.shape


(768, 9)

In [6]:
# Print the total number of cells (rows x columns) in the DataFrame.
# NOTE(review): this repeats a line from the dataset-information cell earlier
# in the notebook, and its execution count is out of sequence with the cells
# above - candidate for removal.
print('Total elements in dataset:', df.size)



Total elements in dataset: 6912
