<a href="https://colab.research.google.com/github/irenenjoki/machine-learning/blob/main/dataset_diabetes_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('/content/diabetes.csv')

# Display the first few rows of the dataset
print(data.head())

# Check for zero entries in columns that shouldn't have zeros
print((data == 0).sum())

# Replace zeros with NaN in specific columns
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_with_zeros] = data[cols_with_zeros].replace({0: np.nan})

# Display summary of the dataset to check for NaNs
print(data.info())

# Fill NaNs with the median value of each column
imputer = SimpleImputer(strategy='median')
data[cols_with_zeros] = imputer.fit_transform(data[cols_with_zeros])

# Verify that there are no more NaNs
print(data.info())

# Check the distribution of the data
print(data.describe())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64
<c

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Separate features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the models
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)

# Define k-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate KNN with cross-validation
knn_scores = cross_val_score(knn, X_scaled, y, cv=kf, scoring='accuracy')
print(f'KNN Mean Accuracy: {knn_scores.mean()}')
print(f'KNN Standard Deviation: {knn_scores.std()}')

# Evaluate Decision Tree with cross-validation
dt_scores = cross_val_score(dt, X, y, cv=kf, scoring='accuracy')
print(f'Decision Tree Mean Accuracy: {dt_scores.mean()}')
print(f'Decision Tree Standard Deviation: {dt_scores.std()}')


KNN Mean Accuracy: 0.735663021189337
KNN Standard Deviation: 0.059518898446810464
Decision Tree Mean Accuracy: 0.706971975393028
Decision Tree Standard Deviation: 0.06324424749779516
