In [None]:
%%capture
%pip install -r requirements.txt;

### Importing packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt
# Render floats with two decimals in every DataFrame/Series display.
# The fully qualified option name is used because pandas resolves abbreviated
# names by pattern matching, which can become ambiguous in later versions.
pd.set_option('display.float_format', '{:.2f}'.format)

### Loading data

In [None]:
# Feature column names, assigned positionally to the CSV columns, followed by
# the target column name.
# NOTE(review): passing `names=` assumes data.csv has no header row - confirm.
X_columns = [
    'Number of pregnancies',
    'Glucose',
    'Blood Pressure',
    'Skin thickness',
    'Insulin',
    'BMI',
    'Inheritance',
    'Age',
]
y_column = 'Has Diabetes'

# In every feature except 'Number of pregnancies' a 0 indicates a missing
# value, so 0 is registered as an NA marker for those columns only.
zero_means_missing = (name for name in X_columns if name != 'Number of pregnancies')
na_values = dict.fromkeys(zero_means_missing, 0)

# Read the file and drop every row with a missing value (for simplicity).
data = pd.read_csv(
    'data.csv',
    names=[*X_columns, y_column],
    na_values=na_values,
).dropna()

# All columns but the last are features; the last column is the label.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Show the first rows and the summary statistics of the cleaned table.
display(data.head(), data.describe())

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fixed seed so the train/test partition (and every score computed from it)
# is identical after Restart Kernel -> Run All.
RANDOM_STATE = 42

# Split BEFORE scaling. Fitting the scaler on all rows would let the test rows
# influence the min/max applied to the training rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Learn the per-feature min/max from the training rows only, then apply that
# same transformation to both partitions. Test values may therefore fall
# slightly outside [0, 1].
scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# All rows scaled with the training-set statistics; kept so that any cell
# referring to X_scaled keeps working.
X_scaled = scaler.transform(X)

In [None]:
# k-nearest-neighbours classifier using the 6 closest training samples.
# fit() returns the estimator itself, so construction and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_knn = knn_model.predict(X_test)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB

def transform_to_bins(col, percentiles=(25, 50, 75)):
    """Discretise a numeric column into percentile-based bins.

    The bin boundaries are the requested percentiles of ``col`` itself. Each
    value is replaced by the index of the bin it falls into.

    Parameters
    ----------
    col : array-like
        One-dimensional numeric values to discretise.
    percentiles : sequence of float, optional
        Percentiles (0-100, increasing) used as bin boundaries. The default
        ``(25, 50, 75)`` yields bin indices 0 to 3.

    Returns
    -------
    numpy.ndarray
        Integer bin index for every element of ``col``, ranging from 0 to
        ``len(percentiles)``.
    """
    boundaries = np.percentile(col, percentiles)
    # right=True makes each bin closed on its upper edge, so a value equal to
    # a boundary is placed in the lower of the two adjacent bins.
    return np.digitize(col, boundaries, right=True)

In [None]:
# Feature table for Naive Bayes: a new frame in which every feature column of
# X is replaced by its bin index. X itself is left untouched.
X_nb = X.assign(**{
    name: transform_to_bins(X[name].to_numpy())
    for name in X_columns
})

# Summary statistics of the binned features (cell output).
X_nb.describe()

In [None]:
# Reuse the row partition already chosen for y_train / y_test rather than
# drawing a second, independent random split. A fresh train_test_split would
# shuffle the rows differently, so the binned features would no longer line up
# with the labels they are trained and evaluated against.
X_nb_train = X_nb.loc[y_train.index]
X_nb_test = X_nb.loc[y_test.index]

In [None]:
# Categorical Naive Bayes on the binned features. min_categories=4 matches the
# four bins (indices 0-3) produced by the three percentile boundaries, so every
# feature is modelled with four categories even if a bin is absent from the
# training rows.
nb_model = CategoricalNB(force_alpha=True, min_categories=4).fit(X_nb_train, y_train)

# Predicted labels for the held-out binned rows.
y_pred_naive_bayes = nb_model.predict(X_nb_test)

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, trained on the
# scaled training rows. fit() returns the estimator, hence the chained call.
logreg_model = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_logreg = logreg_model.predict(X_test)

## Evaluation of models

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, ConfusionMatrixDisplay

def display_model_evaluation(y_true, y_pred, model_name=""):
    """Print accuracy and F1 for a set of predictions and plot the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model, in the same order as ``y_true``.
    model_name : str, optional
        Name used as the title of the confusion-matrix plot. When empty
        (the default) no title is set.

    Returns
    -------
    None
        The scores are printed and the plot is drawn as side effects.
    """
    # Score against the labels passed in, not the notebook-global y_test, so
    # the function is correct for any (y_true, y_pred) pair.
    print (f'Accuracy score is {accuracy_score(y_true, y_pred)}')
    print (f'F1 score is {f1_score(y_true, y_pred)}')
    print ('Confusion Matrix')
    cm = confusion_matrix(y_true, y_pred)
    cm_display = ConfusionMatrixDisplay(cm).plot(cmap=plt.cm.Blues)
    if model_name:
        # plot() returns the display object, whose ax_ is the matplotlib Axes.
        cm_display.ax_.set_title(model_name)

### KNN

In [None]:
# Scores and confusion matrix for the KNN predictions on the held-out labels.
display_model_evaluation(y_true=y_test, y_pred=y_pred_knn)

### Naive Bayes

In [None]:
# Scores and confusion matrix for the Naive Bayes predictions on the held-out labels.
display_model_evaluation(y_true=y_test, y_pred=y_pred_naive_bayes)

### Logistic regression

In [None]:
# Scores and confusion matrix for the logistic-regression predictions on the held-out labels.
display_model_evaluation(y_true=y_test, y_pred=y_pred_logreg)