In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the diabetes dataset from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Preview the first five rows (5 is head()'s default) as the cell output.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
def category_bmi(bmi):
  """Map a numeric BMI value to its weight-category label.

  The bins are contiguous half-open intervals, so every real number falls
  into exactly one category (values such as 24.95 or 29.95 no longer slip
  between the bins and come back as None):

    bmi < 18.5         -> 'Underweight'
    18.5 <= bmi < 25   -> 'Normal weight'
    25 <= bmi < 30     -> 'Overweight'
    bmi >= 30          -> 'Obesity'

  Args:
    bmi: BMI as an int or float.

  Returns:
    The category label as a string, or None when `bmi` is NaN (every
    comparison against NaN is False, so no bin matches).
  """
  if bmi < 18.5:
    return 'Underweight'
  if bmi < 25:
    return 'Normal weight'
  if bmi < 30:
    return 'Overweight'
  # Explicit lower bound (rather than a bare fall-through) so NaN is not
  # mislabelled as 'Obesity'.
  if bmi >= 30:
    return 'Obesity'
  return None
# Derive the categorical BMI bucket for every row, element by element.
df['BMI_category'] = df['BMI'].map(category_bmi)

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a validation set; the fixed seed keeps the split repeatable.
train, val = train_test_split(
  df,
  random_state=42,
  test_size=0.2,
)

In [5]:
from sklearn.preprocessing import StandardScaler
# Columns excluded from standardisation: the target and the string-valued BMI bucket.
unscaled_cols = ['Outcome', 'BMI_category']

scaler = StandardScaler()
# The scaler is fitted on the training rows only; the validation rows are
# transformed with those same fitted statistics.
train_scaled = scaler.fit_transform(train.drop(columns=unscaled_cols))
val_scaled = scaler.transform(val.drop(columns=unscaled_cols))

In [6]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the BMI bucket. The encoder is fitted on the training rows
# only and is persisted later in this notebook for reuse, so it can meet a
# category that was absent from the training split. With the default
# handle_unknown='error' that would make transform() raise; 'ignore' encodes
# an unseen category as an all-zero row instead. Output for categories seen
# during fit is unchanged.
ohe = OneHotEncoder(handle_unknown='ignore')
train_encoded = ohe.fit_transform(train[['BMI_category']])
val_encoded = ohe.transform(val[['BMI_category']])

In [None]:
# Convert the encoder output to dense arrays so it can sit beside the scaled numeric block.
train_bmi_dense = train_encoded.toarray()
val_bmi_dense = val_encoded.toarray()

# Feature matrices: scaled numeric columns first, one-hot BMI columns appended on the right.
X_train = np.concatenate([train_scaled, train_bmi_dense], axis=1)
X_val = np.concatenate([val_scaled, val_bmi_dense], axis=1)

# Targets are read straight from the split frames.
y_train, y_val = train['Outcome'], val['Outcome']

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Try several neighbourhood sizes for KNN. Each candidate is fitted on the
# training matrix and scored (accuracy + F1) on both splits; the k with the
# highest validation F1 ends up in `best_n`.
f1_scores = 0  # best validation F1 seen so far
best_n = 0     # k that produced it

n = [3,5,7]
for i in n:
  knn = KNeighborsClassifier(n_neighbors=i)
  knn.fit(X_train, y_train)

  print(f"for n = {i}")
  # Same report for both splits; the validation split is deliberately last.
  for header, X_split, y_split in (("for trainig:", X_train, y_train),
                                   ("for validation:", X_val, y_val)):
    y_pred = knn.predict(X_split)
    accuracy = accuracy_score(y_split, y_pred)
    f1 = f1_score(y_split, y_pred)
    print(header)
    print(f"Accuracy of KNN classifier: {accuracy}")
    print(f"F1 score of KNN classifier: {f1}")
    print()

  # The inner loop finished on the validation split, so `f1` is the validation F1 here.
  if f1 > f1_scores:
    f1_scores, best_n = f1, i
  print()

print(f"highest F1 score: {f1_scores}")
print(f"best n: {best_n}")

for n = 3
for trainig:
Accuracy of KNN classifier: 0.8485342019543974
F1 score of KNN classifier: 0.7726161369193154

for validation:
Accuracy of KNN classifier: 0.7402597402597403
F1 score of KNN classifier: 0.6296296296296297


for n = 5
for trainig:
Accuracy of KNN classifier: 0.8224755700325733
F1 score of KNN classifier: 0.72264631043257

for validation:
Accuracy of KNN classifier: 0.6948051948051948
F1 score of KNN classifier: 0.5607476635514018


for n = 7
for trainig:
Accuracy of KNN classifier: 0.8192182410423453
F1 score of KNN classifier: 0.7146529562982005

for validation:
Accuracy of KNN classifier: 0.7207792207792207
F1 score of KNN classifier: 0.5904761904761905


highest F1 score: 0.6296296296296297
best n: 3


In [9]:
# Refit KNN with the k selected by the validation search (`best_n`) rather
# than a hard-coded 3, so the model persisted later always matches the search
# result even if the data or the candidate grid changes.
knn = KNeighborsClassifier(n_neighbors=best_n)
knn.fit(X_train, y_train)

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Search over tree depths. Each candidate is fitted on the training matrix and
# scored (accuracy + F1) on both splits; the depth with the highest validation
# F1 ends up in `best_depth`.
f1_scores = 0   # best validation F1 seen so far
best_depth = 0  # depth that produced it

for i in [3,5,7]:
  # scikit-learn permutes the features at every split, so ties between equally
  # good splits are broken randomly. Fixing the seed (42, the same value used
  # for the train/validation split) makes the scores and the selected depth
  # reproducible across runs.
  dt_classifier = DecisionTreeClassifier(max_depth=i, random_state=42)
  dt_classifier.fit(X_train, y_train)

  y_pred_train = dt_classifier.predict(X_train)

  accuracy_train = accuracy_score(y_train, y_pred_train)
  f1_train = f1_score(y_train, y_pred_train)

  print(f"for depth: {i}")
  print("Decision Tree Classifier - Training Set")
  print(f"Accuracy: {accuracy_train}")
  print(f"F1 Score: {f1_train}")
  print()

  y_pred_val = dt_classifier.predict(X_val)

  accuracy_val = accuracy_score(y_val, y_pred_val)
  f1_val = f1_score(y_val, y_pred_val)

  # Model selection uses the validation F1 only.
  if f1_val > f1_scores:
    f1_scores = f1_val
    best_depth = i

  print("Decision Tree Classifier - Validation Set")
  print(f"Accuracy: {accuracy_val}")
  print(f"F1 Score: {f1_val}")
  print()
  print()

print(f"highest F1 score: {f1_scores}")
print(f"best depth: {best_depth}")

for depth: 3
Decision Tree Classifier - Training Set
Accuracy: 0.7768729641693811
F1 Score: 0.6583541147132169

Decision Tree Classifier - Validation Set
Accuracy: 0.7597402597402597
F1 Score: 0.6476190476190476


for depth: 5
Decision Tree Classifier - Training Set
Accuracy: 0.8420195439739414
F1 Score: 0.7467362924281984

Decision Tree Classifier - Validation Set
Accuracy: 0.7922077922077922
F1 Score: 0.6862745098039216


for depth: 7
Decision Tree Classifier - Training Set
Accuracy: 0.9104234527687296
F1 Score: 0.8705882352941177

Decision Tree Classifier - Validation Set
Accuracy: 0.7467532467532467
F1 Score: 0.6608695652173913


highest F1 score: 0.6862745098039216
best depth: 5


In [11]:
# Refit the tree with the depth selected by the validation search
# (`best_depth`) rather than a hard-coded 5. The fixed seed makes the tree
# that gets persisted later reproducible, because scikit-learn breaks ties
# between equally good splits randomly when random_state is unset.
dt_classifier = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
dt_classifier.fit(X_train, y_train)


In [12]:
import joblib

# Persist the fitted preprocessing objects and both classifiers, in this order.
artifacts = [
  (scaler, 'scaler.pkl'),
  (ohe, 'ohe.pkl'),
  (knn, 'knn.pkl'),
  (dt_classifier, 'dt_classifier.pkl'),
]
# joblib.dump returns the list of file names it wrote for each object.
written_files = [joblib.dump(fitted_obj, target_path) for fitted_obj, target_path in artifacts]
# Show the result of the final dump as the cell output.
written_files[-1]

['dt_classifier.pkl']

In [13]:
# Write the validation rows to disk without the DataFrame index column.
val.to_csv(path_or_buf='val.csv', index=False)

In [14]:
# Display the column labels of `df`, which by now include the derived 'BMI_category'.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'BMI_category'],
      dtype='object')