In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/pima-indians-diabetes-database?dataset_version_number=1...


100%|██████████| 8.91k/8.91k [00:00<00:00, 13.0MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/uciml/pima-indians-diabetes-database/versions/1





In [7]:
#load the data from the path
import pandas as pd
df = pd.read_csv(path + "/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
df['BMI_category'] = df['BMI'].apply(lambda x: 'Underweight' if x < 18.5 else ('Normal' if x < 25 else ('Overweight' if x < 30 else 'Obese')))
df.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category
0,6,148,72,35,0,33.6,0.627,50,1,Obese
1,1,85,66,29,0,26.6,0.351,31,0,Overweight
2,8,183,64,0,0,23.3,0.672,32,1,Normal


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

X = df.drop(['Outcome'],axis=1)
y = df[['Outcome']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']] = scaler.fit_transform(X_train[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']]).astype(float)
X_test[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']] = scaler.transform(X_test[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']]).astype(float)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)


In [6]:
encoder = OneHotEncoder()

#encode BMI_category

y_train_encoded = encoder.fit_transform(y_train).toarray()
y_test_encoded = encoder.transform(y_test).toarray()

In [10]:
X_train = pd.get_dummies(X_train, columns=['BMI_category'],dtype=int)
X_test = pd.get_dummies(X_test, columns=['BMI_category'],dtype=int)

In [11]:
#building a KNN classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
for k in [3,5,7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train_encoded)
    y_pred = knn.predict(X_test)
    print(f"F1 Score for k = {k}: {f1_score(y_test_encoded, y_pred, average='micro')}")

F1 Score for k = 3: 0.7402597402597403
F1 Score for k = 5: 0.6948051948051948
F1 Score for k = 7: 0.7207792207792207


In [13]:
#building a decision tree with cross validation of max_depth = 3,5,7 and check f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
for depth in [3,5,7]:
    dt = DecisionTreeClassifier(max_depth=depth)
    dt.fit(X_train, y_train_encoded)
    y_pred = dt.predict(X_test)
    scores = f1_score(y_test_encoded, y_pred, average='micro')
    print(f"F1 Score for depth = {depth}: {scores}")


F1 Score for depth = 3: 0.7597402597402597
F1 Score for depth = 5: 0.7987012987012987
F1 Score for depth = 7: 0.7166123778501629


In [14]:
best_model = DecisionTreeClassifier(max_depth=5)
best_model.fit(X_train, y_train_encoded)
y_pred = best_model.predict(X_test)


In [15]:
#save the best_model
import joblib
joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']

In [16]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']