In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# read data, remove unused column, drop rows with missing values
data = pd.read_csv("data/diabetes_prediction_dataset.csv")
data.dropna(inplace=True)
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [3]:
def factorize_objs(df: pd.DataFrame):
    for colname, dtype in zip(df.keys(), df.dtypes.to_list()):
        if dtype == np.dtypes.ObjectDType:
            vals, keys = pd.factorize(df[colname])
            df[colname] = vals
            print(keys)

factorize_objs(data)
data

Index(['Female', 'Male', 'Other'], dtype='object')
Index(['never', 'No Info', 'current', 'former', 'ever', 'not current'], dtype='object')


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,25.19,6.6,140,0
1,0,54.0,0,0,1,27.32,6.6,80,0
2,1,28.0,0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,2,23.45,5.0,155,0
4,1,76.0,1,1,2,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,0,80.0,0,0,1,27.32,6.2,90,0
99996,0,2.0,0,0,1,17.37,6.5,100,0
99997,1,66.0,0,0,3,27.83,5.7,155,0
99998,0,24.0,0,0,0,35.42,4.0,100,0


In [4]:
target = "diabetes"
features = data.keys().to_list()
features.remove("diabetes")

X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], train_size=0.8, random_state=42)

In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='saga', max_iter=10000, random_state=42)

model.fit(X_train, y_train)

In [9]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.90      0.57      0.70      1708

    accuracy                           0.96     20000
   macro avg       0.93      0.78      0.84     20000
weighted avg       0.96      0.96      0.95     20000



In [10]:
import dill
dill.settings["recurse"] = True

test = X_test.join(y_test)
test.to_csv("clean_data/diabetes_test.csv", index=False)

with open("models/diabetes_lg.modelfile", 'wb') as f:
    dill.dump(model, f)
