In [1]:
import numpy as np 
import pandas as pd 
import joblib

In [2]:
df = pd.read_csv('diabetes_prediction_dataset.csv') 
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
df = df[df['smoking_history'] != 'No Info'] 
df.drop(columns = ['HbA1c_level'],inplace = True)   

In [4]:
df = df[df['gender'] != 'Other']

In [5]:
df  

X = df.drop(columns = ['diabetes']) 
y = df['diabetes'] 

In [6]:
# define the columuns 
binary_columns = ["hypertension", "heart_disease"]
numerical_columns = ["age", "bmi", "blood_glucose_level"]
categorical_columns = ["smoking_history","gender"]  


In [7]:
df_encoded = pd.get_dummies(df, columns=["smoking_history", "gender"], drop_first=True)  

In [8]:
df_encoded.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,blood_glucose_level,diabetes,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,gender_Male
0,80.0,0,1,25.19,140,0,False,False,True,False,False
2,28.0,0,0,27.32,158,0,False,False,True,False,True
3,36.0,0,0,23.45,155,0,False,False,False,False,False
4,76.0,1,1,20.14,155,0,False,False,False,False,True
5,20.0,0,0,27.32,85,0,False,False,True,False,False


In [9]:
df['gender'].value_counts()

gender
Female    38852
Male      25320
Name: count, dtype: int64

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [12]:
# Fit and transform the numerical columns
df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])

In [13]:
df_encoded.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,blood_glucose_level,diabetes,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,gender_Male
0,1.712053,0,1,-0.496363,0.00878,0,False,False,True,False,False
2,-0.949217,0,0,-0.169471,0.43566,0,False,False,True,False,True
3,-0.539791,0,0,-0.763401,0.364513,0,False,False,False,False,False
4,1.50734,1,1,-1.271387,0.364513,0,False,False,False,False,True
5,-1.358643,0,0,-0.169471,-1.295577,0,False,False,True,False,False


In [14]:
X = df_encoded.drop(columns = ['diabetes']) 
y = df_encoded['diabetes']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [17]:
model = LogisticRegression()
model.fit(X_train, y_train)  # Train the model
y_pred = model.predict(X_test)  # Make predictions 


In [18]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}") 

Accuracy: 0.9228671601090768


In [19]:
joblib.dump(model, 'predict_diabetes_model.joblib') 

['predict_diabetes_model.joblib']

In [20]:
joblib.dump(scaler, 'scaler.joblib') 

['scaler.joblib']

In [23]:
# Save column order
column_order = df_encoded.columns
joblib.dump(column_order, 'column_order.joblib')
column_order

Index(['age', 'hypertension', 'heart_disease', 'bmi', 'blood_glucose_level',
       'diabetes', 'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current', 'gender_Male'],
      dtype='object')