In [3]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

pd.set_option("display.width", 1000)

def safe_fillna(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype.kind in 'biufc':  # numeric
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode().iloc[0] if not df[col].mode().empty else "", inplace=True)
    return df

file_path = "updated dataset.csv"
try:
    df = pd.read_csv(file_path)
    print("✅ Data loaded successfully")
except Exception as e:
    print(f"❌ Error: {e}")
    exit()

rename_cols = {
    'gender': 'Gender',
    'hypertension': 'Hyp',
    'heart_disease': 'HD',
    'smoking_history': 'Smok',
    'blood_glucose_level': 'BGL',
    'bmi': 'BMI',
    'HbA1c_level': 'HbA1c',
    'age': 'Age',
    'diabetes': 'Dia'
}
df.rename(columns=rename_cols, inplace=True)

if 'Dia' not in df.columns:
    print("❌ Target column 'diabetes' or 'Dia' not found.")
    exit()

df = safe_fillna(df)

X = df.drop('Dia', axis=1)
y = df['Dia']

X = pd.get_dummies(X, drop_first=True)
feature_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
print("\n✅ Model Training Completed!")

y_pred = model.predict(X_test_scaled)
print("\n🔹 Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))

with open('diabetes_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('features.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

print("\n✅ Model, Scaler & Features Saved Successfully!")

new_data = pd.DataFrame([{
    'Gender': 1,
    'Age': 45,
    'Hyp': 0,
    'HD': 0,
    'Smok': 2,
    'BMI': 28.5,
    'HbA1c': 5.8,
    'BGL': 120
}])

new_data = pd.get_dummies(new_data)
new_data = new_data.reindex(columns=feature_names, fill_value=0)

scaled_new = scaler.transform(new_data)
prediction = model.predict(scaled_new)

print("\n🔮 Prediction for sample input:", "Diabetic" if prediction[0] == 1 else "Not Diabetic")


✅ Data loaded successfully

✅ Model Training Completed!

🔹 Accuracy: 0.9591

🔹 Confusion Matrix:
 [[18126   166]
 [  652  1056]]

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.86      0.62      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000


✅ Model, Scaler & Features Saved Successfully!

🔮 Prediction for sample input: Not Diabetic


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
