In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

# ── 1. LOAD DATA ──────────────────────────────────────────
# Read the raw dataset from the working directory.
df = pd.read_csv("diabetes.csv")

# ── 2. PREPROCESSING ──────────────────────────────────────
# In these measurement columns a 0 is handled as a missing reading, so every
# 0 is swapped for NaN before the imputation step that follows.
cols_with_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

def median_target(var):
    """Return the median of ``var`` for each ``Outcome`` class.

    Rows of the module-level ``df`` where ``var`` is null are left out of the
    computation.

    Args:
        var: Name of the column in ``df`` to summarise.

    Returns:
        A DataFrame with one row per ``Outcome`` value and two columns:
        ``Outcome`` and ``var`` (the per-class median).
    """
    observed = df.loc[df[var].notnull(), [var, 'Outcome']]
    per_class = observed.groupby(['Outcome'])[[var]].median()
    return per_class.reset_index()

# Fill every missing feature value with the median of that feature among rows
# that share the same Outcome class.
# NOTE(review): the fill value is chosen from the row's Outcome label and the
# medians are computed on the full frame before train_test_split, so label
# information leaks into the features and the same imputation cannot be
# reproduced for unlabeled rows at prediction time.
columns = df.columns.drop("Outcome")
for col in columns:
    missing = df[col].isnull()
    if not missing.any():
        # Nothing to impute; skip the groupby entirely.
        continue
    # Compute the per-class medians once and key them by the actual Outcome
    # label instead of relying on positional row order in the result.
    class_medians = median_target(col).set_index("Outcome")[col]
    for outcome, fill_value in class_medians.items():
        df.loc[(df["Outcome"] == outcome) & missing, col] = fill_value

# ── 3. OUTLIER CAPPING ────────────────────────────────────
def outlier_thresholds(dataframe, variable):
    """Compute the 1.5 * IQR outlier fences for one column.

    Args:
        dataframe: DataFrame holding the column.
        variable: Name of the column to inspect.

    Returns:
        A ``(lower, upper)`` tuple: the first quartile minus 1.5 * IQR and
        the third quartile plus 1.5 * IQR.
    """
    values = dataframe[variable]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread
    return lower_fence, upper_fence

def replace_with_thresholds(dataframe, columns):
    """Cap outliers in the given columns at their 1.5 * IQR fences, in place.

    For every column, values below the lower fence are raised to it and
    values above the upper fence are lowered to it. Fences come from
    ``outlier_thresholds``.

    Args:
        dataframe: DataFrame to modify in place.
        columns: Iterable of column names to cap.

    Returns:
        None. ``dataframe`` is updated in place.
    """
    for var in columns:
        low, up = outlier_thresholds(dataframe, var)
        # clip() applies both bounds in one pass and upcasts integer columns
        # when a fractional fence is needed, where a masked .loc assignment
        # of a float into an integer column would warn or fail.
        dataframe[var] = dataframe[var].clip(lower=low, upper=up)

# Cast every numeric column to float, then cap feature outliers at their
# 1.5 * IQR fences.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].astype(float)
# The Outcome target is a class label, not a measurement, so it is kept out
# of the capping: if one class made up fewer than 25% of the rows, both
# quartiles would coincide and every label of the rarer class would be
# "capped" to the majority value.
replace_with_thresholds(df, numeric_cols.drop("Outcome", errors="ignore"))

# ── 4. FEATURE ENGINEERING ────────────────────────────────
# BMI band labels, kept in a categorical Series and looked up by position.
NewBMI = pd.Series(["Underweight", "Normal", "Overweight", "Obesity 1", "Obesity 2", "Obesity 3"], dtype="category")

# Each rule pairs a boolean row mask on BMI with the label written to the
# matching rows; rows matched by no rule keep the "Obesity 3" default.
bmi = df["BMI"]
bmi_rules = [
    (bmi < 18.5, NewBMI[0]),
    ((bmi >= 18.5) & (bmi <= 24.9), NewBMI[1]),
    ((bmi > 24.9) & (bmi <= 29.9), NewBMI[2]),
    ((bmi > 29.9) & (bmi <= 34.9), NewBMI[3]),
    ((bmi > 34.9) & (bmi <= 39.9), NewBMI[4]),
    (bmi > 39.9, NewBMI[5]),
]
df["NewBMI"] = NewBMI[5]  # default
for band_mask, band_label in bmi_rules:
    df.loc[band_mask, "NewBMI"] = band_label

# Bucket glucose into four right-closed intervals:
# (0, 74], (74, 99], (99, 139], (139, 200].
glucose_edges = [0, 74, 99, 139, 200]
glucose_labels = ["Low", "Normal", "Overweight", "High"]
df['New_Glucose'] = pd.cut(df['Glucose'], bins=glucose_edges, labels=glucose_labels)

def set_insulin(row):
    """Label a row's insulin reading as "Normal" or "Abnormal".

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an "Insulin" key.

    Returns:
        "Normal" when the insulin value lies in the closed range [16, 166],
        otherwise "Abnormal" (NaN also yields "Abnormal").
    """
    insulin = row["Insulin"]
    in_range = 16 <= insulin <= 166
    if in_range:
        return "Normal"
    return "Abnormal"
# Row-wise insulin flag: "Normal" / "Abnormal".
df["NewInsulinScore"] = df.apply(set_insulin, axis=1)

# ── 5. ONE-HOT ENCODING ───────────────────────────────────
# A column is treated as categorical when it holds at most 10 distinct values
# (NaN counted as a value); the Outcome target is never encoded.
# NOTE(review): this is a data-dependent heuristic, so a numeric feature that
# happens to have few distinct values would be one-hot encoded as well.
categorical_columns = [
    name
    for name in df.columns
    if name != "Outcome" and df[name].nunique(dropna=False) <= 10
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
# Dummy names inherit category labels such as "Obesity 1"; swap the spaces
# for underscores.
df.columns = df.columns.str.replace(" ", "_")

# ── 6. SPLIT & SCALE ──────────────────────────────────────
# Target vector and feature matrix.
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Persist the training-time column order so it can be reloaded at prediction
# time.
feature_columns = X.columns.tolist()
with open("feature_columns.pkl", "wb") as f:
    pickle.dump(feature_columns, f)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Scaling statistics are learned from the training rows only and then reused
# unchanged for the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# ── 7. TRAIN BEST MODEL ───────────────────────────────────
# Hyperparameters for the LightGBM classifier, gathered in one place.
lgbm_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "class_weight": 'balanced',
    "random_state": 42,
}
model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

# ── 8. SAVE .pkl FILES ────────────────────────────────────
# Write the fitted model first, then the fitted scaler, each to its own file.
for artifact_path, artifact in (("model.pkl", model), ("scaler.pkl", scaler)):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("✅ model.pkl, scaler.pkl, feature_columns.pkl saved!")

[LightGBM] [Info] Number of positive: 214, number of negative: 400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 662
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
✅ model.pkl, scaler.pkl, feature_columns.pkl saved!
