In [None]:
# Packages
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split

# Data Import
# Download dataset id 891 from the UCI Machine Learning Repository.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Unpack the feature matrix and the target column(s) from the fetched bundle.
dataset_tables = cdc_diabetes_health_indicators.data
X = dataset_tables.features  # predictor columns
Y = dataset_tables.targets   # target variable

# Side-by-side view of features and target in one frame (column-wise concat).
diabetes_data = pd.concat([X, Y], axis=1)

# Data Preprocessing
# Hold out 30% of the rows as a test set. The split is stratified on the target so
# that the train and test partitions keep the same class proportions as the full
# dataset (the target is imbalanced, which is why SMOTE is applied below); the
# fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=14, stratify=Y
)

# Scale the data. The scaler is fitted on the training partition only and the
# same fitted statistics are then applied to the test partition, so no test-set
# information influences the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Address class imbalance with SMOTE over-sampling. Only the (scaled) training
# partition is resampled; the test partition is left as-is.
smote_resampler = SMOTE(random_state=45)
X_train_resampled, Y_train_resampled = smote_resampler.fit_resample(X_train_scaled, Y_train)

# Training
# Hyperparameters passed to the XGBoost classifier, kept together in one place.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.1,
    "objective": 'binary:logistic',
    "eval_metric": 'logloss',
}

# Build the classifier and fit it on the SMOTE-resampled training data.
xg_model = XGBClassifier(**xgb_params)
xg_model.fit(X_train_resampled, Y_train_resampled)

# Persist the fitted model to disk with joblib.
model_filename = 'diabetes_model_v0.1.pkl'
joblib.dump(xg_model, model_filename)

print(f"Model saved as {model_filename}")


In [None]:
# Saving Scaler File

# Persist the scaler that was fitted during preprocessing (the same object that
# produced X_train_scaled / X_test_scaled) instead of fitting a second one, so
# the saved artifact applies the same transformation the model was trained on.
# Guard against running this cell before the preprocessing cell has fitted it.
assert hasattr(scaler, "mean_"), "scaler is not fitted; run the preprocessing cell first"

# Save the scaler for future use
joblib.dump(scaler, 'scaler.pkl')
