In [None]:
# Packages
import pandas as pd  # Data manipulation and analysis
import joblib  # Model serialization and deserialization
import matplotlib.pyplot as plt  # Plotting and visualization
from xgboost import XGBClassifier  # XGBoost classifier for gradient boosting
from sklearn.preprocessing import StandardScaler  # Data scaling and normalization
from imblearn.over_sampling import SMOTE  # SMOTE for handling imbalanced datasets
from ucimlrepo import fetch_ucirepo  # Fetch datasets from the UCI Machine Learning Repository
from sklearn.model_selection import train_test_split  # Splitting data into training and test sets

# Data Import
# Download the dataset registered under ID 891 from the UCI Machine Learning Repository.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Pull the two halves of the fetched dataset apart in one step:
#   X -> the input variables (features)
#   Y -> the output variable (target)
X, Y = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)

# Place features and target side by side (column-wise) in one DataFrame,
# which is handy for inspection or manipulation if needed.
diabetes_data = pd.concat((X, Y), axis="columns")

# Data Preprocessing
# Hold out 30% of the rows for testing; random_state fixes the shuffle for reproducibility.
# The split is stratified on the target so that the training and test partitions keep the
# same class proportions as the full dataset. This matters here because the classes are
# imbalanced (the reason SMOTE is applied below): an unstratified split can shift the
# minority-class share between partitions and skew evaluation on the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=14,
    stratify=Y,
)

# Standardize the features to zero mean and unit variance.
# The scaler is fitted on the training partition only and then reused, unchanged, on the
# test partition, so no statistics from the held-out data influence the preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Learn mean/std from training data and scale it
X_test_scaled = scaler.transform(X_test)        # Scale test data with the training statistics

# Handle class imbalance by applying SMOTE to the (scaled) training set only.
# SMOTE generates synthetic samples for the minority class to balance the class distribution;
# the test set is deliberately left untouched so it still reflects the original distribution.
smote_resampler = SMOTE(random_state=45)
X_train_resampled, Y_train_resampled = smote_resampler.fit_resample(X_train_scaled, Y_train)

# Training
# Hyperparameters for the XGBoost classifier, gathered in one place so they are easy to tune.
xgb_params = {
    'n_estimators': 100,             # number of boosting rounds
    'max_depth': 10,                 # maximum depth of each tree
    'learning_rate': 0.1,            # step size shrinkage
    'objective': 'binary:logistic',  # loss function for binary classification
    'eval_metric': 'logloss',        # metric used for evaluation
}
xg_model = XGBClassifier(**xgb_params)

# Fit the classifier on the SMOTE-balanced training data.
xg_model.fit(X_train_resampled, Y_train_resampled)

# Serialize the fitted model with joblib so it can be loaded and reused without retraining.
# NOTE(review): pickled estimators are generally tied to the library versions that wrote
# them - confirm the loading environment uses matching package versions.
model_filename = 'diabetes_model_v0.1.pkl'
joblib.dump(xg_model, model_filename)

# Report where the model was written.
print(f"Model saved as {model_filename}")


In [None]:
# Saving Scaler File

# Persist the StandardScaler that the preprocessing step already fitted on X_train and
# used to scale the data the model was trained on. Saving that same object, instead of
# constructing and fitting a second scaler here, guarantees that the artifact matches the
# preprocessing behind the saved model and avoids a redundant pass over X_train.
# joblib is already imported in the notebook's first cell, so no imports are repeated here.

# Guard against hidden kernel state: fail early if `scaler` has not been fitted yet
# (a fitted StandardScaler exposes the learned `scale_` attribute).
assert hasattr(scaler, "scale_"), "scaler is not fitted - run the preprocessing cell first"

# Serialize the fitted scaler to 'scaler.pkl' so it can be reloaded later and applied
# to new data without refitting.
joblib.dump(scaler, 'scaler.pkl')
