# Machine Learning Algorithms Comparative Analysis for prediction of A1C levels


### Importing Libraries


In [None]:
# Importing Libraries for Data Manipulation
import numpy as np
import pandas as pd

# Importing Libraries for Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.inspection import permutation_importance

# Importing Libraries for Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    RandomForestRegressor,
)
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Importing Libraries for Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

### Importing Dataset


In [None]:
# Load the A1C estimation dataset (the path is relative to the working
# directory) and preview its first rows
dataset_path = "../../../Dataset/a1c-estimation-dataset.csv"
data = pd.read_csv(dataset_path)
data.head()

In [None]:
# Checking for null values: count the missing entries in each column
data.isna().sum()

In [None]:
# Handle duplicates: report how many fully duplicated rows exist, drop them,
# then report again to confirm that none remain.
# len(...) gives the row count; printing .shape would show a (rows, columns)
# tuple under a label that promises a number.
duplicate_rows = data[data.duplicated()]
print("Number of duplicate rows: ", len(duplicate_rows))

data = data.drop_duplicates()
duplicate_rows = data[data.duplicated()]
print("Number of duplicate rows: ", len(duplicate_rows))

In [None]:
# Inspect the column dtypes and non-null counts
data.info()

# Converting column names to lower case so later cells can refer to them
# uniformly (e.g. "hba1c_level")
data.columns = [column_name.lower() for column_name in data.columns]

In [None]:
# Remove unnecessary values (0.00195%): keep only the rows whose gender is
# not "Other"
is_not_other_gender = data["gender"].ne("Other")
data = data[is_not_other_gender]

In [None]:
# Describe the data: summary statistics table, rendered with every value
# formatted to two decimal places
data.describe().style.format("{:.2f}")

### Data Visualization


In [None]:
# Share of each value of the "diabetes" column, drawn as a pie chart
class_counts = data["diabetes"].value_counts()
colors = ["teal", "skyblue", "coral", "gold", "lightcoral"]

# Build the chart on an explicit square Axes
_, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    class_counts,
    labels=class_counts.index,
    colors=colors,
    autopct="%1.1f%%",
    startangle=140,
)
ax.axis("equal")  # equal aspect ratio keeps the pie circular
ax.set_title("Distribution of Class Labels")
plt.show()

In [None]:
# Age distribution: histogram with a kernel density estimate overlaid
_, ax = plt.subplots(figsize=(7, 4))
sns.histplot(data["age"], kde=True, ax=ax)
ax.set_title("Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

In [None]:
# Number of rows per gender, drawn as a bar plot
gender_ax = sns.countplot(data=data, x="gender", palette="Set3")
gender_ax.set_title("Gender Distribution")
plt.show()

In [None]:
# BMI distribution: 30-bin histogram in teal with a density curve.
# displot is figure-level, so the title goes on the single Axes of the grid
# it returns.
bmi_grid = sns.displot(data["bmi"], kde=True, bins=30, color="teal")
bmi_grid.ax.set_title("BMI Distribution")
plt.show()

In [None]:
# Blood glucose level compared across the values of the "diabetes" column
glucose_ax = sns.boxplot(
    data=data, x="diabetes", y="blood_glucose_level", palette="magma"
)
glucose_ax.set_title("Blood Glucose Level vs Diabetes")
plt.show()

In [None]:
# HbA1c level compared across the values of the "diabetes" column
hba1c_ax = sns.boxplot(
    data=data, x="diabetes", y="hba1c_level", palette="viridis"
)
hba1c_ax.set_title("HbA1c level vs Diabetes")
plt.show()

In [None]:
# Blood glucose level against HbA1c level, points colored by "diabetes"
_, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data, x="blood_glucose_level", y="hba1c_level", hue="diabetes", ax=ax
)
ax.set_title("Blood Glucose Level vs. HbA1c Level")
ax.set_xlabel("Blood Glucose Level")
ax.set_ylabel("HbA1c Level")
ax.grid(True)
plt.show()

### Data Preprocessing


In [None]:
# Collapse the raw smoking_history labels into three coarser groups.
# The table is written as group -> raw labels and then inverted into the
# raw label -> group lookup used for the recoding.
smoking_groups = {
    "non-smoker": ("never", "No Info"),
    "current": ("current",),
    "past_smoker": ("ever", "former", "not current"),
}
smoking_mapping = {
    raw_label: group
    for group, raw_labels in smoking_groups.items()
    for raw_label in raw_labels
}

# Recode 'smoking_history' through the lookup; any label missing from the
# lookup becomes NaN
recoded_smoking = data["smoking_history"].map(smoking_mapping)
data["smoking_history"] = recoded_smoking

# Show how many rows ended up in each group
smoking_counts = data["smoking_history"].value_counts()
print(smoking_counts)

In [None]:
# Convert all categorical variables to numeric using one-hot encoding.
# Each encoded column is named "<column>_<category>" (e.g. "gender_Female"),
# which is the naming the preprocessor cell selects its columns by.
data_encoded = pd.get_dummies(data)

In [None]:
# Pairwise correlations between all encoded columns, shown as an annotated
# heatmap
correlation_matrix = data_encoded.corr()

_, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="magma",
    linewidths=0.1,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Columns that are standardized before reaching the regressor
scaled_columns = [
    "age",
    "bmi",
    "blood_glucose_level",
    "hypertension",
    "heart_disease",
]

# One-hot encoded columns that are forwarded unchanged
passthrough_columns = [
    "gender_Female",
    "gender_Male",
    "smoking_history_current",
    "smoking_history_non-smoker",
    "smoking_history_past_smoker",
]

# Define preprocessor: scale the first group, pass the second group through
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), scaled_columns),
        ("cat", "passthrough", passthrough_columns),
    ]
)

In [None]:
# The target is the HbA1c level; the features are every other encoded column
# except the "diabetes" label, which is left out of the predictors
target_column = "hba1c_level"
excluded_columns = [target_column, "diabetes"]

y = data_encoded[target_column]
X = data_encoded.drop(columns=excluded_columns)

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Local import: multi-metric cross-validation is only needed in this cell
from sklearn.model_selection import cross_validate

# Define the models with their best parameters.
# random_state pins the Random Forest so repeated runs report the same scores.
models = {
    "Linear Regression": {
        "model": LinearRegression(fit_intercept=False),
    },
    "Random Forest": {
        "model": RandomForestRegressor(
            n_estimators=100, min_samples_split=2, max_depth=5, random_state=42
        ),
    },
    "XGBoost": {
        "model": XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1),
    },
    "KNN": {
        "model": KNeighborsRegressor(weights="uniform", n_neighbors=10),
    },
}

# Metrics computed during cross-validation. scikit-learn scorers follow a
# "higher is better" convention, so the two error metrics come back negated.
cv_scoring = ("neg_mean_squared_error", "neg_mean_absolute_error", "r2")

# One row of hold-out metrics per model; turned into a DataFrame after the loop
result_rows = []

# Train each model and calculate its MSE, MAE, and R2 score
for name, model_info in models.items():
    model = model_info["model"]

    # Create a pipeline with the preprocessor and the model
    reg = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", model),
        ]
    )

    # Fit on the training split and score on the held-out test split
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    result_rows.append([name, mse, mae, r2])

    # 5-fold cross-validation on the training split. A single multi-metric
    # call fits each fold once instead of once per metric.
    cv_scores = cross_validate(reg, X_train, y_train, cv=5, scoring=cv_scoring)

    print(f"Cross Validation for {name}: ")
    print(f"MSE: {-cv_scores['test_neg_mean_squared_error'].mean():.2f}")
    print(f"MAE: {-cv_scores['test_neg_mean_absolute_error'].mean():.2f}")
    print(f"R2: {cv_scores['test_r2'].mean():.2f}")
    print()

    # The regressor sees the columns in the order the ColumnTransformer emits
    # them (all "num" columns, then all "cat" columns), which differs from the
    # column order of X_train. Take the names from the fitted preprocessor and
    # strip the "<transformer>__" prefix so every importance gets its own label.
    fitted_preprocessor = reg.named_steps["preprocessor"]
    feature_names = [
        prefixed_name.split("__", 1)[-1]
        for prefixed_name in fitted_preprocessor.get_feature_names_out()
    ]

    # Print the feature importances
    if hasattr(model, "feature_importances_"):
        print(f"Feature importances for {name}:")
        for feature, importance in zip(feature_names, model.feature_importances_):
            print(f"{feature}: {importance}")
    else:
        # Models without built-in importances: measure how much the test score
        # drops when each (already preprocessed) column is shuffled
        print(f"Permutation importances for {name}:")
        X_test_preprocessed = fitted_preprocessor.transform(X_test)
        perm = permutation_importance(
            model, X_test_preprocessed, y_test, n_repeats=15, random_state=0
        )
        for i in perm.importances_mean.argsort()[::-1]:
            # Only report features whose importance stays positive two
            # standard deviations below the mean
            if perm.importances_mean[i] - 2 * perm.importances_std[i] > 0:
                print(f"{feature_names[i]}: {perm.importances_mean[i]:.3f}")

    print()

# Collect the hold-out metrics of all models into one table
results = pd.DataFrame(result_rows, columns=["Model", "MSE", "MAE", "R2"])

# Print the results
results

In [None]:
# Compare the hold-out R2 score of every model in a bar plot
_, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=results, x="Model", y="R2", palette="magma", ax=ax)
ax.set_title("Model R2 Score Comparison")
plt.show()

### Build and Train the Models


In [None]:
# # Define the models and their respective hyperparameters
# models = {
#     "Linear Regression": {
#         "model": LinearRegression(),
#         "params": {"fit_intercept": [True, False]},
#     },
#     "Random Forest": {
#         "model": RandomForestRegressor(random_state=42),
#         "params": {
#             "n_estimators": [10, 50, 100],
#             "max_depth": [None, 5, 10],
#             "min_samples_split": [2, 5, 10],
#         },
#     },
#     "SVR": {
#         "model": SVR(),
#         "params": {
#             "C": [0.1, 1, 10, 100],
#             "gamma": ["scale", "auto"],
#             "kernel": ["linear", "rbf"],
#         },
#     },
#     "XGBoost": {
#         "model": XGBRegressor(random_state=42),
#         "params": {
#             "learning_rate": [0.01, 0.1, 0.2],
#             "max_depth": [3, 5, 10],
#             "n_estimators": [50, 100],
#         },
#     },
#     "KNN": {
#         "model": KNeighborsRegressor(),
#         "params": {
#             "n_neighbors": [3, 5, 10],
#             "weights": ["uniform", "distance"],
#             "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
#         },
#     },
# }

# # Create a DataFrame to store the results
# results = pd.DataFrame(columns=["Model", "Best Params", "MSE", "MAE", "R2"])

# # Train each model and calculate its MSE, MAE, and R2 score
# for name, model in models.items():
#     reg = Pipeline(
#         steps=[
#             ("preprocessor", preprocessor),
#             (
#                 "regressor",
#                 RandomizedSearchCV(
#                     model["model"], model["params"], n_iter=10, cv=3, n_jobs=-1
#                 ),
#             ),
#         ]
#     )

#     reg.fit(X_train, y_train)

#     y_pred = reg.predict(X_test)

#     mse = mean_squared_error(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)

#     # Append the results to the DataFrame
#     results.loc[len(results)] = [
#         name,
#         reg.named_steps["regressor"].best_params_,
#         mse,
#         mae,
#         r2,
#     ]

# # Plot the results
# plt.figure(figsize=(10, 5))
# sns.barplot(x="Model", y="R2", data=results, palette="magma")
# plt.title("Model R2 Score Comparison")
# plt.show()