# Diabetes Data Analysis
*Author: Kirti Agarwal | Date: Aug 24, 2025*

In [None]:
# Download the diabetes dataset from Kaggle with kagglehub
import kagglehub

# Download the latest version; the value returned by dataset_download is kept in `path`
path = kagglehub.dataset_download("mathchi/diabetes-data-set")

print("Path to dataset files:", path)

In [None]:
# importing all necessary libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load diabetes.csv into a DataFrame.
# A local copy under ./dataset is used when it exists; otherwise the file is read
# from the location that kagglehub.dataset_download() returned (`path`, set in the
# download cell), so the notebook also runs when nothing was copied by hand.
# NOTE(review): assumes the Kaggle dataset ships the file as `diabetes.csv` at its
# root — confirm against the printed download path.
from pathlib import Path

local_csv = Path("dataset/diabetes.csv")
csv_path = local_csv if local_csv.exists() else Path(path) / "diabetes.csv"
df = pd.read_csv(csv_path)

In [None]:
# Print a per-column summary of the DataFrame
df.info()

In [None]:
# Describing each column's data (summary statistics)
df.describe()

In [None]:
# Counting the null values in each column
df.isnull().sum()

In [None]:
# Counting the zero values in each column
(df==0).sum()

### Handling Zero values in Critical Features 

**Glucose** → Can’t be zero because the body always has some sugar in the blood.

**BloodPressure** → Can’t be zero because a living person must have circulating blood pressure.

**SkinThickness** → Can’t be zero because every person has some skin/fat thickness.

**Insulin** → Can’t be zero because the body naturally produces at least a small amount.

**BMI** → Can’t be zero because no person has zero weight or height.

In [None]:
# Selecting the five columns in which a zero cannot be a real measurement
cols_with_zeros=["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]


In [None]:
# Replacing all the zeroes in those columns with NaN so they are treated as missing
df[cols_with_zeros]=df[cols_with_zeros].replace(0,np.nan)

In [None]:
# Counting null values again: the replaced zeroes now show up as missing
df.isnull().sum()

In [None]:
# Histogram of each column that had its zeroes turned into NaN
zero_cleaned_columns = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
df.loc[:, zero_cleaned_columns].hist(figsize=(10,8))
plt.show()


### Examining distribution to handle null values

**Glucose** → looks fairly symmetric (bell-like) → using mean to handle missing values.

**BloodPressure** → also looks roughly symmetric → using mean to handle missing values.

**BMI** → fairly symmetric → using mean to handle missing values.

In [None]:
# Fill the gaps in the roughly symmetric columns with each column's own mean
for symmetric_col in ('Glucose', 'BloodPressure', 'BMI'):
    column_mean = df[symmetric_col].mean()
    df[symmetric_col] = df[symmetric_col].fillna(column_mean)


In [None]:
# Checking the no. of null values left in each column after the mean imputation
df.isnull().sum()

In [None]:
# Heatmap describing the correlation among the predictor columns

# Predictor columns to correlate (the Outcome label is not included)
features_for_knn = [
    'Pregnancies', 'Glucose', 'BMI', 'Age',
    'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction',
]

# Pairwise correlation between the selected columns
corr_matrix = df.loc[:, features_for_knn].corr()

# Draw the annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


### Handling missing data for SkinThickness and Insulin

**For SkinThickness:** the strongest correlation is with BMI (0.65).

**For Insulin:** the strongest correlation is with Glucose (0.58).

This suggests that BMI and Glucose can be used as reliable predictors when imputing missing values for these features.


In [None]:
# KNN-based imputation for the two columns that still contain NaN
from sklearn.impute import KNNImputer

# NOTE(review): both imputers are fitted on the raw, unscaled columns, so columns
# with a larger numeric range presumably weigh more in the neighbour search —
# confirm whether scaling before imputation is wanted.

# ---- SkinThickness ----
# Helper columns used to find neighbours, plus the target column itself
features_skin = ['BMI', 'Glucose', 'BloodPressure', 'SkinThickness']

# k = 5 neighbours
imputer_skin = KNNImputer(n_neighbors=5)
imputed_skin = imputer_skin.fit_transform(df[features_skin])

# The imputed result is indexed by position, so look up where SkinThickness sits
skin_position = features_skin.index('SkinThickness')
df['SkinThickness'] = imputed_skin[:, skin_position]

# ---- Insulin ----
# Helper columns used to find neighbours, plus the target column itself
features_insulin = ['Glucose', 'BMI', 'Age', 'Insulin']

# A separate imputer, again with k = 5 neighbours
imputer_insulin = KNNImputer(n_neighbors=5)
imputed_insulin = imputer_insulin.fit_transform(df[features_insulin])

# Write the imputed Insulin column back into the frame
insulin_position = features_insulin.index('Insulin')
df['Insulin'] = imputed_insulin[:, insulin_position]


In [None]:
# Check that no missing values remain after the KNN imputation
df.isnull().sum()

In [None]:
# Histograms of the numeric features once the null values have been replaced
import matplotlib.pyplot as plt
columns_to_plot = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]
df[columns_to_plot].hist(figsize=(10,8))
plt.show()


In [None]:
# Box plots of the numeric features, drawn before any outlier handling
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI","DiabetesPedigreeFunction","Age"]

# One shared Axes holding a box per column
fig, ax = plt.subplots(figsize=(12, 6))
df[cols].boxplot(ax=ax)
ax.set_title("Boxplot of Features (Before Handling Outliers)")
plt.show()

In [None]:
# Log-transform Insulin because its distribution is highly skewed.
# np.log1p computes log(1 + x).
df["Insulin_log"] = np.log1p(df["Insulin"])

# The raw `Insulin` column is deliberately kept: the following cells plot it next
# to the log version, and it is only replaced by `Insulin_log` in the cell right
# before the final AdaBoost training. (An unassigned `df.drop(...)` would not
# remove the column anyway; it would only print the whole frame.)
df[["Insulin", "Insulin_log"]].head()

In [None]:
# Side-by-side box plots: raw Insulin (left) and its log transform (right)
plt.figure(figsize=(12, 6))

# (column, panel title) for each of the two panels, left to right
panels = [
    ("Insulin", "Insulin (Original)"),
    ("Insulin_log", "Insulin (Log Transformed)"),
]
for position, (column, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    df[[column]].boxplot()
    plt.title(panel_title)

plt.show()


In [None]:
# Boxplot of SkinThickness before handling outliers.
# Only this one column is drawn, so the title names it explicitly.
plt.figure(figsize=(12, 6))
df[["SkinThickness"]].boxplot()
plt.title("Boxplot of SkinThickness (Before Handling Outliers)")
plt.show()

In [None]:
# -----------------------------------------------
# Outlier Detection for SkinThickness
# -----------------------------------------------
# Note: Anything above ~50–60 mm for SkinThickness is considered very unlikely
# based on medical/anthropometric studies.

import pandas as pd

# Function to detect outliers using the IQR (Interquartile Range) method
def detect(data, ST):
    """Flag the rows of ``data`` whose value in column ``ST`` is an IQR outlier.

    A value is flagged when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 0.25 and 0.75 quantiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : DataFrame that holds the column to inspect.
    ST : label of the column to inspect.

    Returns
    -------
    tuple
        ``(outliers, q1, q3, lower_bound, upper_bound)``; ``outliers`` is the
        subset of ``data`` rows that fall outside the two bounds.
    """
    values = data[ST]

    # Quartiles of the inspected column
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)

    # Distance from each quartile to its fence: 1.5 times the interquartile range
    whisker = 1.5 * (q3 - q1)
    lower_bound = q1 - whisker
    upper_bound = q3 + whisker

    # A row is flagged when its value is strictly outside either fence
    below = values < lower_bound
    above = values > upper_bound
    outliers = data[below | above]

    return outliers, q1, q3, lower_bound, upper_bound

# Run the IQR check on 'SkinThickness'; keep the flagged rows and the boundary values
outliers, q1, q3, lower_bound, upper_bound = detect(df, 'SkinThickness')

# Report how many rows were flagged
outlier_count = len(outliers)
print("The number of outliers present are:", outlier_count)


In [None]:

# Dropping implausible SkinThickness readings

# Rows are kept only when SkinThickness <= 65
# (values above 65 mm are treated as unrealistic in a medical context)
plausible_skin = df["SkinThickness"] <= 65
df = df.loc[plausible_skin].copy()

# Show the dataset shape once those rows are gone
print("Updated shape:", df.shape)


In [None]:
# Boxplot of SkinThickness after the rows above the 65 mm cut-off were removed.
# The title says "After" so this figure can be told apart from the earlier
# pre-filter boxplot.
plt.figure(figsize=(12, 6))
df[["SkinThickness"]].boxplot()
plt.title("Boxplot of SkinThickness (After Handling Outliers)")
plt.show()

In [None]:
# Box plot of DiabetesPedigreeFunction (abbreviated DPF; "PDF" would be misread
# as a probability density function)
plt.figure(figsize=(12, 6))
df[["DiabetesPedigreeFunction"]].boxplot()
plt.title("Boxplot of DiabetesPedigreeFunction (DPF)")
plt.show()


### DPF values between 0.1 and 2.5 are perfectly reasonable.

In our dataset, DiabetesPedigreeFunction (DPF) values that fall between **0.1 and 2.5** are considered 
to be within a realistic and acceptable range.  
Values inside this range are perfectly reasonable and do not indicate anomalies.


## Visualization

Below we present visualizations to better understand the data distribution


In [None]:
# ---------------------------------------------------
# Plot distribution of all features except 'Outcome'
# ---------------------------------------------------

# Pick the features by name, not by position: 'Insulin_log' was appended to the
# frame after 'Outcome', so dropping the last column (df.columns[:-1]) would plot
# 'Outcome' and leave 'Insulin_log' out.
feature_columns = df.columns.drop("Outcome")

for col in feature_columns:
    plt.figure(figsize=(6,4))  # set figure size for readability

    # Plot histogram with KDE (Kernel Density Estimate)
    sns.histplot(df[col], kde=True, bins=30)

    # Add a clear title
    plt.title(f"Distribution of {col}", fontsize=14)

    # Show the plot
    plt.show()



In [None]:
# Class balance: number of rows for each Outcome value
sns.countplot(data=df, x='Outcome')
plt.show()


In [None]:
# Glucose spread within each Outcome group
sns.boxplot(data=df, x='Outcome', y='Glucose')
plt.show()


In [None]:

# Heatmap of the pairwise correlations between all numerical columns of the frame
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Glucose against Insulin, one point per row, coloured by Outcome
outcome_palette = {0:'blue', 1:'red'}

plt.figure(figsize=(8,6))
scatter_ax = sns.scatterplot(data=df, x='Glucose', y='Insulin', hue='Outcome', palette=outcome_palette)

# Title and axis labels are set on the Axes returned by seaborn
scatter_ax.set_title('Glucose vs Insulin (Blue: Non-diabetic, Red: Diabetic)')
scatter_ax.set_xlabel('Glucose')
scatter_ax.set_ylabel('Insulin')
plt.show()


### Interpretation of Glucose vs. Insulin Plot

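- **Diabetics (red):**  
  Tend to show higher glucose levels, along with either:  
  - **Lower insulin levels** (which could be consistent with Type 1 diabetes), or  
  - **Higher insulin levels** (which could be consistent with Type 2 diabetes).  

  Note that the dataset has no column recording the diabetes type, and the missing insulin values were filled in by KNN imputation, so this split is a hypothesis rather than something the plot can confirm.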

- **Non-diabetics (blue):**  
  Mostly cluster within a "healthy range" for both glucose and insulin values, 
  reflecting normal metabolic function.


In [None]:
# Pairwise relationships between all columns of the dataset;
# points are coloured by 'Outcome' to tell diabetics from non-diabetics
sns.pairplot(data=df, hue='Outcome')
plt.show()


In [None]:
# BMI density, split by Outcome
plt.figure(figsize=(8,6))

# One filled KDE curve per group: non-diabetic (Outcome 0) first, then diabetic (Outcome 1)
for outcome_value, group_label, group_color in [(0, 'Non-diabetic', 'blue'), (1, 'Diabetic', 'red')]:
    group_values = df.loc[df['Outcome'] == outcome_value, 'BMI']
    sns.kdeplot(data=group_values, label=group_label, fill=True, color=group_color)

# Title, axis labels and legend
plt.title('BMI Distribution: Diabetic vs Non-diabetic')
plt.xlabel('BMI')
plt.ylabel('Density')
plt.legend()

plt.show()


## Key Observations

**BMI Distribution Shift:**  
The diabetic group (red) likely has a higher average BMI than the non-diabetic group (blue), as its density curve seems shifted to the right (toward higher BMI values).  
This aligns with known medical trends: obesity (BMI ≥30) is a major risk factor for Type 2 diabetes.

**Spread/Variability:**  
The diabetic group may show wider variability in BMI (broader curve), suggesting greater diversity in body weight among diabetics.  
The non-diabetic group’s curve is narrower, indicating most cluster around a lower BMI range.

**Overlap:**  
There’s significant overlap between groups, meaning BMI alone isn’t a perfect predictor of diabetes status (some non-diabetics have high BMIs, and vice versa).


In [None]:
# Insulin density, split by Outcome
plt.figure(figsize=(8,6))

# One filled KDE curve per group: non-diabetic (Outcome 0) first, then diabetic (Outcome 1)
for outcome_value, group_label, group_color in [(0, 'Non-diabetic', 'blue'), (1, 'Diabetic', 'red')]:
    group_values = df.loc[df['Outcome'] == outcome_value, 'Insulin']
    sns.kdeplot(data=group_values, label=group_label, fill=True, color=group_color)

# Title, axis labels and legend
plt.title('Insulin Distribution: Diabetic vs Non-diabetic')
plt.xlabel('Insulin')
plt.ylabel('Density')
plt.legend()

plt.show()


## Key Observations

**Diabetics Have Bimodal Insulin Distribution:**  
The red curve (diabetic) has two peaks:  
- One at lower insulin levels (~0–200 μU/mL), likely representing Type 1 diabetics (insulin deficiency).  
- Another at higher insulin levels (~400–800 μU/mL), likely representing Type 2 diabetics (insulin resistance).

**Non-Diabetics Have a Unimodal, Tighter Distribution:**  
The blue curve (non-diabetic) peaks at moderate insulin levels (~50–150 μU/mL), reflecting normal metabolic function.  
Their insulin range is narrower, with very few outliers.


In [None]:
# Glucose density, split by Outcome
plt.figure(figsize=(8,6))

# One filled KDE curve per group: non-diabetic (Outcome 0) first, then diabetic (Outcome 1)
for outcome_value, group_label, group_color in [(0, 'Non-diabetic', 'blue'), (1, 'Diabetic', 'red')]:
    group_values = df.loc[df['Outcome'] == outcome_value, 'Glucose']
    sns.kdeplot(data=group_values, label=group_label, fill=True, color=group_color)

# Title, axis labels and legend
plt.title('Glucose Distribution: Diabetic vs Non-diabetic')
plt.xlabel('Glucose')
plt.ylabel('Density')
plt.legend()

plt.show()


## Key Observations

**Clear Separation Between Groups:**  
The diabetic group (red) shows higher glucose levels overall, with a peak likely around 125–200 mg/dL (prediabetic/diabetic range).  
The non-diabetic group (blue) peaks at a lower glucose range (~70–100 mg/dL), which is the normal fasting glucose range.

**Minimal Overlap:**  
Unlike BMI or insulin, glucose shows better separability between groups, making it a stronger diagnostic marker.  
Overlap occurs around 100–125 mg/dL (prediabetic range), where some misclassification could happen.


In [None]:
# DiabetesPedigreeFunction (DPF) density, split by Outcome
plt.figure(figsize=(8,6))

# One filled KDE curve per group: non-diabetic (Outcome 0) first, then diabetic (Outcome 1)
for outcome_value, group_label, group_color in [(0, 'Non-diabetic', 'blue'), (1, 'Diabetic', 'red')]:
    group_values = df.loc[df['Outcome'] == outcome_value, 'DiabetesPedigreeFunction']
    sns.kdeplot(data=group_values, label=group_label, fill=True, color=group_color)

# Title, axis labels and legend
plt.title('DiabetesPedigreeFunction Distribution: Diabetic vs Non-diabetic')
plt.xlabel('DiabetesPedigreeFunction')
plt.ylabel('Density')
plt.legend()

plt.show()


## Key Observation

Even people with high genetic risk (DiabetesPedigreeFunction, DPF) might not develop diabetes due to lifestyle factors (e.g., exercise, diet).  
The plot above shows that both diabetics (red) and non-diabetics (blue) have overlapping DPF values, indicating that genetics alone does not fully determine diabetes risk.


## Feature Selection


In [None]:
# Rank the features with a Random Forest's feature importances
from sklearn.ensemble import RandomForestClassifier

# Separate features and target.
# 'Insulin_log' is log1p('Insulin'), i.e. the same signal twice; keeping both would
# split that signal's importance across two bars, so the duplicate is left out of
# this ranking. errors="ignore" keeps the cell runnable when the column is absent.
X = df.drop(columns=["Outcome", "Insulin_log"], errors="ignore")  # X = features
y = df["Outcome"]                                                  # y = target column

# Build the model (fixed seed so the ranking is reproducible)
model = RandomForestClassifier(random_state=42)

# Train the model on the dataset
model.fit(X, y)

# Get feature importances, labelled by feature name
importances = pd.Series(model.feature_importances_, index=X.columns)

# Sort and plot importance of each feature
ax = importances.sort_values(ascending=False).plot(kind='bar')
ax.set_title("Random Forest feature importance")
ax.set_ylabel("Importance")
plt.show()


#### Based on feature importance, Glucose, BMI, Insulin, Age, etc. are the most predictive features.  
#### Since all features contribute some information, we will use all features in the model.


**Data Splitting**

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'Outcome' column; features are every other column
y = df["Outcome"]
X = df.drop(columns="Outcome")

# 80% training / 20% testing, stratified on the target, fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


**Data Training using Random forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 100 trees, fixed seed
rf_settings = {"random_state": 42, "n_estimators": 100}
rf_model = RandomForestClassifier(**rf_settings)

# Fit on the training split
rf_model.fit(X_train, y_train)


In [None]:
# Model prediction: labels predicted by the baseline Random Forest for the test features
y_pred = rf_model.predict(X_test)


In [None]:
# Evaluate the baseline Random Forest predictions against the test labels
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Scalar metrics, printed in a fixed order
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:

# Random Forest with class_weight="balanced": train, predict and evaluate
balanced_settings = {
    "random_state": 42,
    "n_estimators": 100,
    "class_weight": "balanced",
}
rf_model_balanced = RandomForestClassifier(**balanced_settings)

# Fit on the training split
rf_model_balanced.fit(X_train, y_train)

# Predict the test split
y_pred_balanced = rf_model_balanced.predict(X_test)

# Scalar metrics, printed in a fixed order
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_balanced))
print("\nClassification Report:\n", classification_report(y_test, y_pred_balanced))

# Confusion matrix drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_balanced)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# Tune a Random Forest with GridSearchCV (5-fold cross-validation, F1 scoring) over
# the number of trees, depth, min samples and class weights.
#
# Scaling and SMOTE are steps of an imblearn Pipeline, so they are re-fitted on the
# training part of every CV fold. Resampling the whole training split before the
# search would put synthetic points derived from a validation fold into the matching
# training folds and inflate the CV F1 that picks the hyperparameters.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# Features and target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Train-test split (stratify to maintain class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing + model as a single estimator: standardize, oversample, then fit the forest
scaler = StandardScaler()
smote = SMOTE(random_state=42)
rf = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline(steps=[("scaler", scaler), ("smote", smote), ("rf", rf)])

# Hyperparameter grid; the "rf__" prefix routes each entry to the forest step
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 5, 10],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__class_weight': ['balanced', 'balanced_subsample']
}

# The search is fitted on the untouched training split; the pipeline handles
# scaling and resampling inside each fold
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model (the whole pipeline with the winning forest settings)
best_rf = grid_search.best_estimator_

# Predictions: the raw test features go in, the pipeline scales them itself
y_pred = best_rf.predict(X_test)

# Evaluate
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


### Model Development Strategy

After experimenting with **Random Forest hyperparameter tuning** using `GridSearchCV`, we observed **no significant improvement** in model performance.  

Therefore, instead of further tuning a single model, we will proceed to **train multiple machine learning models** (e.g., Logistic Regression, Decision Tree, Random Forest, XGBoost) and **compare their accuracy and other evaluation metrics**. The goal is to identify the model that performs best on our diabetes prediction task.


In [None]:
# Compare several classifiers trained on the same scaled + SMOTE-resampled data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# --- Features and target ---
X = df.drop(columns="Outcome")
y = df["Outcome"]

# --- Stratified train-test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Scale numeric features (scaler is fitted on the training split only) ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Apply SMOTE to the scaled training data ---
smote = SMOTE(random_state=42, sampling_strategy=0.7)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# --- Candidate models, keyed by the name shown in the results table ---
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
}

# --- Metrics to report, in table-column order ---
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score,
}

# --- Train each model, predict the test split, collect one result row per model ---
# NOTE(review): the models are ranked on the same test split that is later used to
# report performance — confirm this is acceptable for the analysis.
results = []
for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test_scaled)

    row = {"Model": name}
    row.update({metric: score_fn(y_test, y_pred) for metric, score_fn in metric_functions.items()})
    results.append(row)

# --- Results table, best F1-score first ---
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1-score", ascending=False)
    .reset_index(drop=True)
)
print(results_df)


**Observation:**  
- Among all models, **AdaBoost achieved the highest Accuracy (72.7%) and F1-score (0.604)**, indicating better overall performance in correctly predicting diabetic and non-diabetic cases.  
- Therefore, **AdaBoost is selected as the best-performing model** for this diabetes prediction task.

In [None]:
# Replace the raw Insulin column with its log-transformed version.
# The guard makes the cell safe to re-run: once 'Insulin_log' has been renamed to
# 'Insulin', running the unguarded drop again would delete the log-transformed
# column itself.
if "Insulin_log" in df.columns:
    df = df.drop(columns=["Insulin"]).rename(columns={"Insulin_log": "Insulin"})

In [None]:
# Cross-validate, tune and evaluate the AdaBoost model
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# --- Features and target ---
X = df.drop(columns="Outcome")
y = df["Outcome"]

# --- Stratified train-test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Scale numeric features (scaler is fitted on the training split only) ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Apply SMOTE to the scaled training data ---
smote = SMOTE(random_state=42, sampling_strategy=0.7)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# --- Cross-validation of the initial AdaBoost ---
# NOTE(review): the folds below are cut from data that was already resampled, so
# the CV F1-scores may look better than on untouched data — confirm.
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)  # depth-1 tree as weak learner
ada = AdaBoostClassifier(estimator=base_estimator, random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(ada, X_train_res, y_train_res, cv=cv, scoring='f1')
print("5-Fold CV F1-scores:", cv_scores)
print("Mean CV F1-score:", cv_scores.mean())

# --- Hyperparameter tuning (grid search) ---
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    estimator__max_depth=[1, 2, 3],  # depth of the weak tree
)

grid_search = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    scoring='f1',  # F1 of class 1
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_res, y_train_res)

# Best AdaBoost model found by the search
best_ada = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# --- Final evaluation on the test set ---
y_pred = best_ada.predict(X_test_scaled)
y_prob = best_ada.predict_proba(X_test_scaled)[:, 1]  # second probability column

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


### AdaBoost Model Development and Hyperparameter Tuning

1. **Data Preparation and Scaling**  
   - Features (`X`) and target (`Outcome`) were separated from the dataset.  
   - Data was split into training (80%) and testing (20%) sets using stratification to preserve class distribution.  
   - Numeric features were standardized using `StandardScaler` to improve model performance.  
   - SMOTE was applied on the training data to address class imbalance, generating synthetic samples for the minority class.

2. **Initial Model Evaluation (Cross-Validation)**  
   - An initial **AdaBoost** model was trained using a **DecisionTreeClassifier with max_depth=1** as the weak learner.  
   - **5-fold Stratified Cross-Validation** was performed on the resampled training data to evaluate baseline F1-score.  
   - This step ensured that the model performance was stable across different splits.

3. **Hyperparameter Tuning (Grid Search)**  
   - Grid search was performed to tune the following parameters:
     - `n_estimators`: number of boosting rounds  
     - `learning_rate`: step size for updating weights  
     - `estimator__max_depth`: depth of each weak decision tree  
   - `GridSearchCV` with 5-fold stratified CV was used, optimizing the **F1-score** for the minority class.  
   - The best hyperparameters were selected to improve model performance.

4. **Final Model Evaluation**  
   - The best AdaBoost model was evaluated on the **test set**.  
   - Metrics reported include **Accuracy, ROC-AUC, and detailed Classification Report** (Precision, Recall, F1-score).  
   - This ensures that the model generalizes well to unseen data and balances class-specific performance.

**Summary:**  
This workflow combines **data balancing (SMOTE)**, **feature scaling**, **cross-validation**, and **hyperparameter tuning** to train a robust AdaBoost model for diabetes prediction.


In [None]:
import pickle

# Objects to persist, keyed by output file name: the tuned AdaBoost model first,
# then the StandardScaler fitted in the tuning cell.
# NOTE(review): the model was trained on scaled features in the column order of X,
# where 'Insulin' holds the log1p-transformed values and sits last after the
# rename cell — inputs at inference time presumably need the same order and
# transform; confirm against the consumer of these files.
artifacts = {
    "ada_classification_model.pkl": best_ada,
    "ada_scaler_model.pkl": scaler,
}

for file_name, artifact in artifacts.items():
    with open(file_name, "wb") as f:
        pickle.dump(artifact, f)


### Saving the Model and Scaler

I saved the trained **AdaBoost model** and the **StandardScaler** using `pickle` so they can be reused later without retraining.  

- `ada_classification_model.pkl` → stores the trained AdaBoost model  
- `ada_scaler_model.pkl` → stores the fitted scaler for consistent feature scaling
