
-----

# 📘 ML Project - Week 3: Model Creation, Evaluation & Tuning
-----

## 1\. Setup & Data Preparation
**Q1:** Import the necessary libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler , RobustScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import pickle

# Use seaborn's "whitegrid" style for every figure drawn later in the notebook.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

**Q2:** Load the dataset and display the first 5 rows.

In [None]:
# Read the CSV into a DataFrame; the path is relative to the notebook's
# working directory. (Presumably the cleaned output of the Week 2 notebook,
# judging by the file name -- verify.)
df = pd.read_csv('cardio_cleaned_week2.csv')
# Last expression of the cell: rich display of the first five rows.
df.head()

**Q3:** Separate the dataset into Features (`X`) and Target (`y`).

In [None]:
# 'cardio' is the label to predict. 'age', 'bmi_cat' and 'id' are also left
# out of the feature matrix (the prediction cell at the end of the notebook
# feeds 'age_years' and 'bmi' instead of the raw 'age' / 'bmi_cat' columns).
target_col = 'cardio'
excluded_cols = [target_col, 'age', 'bmi_cat', 'id']

X = df.drop(columns=excluded_cols)
y = df[target_col]

-----

## 2\. Train-Test Split (Step 1)
**Q4:** Split the dataset into Training (80%) and Testing (20%) sets.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across Restart & Run All.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity check: (rows, features) of each partition.
print(f"Training Shape: {X_train.shape}")
print(f"Testing Shape:  {X_test.shape}")

-----

## 3\. Normalization / Scaling (Step 2)

**Q6:** Scale the data:

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test-set information leaks into
# the scaler.
# Alternative that was considered: scaler = RobustScaler()
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data Scaled Successfully.")

-----

## 4\. Model Training 
**Q8:** Initialize the **Model**.

In [None]:
# Random forest configuration.
# Earlier settings that were tried, kept here for reference only:
#   - RandomForestClassifier(n_estimators=100, random_state=42)
#   - n_estimators=200, max_depth=10, min_samples_leaf=10,
#     min_samples_split=10, random_state=42
rf_params = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_split=15,
    min_samples_leaf=5,
    max_features='log2',
    bootstrap=True,
    class_weight='balanced',  # weight classes inversely to their frequency
    random_state=42,          # reproducible trees
    n_jobs=-1,                # use every available core
)

model1 = RandomForestClassifier(**rf_params)


In [None]:
# Gradient boosting configuration, grouped by what each setting controls.
gb_params = {
    # Ensemble size and shrinkage per boosting round.
    'n_estimators': 300,
    'learning_rate': 0.05,
    # Limits on the complexity of each individual tree.
    'max_depth': 4,
    'min_samples_split': 20,
    'min_samples_leaf': 15,
    # Row and feature subsampling for each tree.
    'subsample': 0.8,
    'max_features': 'sqrt',
    # Early stopping: 10% of the training data is held out internally and
    # boosting stops after 10 rounds without improvement on it.
    'validation_fraction': 0.1,
    'n_iter_no_change': 10,
    'random_state': 42,
}

model2 = GradientBoostingClassifier(**gb_params)

In [None]:

# Negative-to-positive ratio of the training labels, passed to XGBoost as
# scale_pos_weight so the positive class is re-weighted accordingly.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
ratio = float(n_negative) / n_positive

xgb_params = {
    # Many small steps: 1000 rounds at a low learning rate.
    'n_estimators': 1000,
    'learning_rate': 0.01,
    # Tree shape / split regularisation.
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0.2,
    # Row and column subsampling per tree.
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': ratio,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'random_state': 42,
    'n_jobs': -1,
    # NOTE(review): this model is later fit on the scaler's output, so no
    # categorical-dtype columns reach it -- confirm whether this flag is needed.
    'enable_categorical': True,
}

model3 = XGBClassifier(**xgb_params)

**Q9:** Train (fit) the model using the **Scaled Training Data**.

In [None]:
# Fit the random forest (model1) on the scaled training features.
model1.fit(X_train_scaled, y_train)
print("Model Trained.")

In [None]:
# Fit the gradient boosting classifier (model2) on the scaled training features.
model2.fit(X_train_scaled, y_train)
print("Model Trained.")

In [None]:
# Fit the XGBoost classifier (model3) on the scaled training features.
model3.fit(X_train_scaled, y_train)
print("Model Trained.")

-----

## 5. Evaluation 

**Q10:** Generate predictions on the **Scaled Test Set** (`X_test_scaled`).

In [None]:
# Predicted class labels from the random forest for the held-out test rows.
y_pred1 = model1.predict(X_test_scaled)

In [None]:
# Predicted class labels from gradient boosting for the held-out test rows.
y_pred2 = model2.predict(X_test_scaled)

In [None]:
# Predicted class labels from XGBoost for the held-out test rows.
y_pred3 = model3.predict(X_test_scaled)

**Q11:** Calculate the **Accuracy Score**.

In [None]:
# Fraction of test rows the random forest labelled correctly, shown as a percentage.
accuracy1 = accuracy_score(y_test, y_pred1)
print(f"Test Accuracy: {accuracy1 * 100:.2f}%")

In [None]:
# Fraction of test rows gradient boosting labelled correctly, shown as a percentage.
accuracy2 = accuracy_score(y_test, y_pred2)
print(f"Test Accuracy: {accuracy2 * 100:.2f}%")

In [None]:
# Fraction of test rows XGBoost labelled correctly, shown as a percentage.
accuracy3 = accuracy_score(y_test, y_pred3)
print(f"Test Accuracy: {accuracy3 * 100:.2f}%")

**Q12:** Display the **Confusion Matrix** using a Heatmap.


In [None]:
# Draw one confusion-matrix heatmap per model, side by side, so the three
# classifiers can be compared at a glance. plt, sns and confusion_matrix come
# from the import cell at the top of the notebook; they are not re-imported here.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
predictions = [y_pred1, y_pred2, y_pred3]
model_names = ['Model 1 (Random Forest)', 'Model 2 (Grad Boost)', 'Model 3 (XGBoost)']

# Pair each subplot with its model name and that model's test-set predictions.
for ax, name, pred in zip(axes, model_names, predictions):
    # Rows are the actual labels, columns the predicted labels.
    cm = confusion_matrix(y_test, pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(name)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

**Q13:** Print the full **Classification Report** (Precision, Recall, F1-Score).

In [None]:
# Print precision / recall / F1 for each model's test-set predictions.
# Only the names and the predictions are needed here, so no list of the
# estimator objects is built.
model_names = ['Random Forest', 'Gradient Boosting', 'XGBoost']
predictions = [y_pred1, y_pred2, y_pred3]

print("="*60)
print("FINAL MODEL COMPARISON REPORT")
print("="*60)

for name, pred in zip(model_names, predictions):
    print(f"\n--- {name} ---")
    print(classification_report(y_test, pred))

# Optional ROC curve for the XGBoost model (left disabled).
# The first two lines define the values the plot needs, so the block runs
# as-is once uncommented.
# y_prob3 = model3.predict_proba(X_test_scaled)[:, 1]
# roc_auc = roc_auc_score(y_test, y_prob3)
# fpr, tpr, thresholds = roc_curve(y_test, y_prob3)
# plt.figure(figsize=(6, 4))
# plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend()
# plt.show()

-----

## 6. Overfitting & Underfitting Check 

**Q14:** Check for overfitting by comparing training and testing accuracy.

In [None]:
# Compare XGBoost's accuracy on the data it was trained on with its accuracy
# on the held-out test set; a gap above 10 percentage points is flagged.
y_train_pred = model3.predict(X_train_scaled)
train_acc, test_acc = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_pred3),
)

print(f"Training Accuracy: {train_acc*100:.2f}%")
print(f"Testing Accuracy:  {test_acc*100:.2f}%")

accuracy_gap = train_acc - test_acc
verdict = (
    "Warning: Potential Overfitting detected."
    if accuracy_gap > 0.10
    else "Good Fit: Train and Test scores are balanced."
)
print(verdict)

-----

## 7. Hyperparameter Tuning 

**Q16:** Grid Search Setup.

In [None]:
# Candidate random-forest settings: 2 * 3 * 2 * 3 * 2 = 72 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search over the grid with 5-fold cross-validation on all cores.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

**Q17:** Run Tuning:

In [None]:
# Run the search on the scaled training data (72 candidates x 5 folds).
grid_search.fit(X_train_scaled, y_train)

# Keep the winning estimator for the export step further down.
best_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print(f"Best CV Accuracy: {grid_search.best_score_ * 100:.2f}%")

-----

## 8\. Final Model Export

**Q20:** Save the best model and the fitted scaler.

In [None]:
# Bundle everything needed to score new rows later:
#   - "model":         the tuned random forest selected by the grid search
#   - "scaler":        the scaler fitted on the training split
#   - "feature_names": the column order both objects were fitted on, so a
#                      consumer of the file can arrange its inputs correctly
#                      without a hand-maintained copy of the list
data_to_save = {
    "model": best_model,
    "scaler": scaler,
    "feature_names": list(X_train.columns),
}

# NOTE: pickle files can execute arbitrary code when loaded; only load this
# artifact from a trusted location.
with open('cardio_model_week3.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

print("Model and Scaler saved to 'cardio_model_week3.pkl'")

In [None]:
# Score one hand-written example row with the XGBoost model.

# Raw measurements for a single example. 'age' is converted to years below
# by dividing by 365.25.
input_data = {
    'age': 20228,
    'gender': 1,
    'height': 156,
    'weight': 85.0,
    'ap_hi': 140,
    'ap_lo': 90,
    'cholesterol': 3,
    'gluc': 1,
    'smoke': 0,
    'alco': 0,
    'active': 1
}

# Derived features.
# NOTE(review): confirm these formulas match how the training CSV derived
# 'age_years', 'bmi' and 'MAP'.
input_data['age_years'] = input_data['age'] / 365.25
input_data['bmi'] = input_data['weight'] / ((input_data['height'] / 100) ** 2)
input_data['MAP'] = input_data['ap_lo'] + (input_data['ap_hi'] - input_data['ap_lo']) / 3

# Take the feature order from the training frame instead of a hand-maintained
# list, so this row always lines up with what the scaler and model were fit on.
columns = list(X_train.columns)

# Fail loudly if a trained-on feature is absent; otherwise the DataFrame
# constructor would silently fill it with NaN.
missing_features = [col for col in columns if col not in input_data]
if missing_features:
    raise KeyError(
        f"input_data is missing features used in training: {missing_features}"
    )

# Keys not listed in `columns` (the raw 'age') are dropped here.
df_single = pd.DataFrame([input_data], columns=columns)

# Scale data with the scaler fitted on the training split.
X_single_scaled = scaler.transform(df_single)

# Predict using model3 (XGBoost)
print("Input Data:")
print(df_single.iloc[0])

prediction = model3.predict(X_single_scaled)[0]
# Column 1 of predict_proba is the probability of the positive class.
probability = model3.predict_proba(X_single_scaled)[0][1]

print("\nPrediction (0: No Cardio Disease, 1: Cardio Disease):", prediction)
print(f"Probability of Cardio Disease: {probability:.2f}")