In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import warnings

# Silence every Python warning so the notebook output stays compact.
# NOTE(review): this blanket filter also hides deprecation notices from the
# libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# 1. Load Data
# A missing dataset is reported by raising instead of calling exit():
# inside a notebook exit() shuts the kernel down, and in a plain script it
# ends the process with a *success* status even though nothing was loaded.
DATA_PATH = 'heart_cleveland_upload.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: '{DATA_PATH}' not found. Place the CSV in the working "
        "directory before running this cell."
    ) from err
print("Data loaded successfully.")

# ----------------------------------------------------
# STEP 1: FEATURE ENGINEERING (New)
# ----------------------------------------------------
# Each new column is the element-wise product of two existing columns.
# The mapping is: new column name -> (left operand, right operand).
interaction_terms = {
    'age_x_trestbps': ('age', 'trestbps'),
    'age_x_chol': ('age', 'chol'),
    'chol_x_trestbps': ('chol', 'trestbps'),
}
for new_column, (left, right) in interaction_terms.items():
    df[new_column] = df[left] * df[right]
print("Created new interaction features.")

# 2. Define Features (X) and Target (y)
# 'condition' is the label; every other column of df is a feature.
y = df['condition']
X = df.drop(columns='condition')

# Columns that will be standardized by the preprocessing step
continuous_features = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
    'age_x_trestbps', 'age_x_chol', 'chol_x_trestbps',
]

# Whatever is left over (in original column order) counts as categorical/binary
continuous_lookup = frozenset(continuous_features)
categorical_features = [name for name in X.columns if name not in continuous_lookup]

# 3. Split Data
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ----------------------------------------------------
# STEP 2: CREATE A PREPROCESSING & MODELING PIPELINE (New)
# ----------------------------------------------------

# Standardize only the continuous columns; remainder='passthrough' forwards
# every other column to the model unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features)
    ],
    remainder='passthrough' # Keep the categorical columns as-is
)

# Create the full pipeline
# 'preprocessor' is our scaling step
# 'model' is our XGBoost classifier
#
# use_label_encoder is intentionally NOT passed: the option is deprecated /
# removed in current XGBoost releases, where supplying it only produces an
# "unused parameter" warning on every fit.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42
    ))
])

# ----------------------------------------------------
# STEP 3: SMARTER HYPERPARAMETER TUNING (RandomizedSearchCV)
# ----------------------------------------------------
# Candidate values for the XGBoost step of the pipeline.
# Every key carries the 'model__' prefix so the pipeline routes the value
# to its 'model' step.

# Value lists that several parameters have in common
fraction_choices = [0.7, 0.8, 0.9, 1.0]  # subsample / colsample_bytree
penalty_choices = [0, 0.1, 0.5, 1]       # gamma / reg_alpha / reg_lambda

param_grid_random = {
    # Number of trees: 100, 200, ..., 500
    'model__n_estimators': list(range(100, 501, 100)),
    # Tree depth: 3 through 7
    'model__max_depth': list(range(3, 8)),
    # Learning-rate options
    'model__learning_rate': [0.01, 0.05, 0.1, 0.15],
    'model__subsample': fraction_choices,
    'model__colsample_bytree': fraction_choices,
    # Regularization settings, included to limit overfitting
    'model__gamma': penalty_choices,
    'model__reg_alpha': penalty_choices,
    'model__reg_lambda': penalty_choices,
}

# Randomized search: 100 sampled combinations (lower n_iter if this is too
# slow), each scored by accuracy with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid_random,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

print("\nStarting smarter hyperparameter tuning (RandomizedSearchCV)...")
# 5. Train the model
# Runs the randomized search on the training split only.
random_search.fit(X_train, y_train)

# Pipeline configured with the best-scoring parameter combination
best_model = random_search.best_estimator_

# Summary of the search: chosen parameters and mean cross-validated accuracy
print(
    "Tuning complete.",
    f"Best Parameters found: {random_search.best_params_}",
    f"Best Accuracy during tuning: {random_search.best_score_ * 100:.2f}%",
    sep="\n",
)

# 6. Evaluate the Best Model on the Test Set
# The held-out split is used here for the first time.
print("\nEvaluating the best model on the test set...")
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Test Set Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report on Test Set:")
# Display names for the two label values, in label order (0, then 1)
class_names = ['No Disease (0)', 'Disease (1)']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# 7. Save the Model
# What gets persisted is the whole fitted pipeline (scaler + classifier),
# so a loaded copy applies the same scaling to new rows before predicting.
model_filename = 'xgboost_heart_pipeline_v2.pkl'
joblib.dump(value=best_model, filename=model_filename)

print(f"\nModel (full pipeline) saved successfully as '{model_filename}'")

Data loaded successfully.
Created new interaction features.

Starting smarter hyperparameter tuning (RandomizedSearchCV)...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Tuning complete.
Best Parameters found: {'model__subsample': 0.9, 'model__reg_lambda': 0.5, 'model__reg_alpha': 1, 'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.01, 'model__gamma': 0, 'model__colsample_bytree': 0.8}
Best Accuracy during tuning: 82.77%

Evaluating the best model on the test set...
Test Set Accuracy: 83.33%

Classification Report on Test Set:
                precision    recall  f1-score   support

No Disease (0)       0.79      0.94      0.86        32
   Disease (1)       0.91      0.71      0.80        28

      accuracy                           0.83        60
     macro avg       0.85      0.83      0.83        60
  weighted avg       0.85      0.83      0.83        60


Model (full pipeline) saved successfully as 'xgboost_heart_pipeline_v2.pkl'


In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 1.8/72.0 MB 12.6 MB/s eta 0:00:06
   -- ------------------------------------- 5.0/72.0 MB 14.4 MB/s eta 0:00:05
   ----- ---------------------------------- 10.2/72.0 MB 17.7 MB/s eta 0:00:04
   -------- ------------------------------- 15.7/72.0 MB 19.8 MB/s eta 0:00:03
   ------------- -------------------------- 23.9/72.0 MB 23.6 MB/s eta 0:00:03
   ----------------- ---------------------- 31.2/72.0 MB 25.7 MB/s eta 0:00:02
   -------------------- ------------------- 37.7/72.0 MB 26.7 MB/s eta 0:00:02
   ------------------------- -------------- 46.7/72.0 MB 28.6 MB/s eta 0:00:01
   ----------------------------- ---------- 53.2/72.0 MB 29.2 MB/s eta 0:00:01
   --------------------------------- ------ 60.8/72.0 MB 29.6 MB/s eta 0:0


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
