In [1]:
!pip uninstall xgboost -y
!pip install xgboost==2.0.3

import xgboost as xgb
print(f"XGBoost version: {xgb.__version__}")

Found existing installation: xgboost 2.0.3
Uninstalling xgboost-2.0.3:
  Successfully uninstalled xgboost-2.0.3
Collecting xgboost==2.0.3
  Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
XGBoost version: 2.0.3


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Mount Drive if needed
from google.colab import drive
drive.mount('/content/drive')

# Load data (single place to change if the dataset folder moves)
DATA_DIR = '/content/drive/MyDrive/kaggle_heart_disease'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Prepare
# Series.map turns every label that is missing from the dict into NaN without
# complaint, so check for unmapped values here instead of failing (or silently
# mis-training) later inside the model fit.
LABEL_MAP = {'Absence': 0, 'Presence': 1}
encoded_target = train['Heart Disease'].map(LABEL_MAP)
unmapped_labels = train.loc[encoded_target.isna(), 'Heart Disease'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected 'Heart Disease' labels: {list(unmapped_labels)}")
train['Heart Disease'] = encoded_target.astype(int)

X = train.drop(columns=['id', 'Heart Disease'])
y = train['Heart Disease']
test_ids = test['id']
X_test = test.drop(columns=['id'])

# The model is fit on X and applied to X_test, so both must expose the same
# feature columns in the same order; verify before any expensive training.
if list(X.columns) != list(X_test.columns):
    raise ValueError(
        f"Train/test feature mismatch: {list(X.columns)} vs {list(X_test.columns)}"
    )

print(f"Train: {X.shape}, Test: {X_test.shape}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train: (630000, 13), Test: (270000, 13)


In [3]:
# 5-fold stratified cross-validation of the XGBoost classifier.
# Each fold trains with early stopping on its validation split and reports the
# ROC AUC on that same split, so the score is measured at the selected
# iteration (a slightly optimistic estimate).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
best_iterations = []  # kept so the final model's round count can be chosen from CV

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold+1}")

    X_train_fold = X.iloc[train_idx]
    y_train_fold = y.iloc[train_idx]
    X_val_fold = X.iloc[val_idx]
    y_val_fold = y.iloc[val_idx]

    # use_label_encoder is intentionally not passed: it appears to be no longer
    # an XGBClassifier parameter in the pinned 2.0.x release, so passing it
    # should only trigger an "unused parameter" warning.
    model = xgb.XGBClassifier(
        n_estimators=1000,  # upper bound; early stopping picks the actual count
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='auc',
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50
    )

    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False
    )

    y_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    auc = roc_auc_score(y_val_fold, y_pred_proba)
    cv_scores.append(auc)

    best_iter = model.best_iteration
    best_iterations.append(best_iter)
    print(f"ROC AUC: {auc:.5f} | Best iter: {best_iter}")

print(f"\nCV: {np.mean(cv_scores):.5f} ± {np.std(cv_scores):.5f}")
print(f"Mean best iter: {np.mean(best_iterations):.0f}")


Fold 1
ROC AUC: 0.95558 | Best iter: 484

Fold 2
ROC AUC: 0.95461 | Best iter: 471

Fold 3
ROC AUC: 0.95537 | Best iter: 476

Fold 4
ROC AUC: 0.95493 | Best iter: 443

Fold 5
ROC AUC: 0.95574 | Best iter: 522

CV: 0.95525 ± 0.00042


In [4]:
# Train on full data
# There is no validation split here, so early stopping cannot run and the
# number of boosting rounds must be fixed up front. Cross-validation with the
# same hyperparameters stopped at 443-522 rounds per fold, so 1000 rounds would
# be roughly double the validated count; use a value near the top of that range
# (slightly above the fold mean, since the full set is larger than a CV fold).
FINAL_N_ESTIMATORS = 500

final_model = xgb.XGBClassifier(
    n_estimators=FINAL_N_ESTIMATORS,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

final_model.fit(X, y, verbose=False)

# Predict: probability of the positive class (label 1, 'Presence')
test_preds = final_model.predict_proba(X_test)[:, 1]

# Submission
submission = pd.DataFrame({
    'id': test_ids,
    'Heart Disease': test_preds
})
submission.to_csv('first_submission.csv', index=False)
print("Submission ready!")

# Download (Colab-only helper; triggers a browser download of the CSV)
from google.colab import files
files.download('first_submission.csv')

Submission ready!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>