In [1]:
from google.colab import drive
drive.mount('/content/drive')
# Create the folder
# !mkdir -p "/content/drive/MyDrive/kaggle_heart_disease"

Mounted at /content/drive


In [2]:
import pandas as pd

train = pd.read_csv('/content/drive/MyDrive/kaggle_heart_disease/train.csv')
test = pd.read_csv('/content/drive/MyDrive/kaggle_heart_disease/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/kaggle_heart_disease/sample_submission.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

Train shape: (630000, 15)
Test shape: (270000, 14)
Sample submission shape: (270000, 2)


In [3]:
# Define features and target
train['Heart Disease'] = train['Heart Disease'].map({'Absence': 0, 'Presence': 1})
X = train.drop(['id', 'Heart Disease'], axis=1)
y = train['Heart Disease']

# Keep test IDs for submission
test_ids = test['id']
X_test = test.drop(['id'], axis=1)

In [4]:
categorical_features = [
    'Sex',                # binary
    'Chest pain type',    # ordinal (1-4)
    'FBS over 120',       # binary
    'EKG results',        # ordinal (0-2)
    'Exercise angina',    # binary
    'Slope of ST',        # ordinal (1-3)
    'Number of vessels fluro',  # ordinal (0-3)
    'Thallium'            # ordinal (3,6,7)
]

numerical_features = [
    'Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression'
]

In [5]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgb_cv_scores = []

lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nüî• LightGBM Fold {fold+1}")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    lgb_cv_scores.append(auc)
    print(f"‚úÖ Fold {fold+1} ROC AUC: {auc:.5f}")

print(f"\nüéØ LightGBM CV: {np.mean(lgb_cv_scores):.5f} ¬± {np.std(lgb_cv_scores):.5f}")


üî• LightGBM Fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[638]	valid_0's auc: 0.95562
‚úÖ Fold 1 ROC AUC: 0.95562

üî• LightGBM Fold 2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[430]	valid_0's auc: 0.95462
‚úÖ Fold 2 ROC AUC: 0.95462

üî• LightGBM Fold 3
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[404]	valid_0's auc: 0.955391
‚úÖ Fold 3 ROC AUC: 0.95539

üî• LightGBM Fold 4
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[452]	valid_0's auc: 0.954927
‚úÖ Fold 4 ROC AUC: 0.95493

üî• LightGBM Fold 5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[510]	valid_0's auc: 0.955754
‚úÖ Fold 5 ROC AUC: 0.95575

üéØ LightGBM CV: 0.95526 ¬± 0.00043


In [6]:
!pip install optuna --q

import optuna
import xgboost as xgb
import numpy as np
from sklearn.model_selection import cross_val_score

# Make sure X and y are loaded from previous cells

def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 3)
    }

    model = xgb.XGBClassifier(
        **params,
        n_estimators=500,                # fixed number (no early stopping needed)
        eval_metric='auc',
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False
    )

    score = np.mean(cross_val_score(model, X, y, cv=3, scoring='roc_auc'))
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)   # 50 trials ~ 30-60 min depending on data size

print("Best params:", study.best_params)
print(f"Best CV: {study.best_value:.5f}")

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/413.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m413.9/413.9 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

[I 2026-02-20 22:29:47,632] A new study created in memory with name: no-name-e479f5c9-6fdf-4328-9b4a-a7c61364ea47
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2026-02-20 22:30:49,878] Trial 0 finished with value: 0.9543074471980949 and parameters: {'learning_rate': 0.08964923804749991, 'max_depth': 7, 'subsample': 0.6846380107463494, 'colsample_bytree': 0.9748432020772209, 'min_child_weight': 2, 'gamma': 0.13588958543443913, 'reg_alpha': 0.18100612007058803, 'reg_lambda': 1.1143749620200922}. Best is trial 0 with value: 0.9543074471980949.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_la

Best params: {'learning_rate': 0.10796297366057209, 'max_depth': 4, 'subsample': 0.8209812937137574, 'colsample_bytree': 0.503646232629211, 'min_child_weight': 2, 'gamma': 0.40319071874163315, 'reg_alpha': 0.9284337490805418, 'reg_lambda': 1.0334018678509929}
Best CV: 0.95546


In [7]:
best_params = study.best_params.copy()
best_params.pop('n_estimators', None)
best_params['n_estimators'] = 1000  # we'll use early stopping to find optimal number

final_model = xgb.XGBClassifier(
    **best_params,
    early_stopping_rounds=50,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

# We need an eval_set for early stopping ‚Äî split a small validation set from training
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

print(f"Best iteration: {final_model.best_iteration}")

Best iteration: 430


In [11]:
# Remove n_estimators from best_params so we can pass it separately
params_for_final = best_params.copy()
params_for_final.pop('n_estimators', None)   # delete if present

final_model_full = xgb.XGBClassifier(
    **params_for_final,
    n_estimators=final_model.best_iteration,  # use the optimal number found
    random_state=42,
    n_jobs=-1
)
final_model_full.fit(X, y)

test_preds = final_model_full.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({'id': test_ids, 'Heart Disease': test_preds})
submission.to_csv('optuna_tuned_submission.csv', index=False)

from google.colab import files
files.download('optuna_tuned_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>