In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the CSVs read later live under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
# Create the folder
# !mkdir -p "/content/drive/MyDrive/kaggle_heart_disease"

Mounted at /content/drive


In [None]:
import pandas as pd

# Single place to change if the competition files are moved.
DATA_DIR = '/content/drive/MyDrive/kaggle_heart_disease'

# Load the three competition files from the mounted Drive folder.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Quick shape check of each frame.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

Train shape: (630000, 15)
Test shape: (270000, 14)
Sample submission shape: (270000, 2)


In [None]:
# Define features and target
TARGET_COL = 'Heart Disease'
TARGET_MAP = {'Absence': 0, 'Presence': 1}

# Encode the string labels as 0/1 only while the column is still non-numeric.
# Without this guard, re-running the cell would map the already-encoded
# integers through TARGET_MAP and turn the whole target into NaN.
if not pd.api.types.is_numeric_dtype(train[TARGET_COL]):
    encoded_target = train[TARGET_COL].map(TARGET_MAP)
    if encoded_target.isna().any():
        # .map() yields NaN for any label missing from TARGET_MAP; fail loudly
        # instead of training on a partially-missing target.
        unknown_labels = train.loc[encoded_target.isna(), TARGET_COL].unique()
        raise ValueError(f"Unexpected values in '{TARGET_COL}': {unknown_labels!r}")
    train[TARGET_COL] = encoded_target

# Feature matrix: every column except the row id and the target.
X = train.drop(['id', TARGET_COL], axis=1)
y = train[TARGET_COL]

# Keep test IDs for submission
test_ids = test['id']
X_test = test.drop(['id'], axis=1)

In [None]:
# Discrete-coded columns mapped to a note on how each one is coded.
# Insertion order defines the order of `categorical_features`.
CATEGORICAL_CODING = {
    'Sex': 'binary',
    'Chest pain type': 'ordinal (1-4)',
    'FBS over 120': 'binary',
    'EKG results': 'ordinal (0-2)',
    'Exercise angina': 'binary',
    'Slope of ST': 'ordinal (1-3)',
    'Number of vessels fluro': 'ordinal (0-3)',
    'Thallium': 'ordinal (3,6,7)',
}
categorical_features = list(CATEGORICAL_CODING)

# Continuous measurements.
numerical_features = [
    'Age',
    'BP',
    'Cholesterol',
    'Max HR',
    'ST depression',
]

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

# LightGBM baseline: 5-fold stratified cross-validation scored by ROC AUC.

# Booster settings shared by every fold.
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
    n_jobs=-1,
)

lgb_cv_scores = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nðŸ”¥ LightGBM Fold {fold+1}")

    # Slice out this fold's training rows and held-out rows.
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Up to 1000 rounds, stopping once the held-out AUC has not improved for
    # 50 rounds; log_evaluation(0) is passed to keep per-round logging quiet.
    model = lgb.train(
        params=lgb_params,
        train_set=train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    # Score the held-out rows and record the fold's AUC.
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    lgb_cv_scores.append(auc)
    print(f"âœ… Fold {fold+1} ROC AUC: {auc:.5f}")

# Mean and standard deviation of the per-fold AUCs.
print(f"\nðŸŽ¯ LightGBM CV: {np.mean(lgb_cv_scores):.5f} Â± {np.std(lgb_cv_scores):.5f}")


ðŸ”¥ LightGBM Fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[638]	valid_0's auc: 0.95562
âœ… Fold 1 ROC AUC: 0.95562

ðŸ”¥ LightGBM Fold 2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[430]	valid_0's auc: 0.95462
âœ… Fold 2 ROC AUC: 0.95462

ðŸ”¥ LightGBM Fold 3
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[404]	valid_0's auc: 0.955391
âœ… Fold 3 ROC AUC: 0.95539

ðŸ”¥ LightGBM Fold 4
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[452]	valid_0's auc: 0.954927
âœ… Fold 4 ROC AUC: 0.95493

ðŸ”¥ LightGBM Fold 5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[510]	valid_0's auc: 0.955754
âœ… Fold 5 ROC AUC: 0.95575

ðŸŽ¯ LightGBM CV: 0.95526 Â± 0.00043


In [None]:
!pip install optuna --q

import optuna
import xgboost as xgb
import numpy as np
from sklearn.model_selection import cross_val_score

# Make sure X and y are loaded from previous cells

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of an XGBoost classifier.

    Samples one hyperparameter configuration from `trial`, builds an
    `XGBClassifier` with a fixed 500 trees, and scores it on the notebook-level
    `X` / `y` with `cross_val_score(cv=3, scoring='roc_auc')`.

    Args:
        trial: the `optuna` trial used to draw hyperparameter values.

    Returns:
        The mean ROC AUC across the 3 folds (higher is better).
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 3)
    }

    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # ignores it and prints a "Parameters ... are not used" warning on every fit.
    model = xgb.XGBClassifier(
        **params,
        n_estimators=500,                # fixed number (no early stopping needed)
        eval_metric='auc',
        random_state=42,
        n_jobs=-1
    )

    score = np.mean(cross_val_score(model, X, y, cv=3, scoring='roc_auc'))
    return score

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across runs; create_study() alone leaves the sampler unseeded.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)   # 50 trials ~ 30-60 min depending on data size

# Report the best configuration found and its mean CV AUC.
print("Best params:", study.best_params)
print(f"Best CV: {study.best_value:.5f}")

[I 2026-02-15 14:57:33,306] A new study created in memory with name: no-name-06325500-5db1-4b71-8ba4-8a8ab735757e
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2026-02-15 14:58:25,030] Trial 0 finished with value: 0.9544167748435841 and parameters: {'learning_rate': 0.030078397152782127, 'max_depth': 3, 'subsample': 0.8979316099843415, 'colsample_bytree': 0.96369083753472, 'min_child_weight': 1, 'gamma': 0.17150569768456364, 'reg_alpha': 0.0799831791485398, 'reg_lambda': 1.724831391441907}. Best is trial 0 with value: 0.9544167748435841.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label

Best params: {'learning_rate': 0.1501651469422689, 'max_depth': 4, 'subsample': 0.984512922430979, 'colsample_bytree': 0.5068165993248022, 'min_child_weight': 7, 'gamma': 0.3314470999702477, 'reg_alpha': 0.9974681992572415, 'reg_lambda': 2.5372271353502223}
Best CV: 0.95546


In [None]:
# Best hyperparameters printed by the 50-trial Optuna search in this notebook
# (best CV AUC 0.95546). Used only as a fallback so this cell still runs after a
# kernel restart without repeating the search.
RECORDED_BEST_PARAMS = {
    'learning_rate': 0.1501651469422689,
    'max_depth': 4,
    'subsample': 0.984512922430979,
    'colsample_bytree': 0.5068165993248022,
    'min_child_weight': 7,
    'gamma': 0.3314470999702477,
    'reg_alpha': 0.9974681992572415,
    'reg_lambda': 2.5372271353502223,
}

try:
    best_params = study.best_params.copy()
except NameError:
    # `study` only exists if the Optuna cell was executed in this session.
    print("Optuna study not found in this session; using the recorded best params.")
    best_params = dict(RECORDED_BEST_PARAMS)

# Upper bound on the number of trees; early stopping picks the actual count.
best_params['n_estimators'] = 1000

final_model = xgb.XGBClassifier(
    **best_params,
    early_stopping_rounds=50,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

# Early stopping needs an eval_set, so hold out a stratified 10% of the
# training data as a validation split.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

print(f"Best iteration: {final_model.best_iteration}")

NameError: name 'study' is not defined

In [None]:
# Retrain on the full training data with the tree count found by early stopping.

# best_params may already carry an 'n_estimators' entry (it is set for the
# early-stopping fit), which would collide with the explicit keyword below and
# raise "got multiple values for keyword argument 'n_estimators'".
full_fit_params = {k: v for k, v in best_params.items() if k != 'n_estimators'}

# best_iteration is a 0-based round index, so the number of trees to keep is
# best_iteration + 1 (this also avoids n_estimators=0 when the first round is best).
n_trees = final_model.best_iteration + 1

final_model_full = xgb.XGBClassifier(
    **full_fit_params,
    n_estimators=n_trees,
    random_state=42,
    n_jobs=-1
)
final_model_full.fit(X, y)

# Probability of the positive class (label 1) for each test row.
test_preds = final_model_full.predict_proba(X_test)[:, 1]

# Write the submission file with one probability per test id.
submission = pd.DataFrame({'id': test_ids, 'Heart Disease': test_preds})
submission.to_csv('optuna_tuned_submission.csv', index=False)

# Trigger a browser download of the file from the Colab runtime.
from google.colab import files
files.download('optuna_tuned_submission.csv')

TypeError: xgboost.sklearn.XGBClassifier() got multiple values for keyword argument 'n_estimators'

In [None]:
# # From your correlation list:
# top_features = ['Thallium', 'Chest pain type', 'Exercise angina',
#                 'Number of vessels fluro', 'ST depression', 'Slope of ST', 'Max HR']

# # Interactions (multiplicative)
# X['thal_x_chest'] = X['Thallium'] * X['Chest pain type']
# X['exang_x_vessels'] = X['Exercise angina'] * X['Number of vessels fluro']
# X['stdep_x_slope'] = X['ST depression'] * X['Slope of ST']
# X['age_x_maxhr'] = X['Age'] * X['Max HR']

# # Ratios (if denominator > 0)
# X['bp_per_age'] = X['BP'] / (X['Age'] + 1)
# X['chol_per_age'] = X['Cholesterol'] / (X['Age'] + 1)

# # Polynomials (degree=2) for top 3
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
# top3 = X[['Thallium', 'Chest pain type', 'Exercise angina']]
# poly_features = poly.fit_transform(top3)
# poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(top3.columns))
# X = pd.concat([X, poly_df], axis=1)

In [None]:
# # Train both on full data with best params
# xgb_final = xgb.XGBClassifier(**best_xgb_params, n_estimators=1000,
#                               early_stopping_rounds=50, random_state=42)
# xgb_final.fit(X, y, eval_set=[(X, y)], verbose=False)

# lgb_final = lgb.LGBMClassifier(**best_lgb_params, n_estimators=1000,
#                                early_stopping_rounds=50, random_state=42)
# lgb_final.fit(X, y, eval_set=[(X, y)], verbose=False)

# # Predict
# xgb_preds = xgb_final.predict_proba(X_test)[:, 1]
# lgb_preds = lgb_final.predict_proba(X_test)[:, 1]

# # Simple average
# ensemble_preds = (xgb_preds + lgb_preds) / 2

# submission = pd.DataFrame({'id': test_ids, 'Heart Disease': ensemble_preds})
# submission.to_csv('ensemble_xgb_lgb.csv', index=False)

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# import catboost as cb

# base_models = [
#     ('xgb', xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)),
#     ('lgb', lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=31, random_state=42)),
#     ('cb', cb.CatBoostClassifier(iterations=500, learning_rate=0.05, verbose=0, random_state=42))
# ]

# stack = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(), cv=5)
# stack.fit(X, y)
# stack_preds = stack.predict_proba(X_test)[:, 1]

# submission['Heart Disease'] = stack_preds
# submission.to_csv('stacking_submission.csv', index=False)