In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

# Load the competition train/test tables from the mounted Drive folder
train = pd.read_csv('/content/drive/MyDrive/kaggle_heart_disease/train.csv')
test = pd.read_csv('/content/drive/MyDrive/kaggle_heart_disease/test.csv')

# Convert target to 0/1.
# Series.map turns every value that is not a key of the mapping into NaN, so
# validate the result before overwriting the column instead of silently
# carrying a partially-NaN target into model training.
target = train['Heart Disease'].map({'Absence': 0, 'Presence': 1})
if target.isna().any():
    bad_labels = train.loc[target.isna(), 'Heart Disease'].unique()
    raise ValueError(f"Unexpected 'Heart Disease' labels in train.csv: {bad_labels!r}")
train['Heart Disease'] = target

# Original features (no id, no target)
X_orig = train.drop(['id', 'Heart Disease'], axis=1)
y = train['Heart Disease']
X_test_orig = test.drop(['id'], axis=1)
test_ids = test['id']

# Clean column names (replace spaces and hyphens with underscores)
X_orig.columns = X_orig.columns.str.replace(' ', '_').str.replace('-', '_')
X_test_orig.columns = X_test_orig.columns.str.replace(' ', '_').str.replace('-', '_')

# The frames are later converted to plain numpy arrays, where features are
# matched by position only; a differing column set or order between train and
# test would corrupt predictions without raising, so check it here.
if list(X_orig.columns) != list(X_test_orig.columns):
    raise ValueError(
        "Train/test feature columns differ: "
        f"{list(X_orig.columns)} vs {list(X_test_orig.columns)}"
    )

# --- Add manual features (interactions & ratios) ---
def add_manual_features(df):
    """Return a new frame with six hand-crafted columns appended.

    The input frame is left untouched. Four pairwise products and two
    per-age ratios are added, in this order: ``thal_x_chest``,
    ``exang_x_vessels``, ``stdep_x_slope``, ``age_x_maxhr``, ``bp_per_age``,
    ``chol_per_age``.

    Args:
        df: DataFrame containing the columns ``Thallium``,
            ``Chest_pain_type``, ``Exercise_angina``,
            ``Number_of_vessels_fluro``, ``ST_depression``, ``Slope_of_ST``,
            ``Age``, ``Max_HR``, ``BP`` and ``Cholesterol``.

    Returns:
        A copy of ``df`` with the six extra feature columns.
    """
    # Both ratios share the same denominator: age shifted by one.
    age_plus_one = df['Age'] + 1
    return df.assign(
        thal_x_chest=df['Thallium'] * df['Chest_pain_type'],
        exang_x_vessels=df['Exercise_angina'] * df['Number_of_vessels_fluro'],
        stdep_x_slope=df['ST_depression'] * df['Slope_of_ST'],
        age_x_maxhr=df['Age'] * df['Max_HR'],
        bp_per_age=df['BP'] / age_plus_one,
        chol_per_age=df['Cholesterol'] / age_plus_one,
    )

# Run the same feature engineering over train and test so both end up with
# the same set of columns.
X, X_test = (add_manual_features(frame) for frame in (X_orig, X_test_orig))

# Shape check: the recorded run printed (630000, 19) for train and
# (270000, 19) for test.
print(f"Train shape: {X.shape}, Test shape: {X_test.shape}")

Mounted at /content/drive
Train shape: (630000, 19), Test shape: (270000, 19)


In [None]:
! pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import numpy as np

# Hand the models plain numpy arrays rather than DataFrames:
# float32 feature matrices and a flat 1-D target vector.
X_np = X.to_numpy().astype(np.float32)
y_np = y.to_numpy().ravel()
X_test_np = X_test.to_numpy().astype(np.float32)

def objective(trial):
    """Score one sampled LightGBM configuration for Optuna.

    Draws a hyperparameter set from ``trial``, builds a 500-tree
    ``LGBMClassifier`` with it and evaluates that model with 3-fold
    cross-validation on the module-level ``X_np`` / ``y_np`` arrays.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean of the per-fold ``roc_auc`` scores.
    """
    # (name, sampler, low, high) — values are drawn in exactly this order.
    search_space = (
        ('learning_rate', trial.suggest_float, 0.01, 0.2),
        ('num_leaves', trial.suggest_int, 20, 100),
        ('max_depth', trial.suggest_int, 3, 12),
        ('feature_fraction', trial.suggest_float, 0.5, 1.0),
        ('bagging_fraction', trial.suggest_float, 0.5, 1.0),
        ('bagging_freq', trial.suggest_int, 1, 10),
        ('min_child_samples', trial.suggest_int, 5, 50),
        ('reg_alpha', trial.suggest_float, 0.0, 1.0),
        ('reg_lambda', trial.suggest_float, 0.0, 1.0),
    )
    sampled = {name: suggest(name, low, high) for name, suggest, low, high in search_space}

    classifier = lgb.LGBMClassifier(
        **sampled,
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )
    fold_scores = cross_val_score(classifier, X_np, y_np, cv=3, scoring='roc_auc')
    return np.mean(fold_scores)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# when the notebook is rerun (an unseeded study proposes different trials each
# time, so "best params" could not be reproduced).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# 30 trials * 3 folds = 90 model fits. In the recorded run the logged trials
# took roughly 1.5-3.5 minutes each, so budget well over an hour.
study.optimize(objective, n_trials=30)

print("Best params:", study.best_params)
print("Best CV:", study.best_value)

[I 2026-02-27 21:43:05,111] A new study created in memory with name: no-name-1b15779d-ef9d-4a41-9460-c58bfb0a88aa
[I 2026-02-27 21:44:43,267] Trial 0 finished with value: 0.9545637184872188 and parameters: {'learning_rate': 0.15461066072911364, 'num_leaves': 23, 'max_depth': 7, 'feature_fraction': 0.8865152440160899, 'bagging_fraction': 0.8176253734220291, 'bagging_freq': 9, 'min_child_samples': 44, 'reg_alpha': 0.17468659603176395, 'reg_lambda': 0.7277949176126105}. Best is trial 0 with value: 0.9545637184872188.
[I 2026-02-27 21:48:01,724] Trial 1 finished with value: 0.9548774251289011 and parameters: {'learning_rate': 0.04224612855129348, 'num_leaves': 86, 'max_depth': 8, 'feature_fraction': 0.8612982182868065, 'bagging_fraction': 0.7883953356568714, 'bagging_freq': 4, 'min_child_samples': 33, 'reg_alpha': 0.4615271527318836, 'reg_lambda': 0.20471246512612395}. Best is trial 1 with value: 0.9548774251289011.
[I 2026-02-27 21:50:55,095] Trial 2 finished with value: 0.954406408661852

Best params: {'learning_rate': 0.14178801801933952, 'num_leaves': 28, 'max_depth': 3, 'feature_fraction': 0.6775517777773777, 'bagging_fraction': 0.9179109086797103, 'bagging_freq': 4, 'min_child_samples': 41, 'reg_alpha': 0.5704863720785183, 'reg_lambda': 0.3452364749193629}
Best CV: 0.9553334725597663


In [None]:
import xgboost as xgb

# --- Final LightGBM: tuned hyperparameters, refit on the full training set ---
# The tree count is fixed here (no early stopping is used).
# NOTE(review): the Optuna objective scored these params with
# n_estimators=500, while the final model uses 1000 — confirm this is intended.
best_lgb_params = {**study.best_params, 'n_estimators': 1000}

lgb_final = lgb.LGBMClassifier(**best_lgb_params, random_state=42, n_jobs=-1, verbose=-1)
lgb_final.fit(X_np, y_np)

# Probability of the positive class (column 1) for every test row
lgb_pred = lgb_final.predict_proba(X_test_np)[:, 1]

# --- XGBoost partner model for the blend ---
# Retrained from hard-coded hyperparameters found in an earlier tuning run
# (per the author's note, the configuration that gave 0.95356).
best_xgb_params = dict(
    learning_rate=0.150165,
    max_depth=4,
    subsample=0.9845,
    colsample_bytree=0.5068,
    min_child_weight=7,
    gamma=0.3314,
    reg_alpha=0.9975,
    reg_lambda=2.5372,
)

xgb_model = xgb.XGBClassifier(**best_xgb_params, n_estimators=1000, random_state=42, n_jobs=-1)
xgb_model.fit(X_np, y_np)
xgb_pred = xgb_model.predict_proba(X_test_np)[:, 1]

# --- Blend: unweighted mean of the two probability vectors ---
blend_pred = (lgb_pred + xgb_pred) / 2

# --- Submission file, then browser download from Colab ---
submission = pd.DataFrame({'id': test_ids, 'Heart Disease': blend_pred})
submission.to_csv('final_blend.csv', index=False)

from google.colab import files
files.download('final_blend.csv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

# Prerequisites: X_np, y_np, X_test_np and test_ids must already be defined.
# If they are not, rerun the data-prep cell and the numpy-conversion cell first.

# XGBoost hyperparameters taken from an earlier Optuna search
best_xgb_params = dict(
    learning_rate=0.150165,
    max_depth=4,
    subsample=0.9845,
    colsample_bytree=0.5068,
    min_child_weight=7,
    gamma=0.3314,
    reg_alpha=0.9975,
    reg_lambda=2.5372,
)

# Same configuration trained once per seed; only random_state changes.
seeds = [42, 123, 456]
predictions = []

for i, seed in enumerate(seeds):
    print(f"Training XGBoost {i+1} with seed {seed}...")
    model = xgb.XGBClassifier(
        n_estimators=1000,
        random_state=seed,
        n_jobs=-1,
        eval_metric='auc',
        **best_xgb_params,
    )
    model.fit(X_np, y_np)
    # Keep only the positive-class probability for each test row
    pred = model.predict_proba(X_test_np)[:, 1]
    predictions.append(pred)

# Element-wise mean across the per-seed prediction vectors
avg_pred = np.stack(predictions).mean(axis=0)

# Write the submission file and trigger the Colab browser download
submission = pd.DataFrame({'id': test_ids, 'Heart Disease': avg_pred})
submission.to_csv('xgb_ensemble_3.csv', index=False)

from google.colab import files
files.download('xgb_ensemble_3.csv')

Training XGBoost 1 with seed 42...
Training XGBoost 2 with seed 123...
Training XGBoost 3 with seed 456...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>