In [1]:
import sys
print(sys.executable)


C:\Users\linus\AppData\Local\Programs\Python\Python311\python.exe


In [2]:
import numpy as np
print(np.__version__)


1.26.4


In [3]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y = train["Tm"].values
smiles_train = train["SMILES"].tolist()
smiles_test = test["SMILES"].tolist()
# Liste aller RDKit-Deskriptoren
all_desc = Descriptors._descList
desc_names = [d[0] for d in all_desc]
desc_funcs = [d[1] for d in all_desc]

len(desc_names), desc_names[:10]
def featurize_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fallback: 0-Features
        return np.zeros(len(desc_names) + 2048, dtype=float)

    # RDKit Deskriptoren
    desc_values = []
    for f in desc_funcs:
        try:
            v = f(mol)
        except Exception:
            v = np.nan
        desc_values.append(v)

    desc_values = np.array(desc_values, dtype=float)

    # Missing durch 0 ersetzen (oder median, je nach Geschmack)
    desc_values = np.nan_to_num(desc_values, nan=0.0)

    # Morgan Fingerprint
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    fp_arr = np.array(fp, dtype=float)

    return np.concatenate([desc_values, fp_arr])
X_train = np.vstack([featurize_mol(s) for s in smiles_train])
X_test  = np.vstack([featurize_mol(s) for s in smiles_test])

X_train.shape, X_test.shape
def cv_model(model_name, model_builder, X, y, X_test, n_splits=5, random_state=42):
    from xgboost.callback import EarlyStopping as XgbEarlyStopping

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_pred = np.zeros(len(y))
    test_pred_folds = []
    models = []

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):
        print(f"\n===== {model_name} | Fold {fold} / {n_splits} =====")
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        model = model_builder()

        # ---------------------------------------------------------
        # LIGHTGBM (new API â€“ ONLY LightGBM callbacks)
        # ---------------------------------------------------------
        if model_name == "LightGBM":
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                callbacks=[
                    lgb.early_stopping(50),
                    lgb.log_evaluation(period=0)
                ]
            )

        # ---------------------------------------------------------
        # XGBOOST (new API â€“ ONLY XGBoost callbacks)
        # ---------------------------------------------------------
        elif model_name == "XGBoost":
            model.fit(X_tr, y_tr)


        # ---------------------------------------------------------
        # CATBOOST (native early stopping)
        # ---------------------------------------------------------
        elif model_name == "CatBoost":
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                use_best_model=True,
                verbose=False
            )

        else:
            raise ValueError("Unknown model type!")

        # prediction
        val_pred = model.predict(X_val)
        fold_mae = mean_absolute_error(y_val, val_pred)
        print(f"{model_name} Fold {fold} MAE: {fold_mae:.4f}")

        oof_pred[val_idx] = val_pred
        test_pred_folds.append(model.predict(X_test))
        models.append(model)

    # final results
    oof_mae = mean_absolute_error(y, oof_pred)
    print(f"\n>>> {model_name} OOF MAE: {oof_mae:.4f}")

    return models, oof_pred, np.mean(test_pred_folds, axis=0), oof_mae



def build_lgbm():
    params = {
        "objective": "regression_l1",
        "metric": "l1",
        "learning_rate": 0.03,
        "num_leaves": 48,
        "max_depth": -1,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.8,
        "bagging_freq": 3,
        "min_data_in_leaf": 50,
        "lambda_l1": 1.0,
        "lambda_l2": 1.0,
        "verbosity": -1,
        "n_estimators": 2000,
    }
    return lgb.LGBMRegressor(**params)

def build_xgb():
    params = {
        "objective": "reg:squarederror",
        "learning_rate": 0.03,
        "max_depth": 7,
        "subsample": 0.85,
        "colsample_bytree": 0.85,
        "n_estimators": 1500,
        "reg_alpha": 0.3,
        "reg_lambda": 1.0,
        "tree_method": "hist",
        "gamma": 0.0,
    }
    return xgb.XGBRegressor(**params)

def build_cat():
    return CatBoostRegressor(
        loss_function="MAE",
        learning_rate=0.03,
        depth=8,
        l2_leaf_reg=5.0,
        iterations=1500,
        border_count=128,
        random_seed=42,
        verbose=False
    )


models_lgbm, oof_lgbm, test_lgbm, mae_lgbm = cv_model("LightGBM", build_lgbm, X_train, y, X_test, n_splits=5)
models_xgb, oof_xgb, test_xgb, mae_xgb = cv_model("XGBoost", build_xgb, X_train, y, X_test, n_splits=5)
models_cat, oof_cat, test_cat, mae_cat = cv_model("CatBoost", build_cat, X_train, y, X_test, n_splits=5)
oof_ens_simple = (oof_lgbm + oof_xgb + oof_cat) / 3.0
mae_ens_simple = mean_absolute_error(y, oof_ens_simple)
print("\nEnsemble (simple average) OOF MAE:", mae_ens_simple)
oof_ens_weighted = 0.4 * oof_lgbm + 0.3 * oof_xgb + 0.3 * oof_cat
mae_ens_weighted = mean_absolute_error(y, oof_ens_weighted)
print("Ensemble (weighted) OOF MAE:", mae_ens_weighted)
test_ens_simple = (test_lgbm + test_xgb + test_cat) / 3.0
test_ens_weighted = 0.4 * test_lgbm + 0.3 * test_xgb + 0.3 * test_cat
sample = pd.read_csv("Submissions/sample_submission.csv")

# Einzelmodelle
sub_lgbm = sample.copy()
sub_lgbm["Tm"] = test_lgbm
sub_lgbm.to_csv("Submissions/submission_lgbm_rdkit.csv", index=False)

sub_xgb = sample.copy()
sub_xgb["Tm"] = test_xgb
sub_xgb.to_csv("Submissions/submission_xgb_rdkit.csv", index=False)

sub_cat = sample.copy()
sub_cat["Tm"] = test_cat
sub_cat.to_csv("Submissions/submission_cat_rdkit.csv", index=False)

# Ensembles
sub_ens_simple = sample.copy()
sub_ens_simple["Tm"] = test_ens_simple
sub_ens_simple.to_csv("Submissions/submission_ens_simple_rdkit.csv", index=False)

sub_ens_weighted = sample.copy()
sub_ens_weighted["Tm"] = test_ens_weighted
sub_ens_weighted.to_csv("Submissions/submission_ens_weighted_rdkit.csv", index=False)
def objective_lgbm(trial):
    params = {
        "learning_rate": trial.suggest_float("lr", 0.01, 0.1, log=True),
        "num_leaves":    trial.suggest_int("leaves", 24, 96),
        "max_depth":     trial.suggest_int("max_depth", 3, 10),
        "feature_fraction": trial.suggest_float("ff", 0.7, 1.0),
        "bagging_fraction": trial.suggest_float("bf", 0.7, 1.0),
        "bagging_freq":     trial.suggest_int("bfreq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("minleaf", 20, 150),
        "lambda_l1":        trial.suggest_float("l1", 1e-8, 10.0, log=True),
        "lambda_l2":        trial.suggest_float("l2", 1e-8, 10.0, log=True),
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_pred = np.zeros(len(y))

    for tr_idx, val_idx in kf.split(X_train, y):
        X_tr_k, X_val_k = X_train[tr_idx], X_train[val_idx]
        y_tr_k, y_val_k = y[tr_idx], y[val_idx]

        model = lgb.LGBMRegressor(
            objective="regression_l1",
            metric="l1",
            n_estimators=2000,
            verbosity=-1,
            learning_rate=params["learning_rate"],
            num_leaves=params["num_leaves"],
            max_depth=params["max_depth"],
            feature_fraction=params["feature_fraction"],
            bagging_fraction=params["bagging_fraction"],
            bagging_freq=params["bagging_freq"],
            min_data_in_leaf=params["min_data_in_leaf"],
            lambda_l1=params["lambda_l1"],
            lambda_l2=params["lambda_l2"],
        )

        model.fit(
            X_tr_k, y_tr_k,
            eval_set=[(X_val_k, y_val_k)],
            early_stopping_rounds=50,
            verbose=False,
        )
        oof_pred[val_idx] = model.predict(X_val_k)

    mae_cv = mean_absolute_error(y, oof_pred)
    return mae_cv
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
study = optuna.create_study(direction="minimize", pruner=pruner)
study.optimize(objective_lgbm, timeout=60)

print("Best LGBM CV MAE:", study.best_value)
print("Best params:", study.best_params)


===== LightGBM | Fold 1 / 5 =====
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1994]	valid_0's l1: 30.9591
LightGBM Fold 1 MAE: 30.9591

===== LightGBM | Fold 2 / 5 =====
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 28.6413
LightGBM Fold 2 MAE: 28.6413

===== LightGBM | Fold 3 / 5 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1452]	valid_0's l1: 28.1396
LightGBM Fold 3 MAE: 28.1396

===== LightGBM | Fold 4 / 5 =====
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1999]	valid_0's l1: 29.4617
LightGBM Fold 4 MAE: 29.4617

===== LightGBM | Fold 5 / 5 =====
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's l1: 29.3733
LightGBM Fold 5 MAE: 29.3733

>>

[W 2025-11-30 13:43:04,667] Trial 0 failed with parameters: {'lr': 0.048450228693825016, 'leaves': 91, 'max_depth': 10, 'ff': 0.8758074557214812, 'bf': 0.9062005131143853, 'bfreq': 4, 'minleaf': 40, 'l1': 6.626885735826981, 'l2': 6.949893988149936e-05} because of the following error: TypeError("LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\linus\AppData\Local\Temp\ipykernel_17564\2689862664.py", line 242, in objective_lgbm
    model.fit(
TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'
[W 2025-11-30 13:43:04,726] Trial 0 failed with value None.


TypeError: LGBMRegressor.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [4]:
# ============================================================
# LEVEL-2 STACKING (Meta-Model fÃ¼r beste Kaggle-Performance)
# ============================================================

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold


# Level-1 OOF Features
X_stack = np.vstack([oof_lgbm, oof_xgb, oof_cat]).T
X_stack_test = np.vstack([test_lgbm, test_xgb, test_cat]).T

print("Shape Stacking train:", X_stack.shape)
print("Shape Stacking test:", X_stack_test.shape)

# ------------------------------------------------------------
# 1) Ridge (sehr stabil fÃ¼r Stacking)
# ------------------------------------------------------------

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_ridge = np.zeros(len(y))
test_ridge_folds = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_stack), 1):
    X_tr, X_val = X_stack[tr_idx], X_stack[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    model_ridge = Ridge(alpha=0.1)
    model_ridge.fit(X_tr, y_tr)

    # Predictions
    oof_ridge[val_idx] = model_ridge.predict(X_val)
    test_ridge_folds.append(model_ridge.predict(X_stack_test))

    print(f"Ridge Stacking Fold {fold} MAE:",
          mean_absolute_error(y_val, oof_ridge[val_idx]))

ridge_mae = mean_absolute_error(y, oof_ridge)
print("\nRidge Stacking OOF MAE:", ridge_mae)

test_ridge = np.mean(test_ridge_folds, axis=0)

# Save stacking submission
sub_stack_ridge = sample.copy()
sub_stack_ridge["Tm"] = test_ridge
sub_stack_ridge.to_csv("Submissions/submission_stack_ridge.csv", index=False)
print("Saved submission_stack_ridge.csv")


# ------------------------------------------------------------
# OPTIONAL: 2) Lasso Stacking (tends to use fewer signals)
# ------------------------------------------------------------

oof_lasso = np.zeros(len(y))
test_lasso_folds = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_stack), 1):
    X_tr, X_val = X_stack[tr_idx], X_stack[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    model_lasso = Lasso(alpha=0.001)
    model_lasso.fit(X_tr, y_tr)

    oof_lasso[val_idx] = model_lasso.predict(X_val)
    test_lasso_folds.append(model_lasso.predict(X_stack_test))

    print(f"Lasso Stacking Fold {fold} MAE:",
          mean_absolute_error(y_val, oof_lasso[val_idx]))

lasso_mae = mean_absolute_error(y, oof_lasso)
print("\nLasso Stacking OOF MAE:", lasso_mae)

test_lasso = np.mean(test_lasso_folds, axis=0)

sub_stack_lasso = sample.copy()
sub_stack_lasso["Tm"] = test_lasso
sub_stack_lasso.to_csv("Submissions/submission_stack_lasso.csv", index=False)
print("Saved submission_stack_lasso.csv")

Shape Stacking train: (2662, 3)
Shape Stacking test: (666, 3)
Ridge Stacking Fold 1 MAE: 28.150086167260266
Ridge Stacking Fold 2 MAE: 26.9482157070548
Ridge Stacking Fold 3 MAE: 27.683435511399264
Ridge Stacking Fold 4 MAE: 27.840998889533953
Ridge Stacking Fold 5 MAE: 26.696597215815924

Ridge Stacking OOF MAE: 27.463930773523842
Saved submission_stack_ridge.csv
Lasso Stacking Fold 1 MAE: 28.150085346317884
Lasso Stacking Fold 2 MAE: 26.948214474626138
Lasso Stacking Fold 3 MAE: 27.683438186931703
Lasso Stacking Fold 4 MAE: 27.84100063810828
Lasso Stacking Fold 5 MAE: 26.696596212976058

Lasso Stacking OOF MAE: 27.463931046126106
Saved submission_stack_lasso.csv


In [5]:
# ============================
# STACKING META-MODEL (A)
# ============================
import numpy as np
import pandas as pd
X_meta = np.vstack([oof_lgbm, oof_xgb, oof_cat]).T
X_meta_test = np.vstack([test_lgbm, test_xgb, test_cat]).T

from sklearn.linear_model import Ridge

meta = Ridge(alpha=1.0)
meta.fit(X_meta, y)

meta_oof_pred = meta.predict(X_meta)
meta_test_pred = meta.predict(X_meta_test)

mae_meta = mean_absolute_error(y, meta_oof_pred)
print("\nðŸš€ Stacking Meta-Model MAE:", mae_meta)

# Save prediction
sub_stack = sample.copy()
sub_stack["Tm"] = meta_test_pred
sub_stack.to_csv("Submissions/submission_stacking_ridge_meta.csv", index=False)


ðŸš€ Stacking Meta-Model MAE: 27.307076424565917


In [None]:
# ======================================================
#     XGBOOST ONLY â€“ CLEAN STRONG BASELINE MODEL
# ======================================================
from IPython.display import Audio, display

def cv_xgb_only(X, y, X_test, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_pred = np.zeros(len(y))
    test_pred_folds = []

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):
        print(f"\n===== XGBOOST ONLY | Fold {fold} / {n_splits} =====")
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        model = xgb.XGBRegressor(
            objective="reg:squarederror",
            learning_rate=0.03,
            max_depth=6,
            subsample=0.85,
            colsample_bytree=0.85,
            n_estimators=10000,
            reg_alpha=0.3,
            reg_lambda=1.0,
            tree_method="hist",
            gamma=0.0,)

        # IMPORTANT: no early stopping for XGB >=2.0 sklearn API
        model.fit(X_tr, y_tr)

        pred_val = model.predict(X_val)
        fold_mae = mean_absolute_error(y_val, pred_val)
        print(f"Fold MAE: {fold_mae:.4f}")
        display(Audio("C:/Windows/Media/Windows Ding.wav", autoplay=True))
        oof_pred[val_idx] = pred_val
        test_pred_folds.append(model.predict(X_test))

    oof_mae = mean_absolute_error(y, oof_pred)
    print("\n>>> XGB ONLY OOF MAE:", oof_mae)

    test_pred_mean = np.mean(test_pred_folds, axis=0)
    return oof_pred, test_pred_mean, oof_mae


# Run XGB baseline
oof_xgb_solo, test_xgb_solo, mae_xgb_solo = cv_xgb_only(X_train, y, X_test)
print("DONE.")


===== XGBOOST ONLY | Fold 1 / 5 =====


In [None]:
# ======================================================
#     STACKING: XGB (Level-1) + Ridge (Level-2)
# ======================================================

from sklearn.linear_model import Ridge

# Level-1: XGB OOF (from cv_xgb_only)
X_stack = oof_xgb_solo.reshape(-1, 1)
X_stack_test = test_xgb_solo.reshape(-1, 1)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_meta = np.zeros(len(y))
test_meta_folds = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_stack), 1):
    print(f"STACKING Fold {fold}/5")

    X_tr, X_val = X_stack[tr_idx], X_stack[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    meta = Ridge(alpha=0.1)
    meta.fit(X_tr, y_tr)

    oof_meta[val_idx] = meta.predict(X_val)
    test_meta_folds.append(meta.predict(X_stack_test))

meta_mae = mean_absolute_error(y, oof_meta)
test_meta_pred = np.mean(test_meta_folds, axis=0)

print("\n>>> STACKING XGB+Ridge OOF MAE:", meta_mae)

# SUBMISSION: XGB ONLY
sub_xgb_solo = sample.copy()
sub_xgb_solo["Tm"] = test_xgb_solo
sub_xgb_solo.to_csv("Submissions/submission_xgb_only.csv", index=False)

# SUBMISSION: XGB + Ridge Stacking
sub_xgb_stack = sample.copy()
sub_xgb_stack["Tm"] = test_meta_pred
sub_xgb_stack.to_csv("Submissions/submission_xgb_stacked.csv", index=False)