In [2]:
# Sanity check: show which Python interpreter this kernel is running on.
import sys
print(sys.executable)


C:\Users\linus\AppData\Local\Programs\Python\Python311\python.exe


In [3]:
# Sanity check: report the NumPy version available in this environment.
import numpy as np
print(np.__version__)


1.26.4


In [None]:
import warnings

import numpy as np
import pandas as pd

import lightgbm as lgb
import optuna
import xgboost as xgb
from catboost import CatBoostRegressor
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

warnings.filterwarnings("ignore", message="X does not have valid feature names")

# ---------------------------
# Data loading + RDKit descriptor registry
# ---------------------------

# Both CSV files provide a "SMILES" column; the training file also holds the target "Tm".
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

y = train["Tm"].values
smiles_train = train["SMILES"].tolist()
smiles_test = test["SMILES"].tolist()

# RDKit descriptor registry, split into parallel lists of names and callables.
all_desc = Descriptors._descList
desc_names = [name for name, _ in all_desc]
desc_funcs = [func for _, func in all_desc]

# ---------------------------
# Feature Engineering
# ---------------------------

def featurize_mol(smiles, n_bits=2048, radius=2):
    """Turn one SMILES string into a flat numeric feature vector.

    The vector is the concatenation of every RDKit descriptor listed in the
    module-level ``desc_funcs`` followed by a Morgan fingerprint bit vector.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    n_bits : int, default 2048
        Length of the Morgan fingerprint. The default matches the 2048-column
        fingerprint slice that the rest of the notebook hard-codes.
    radius : int, default 2
        Morgan fingerprint radius.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(desc_names) + n_bits``. An all-zero
        vector is returned when ``smiles`` is not a string (e.g. a missing
        value read by pandas) or when RDKit cannot parse it.
    """
    n_features = len(desc_names) + n_bits

    # Missing CSV cells arrive as float NaN rather than str; treat them like an
    # unparsable SMILES instead of handing a non-string to the RDKit parser.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return np.zeros(n_features, dtype=float)

    # RDKit chemical descriptors; a descriptor that fails becomes NaN.
    desc_values = []
    for func in desc_funcs:
        try:
            value = func(mol)
        except Exception:
            # `Exception` instead of a bare except, so a kernel interrupt
            # (KeyboardInterrupt) is not silently swallowed mid-featurization.
            value = np.nan
        desc_values.append(value)

    # nan_to_num: NaN -> 0.0, +/-inf -> largest/smallest finite float (NumPy defaults).
    desc_values = np.nan_to_num(np.array(desc_values, dtype=float))

    # Morgan fingerprint as a fixed-length bit vector.
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    fp_arr = np.array(fp, dtype=float)

    return np.concatenate([desc_values, fp_arr])


# Build X_train AND X_test: one feature row per SMILES string.
X_train = np.vstack(list(map(featurize_mol, smiles_train)))
X_test = np.vstack(list(map(featurize_mol, smiles_test)))

print("Original Feature Form:", X_train.shape)   # e.g. (n, 2248)

# ---------------------------
# PCA on the fingerprint columns ONLY
# ---------------------------

# The fingerprint occupies the last 2048 columns; everything before it is
# the descriptor part.
DESC_train, FP_train = X_train[:, :-2048], X_train[:, -2048:]
DESC_test, FP_test = X_test[:, :-2048], X_test[:, -2048:]

# Compress 2048 fingerprint bits -> 128 components. The PCA is fitted on the
# training fingerprints and then applied unchanged to the test fingerprints.
pca = PCA(n_components=128, random_state=42)
FP_train_pca = pca.fit_transform(FP_train)
FP_test_pca = pca.transform(FP_test)

# ---------------------------
# Join descriptors with the PCA-reduced fingerprints
# ---------------------------

X_train_pca = np.concatenate([DESC_train, FP_train_pca], axis=1)
X_test_pca = np.concatenate([DESC_test, FP_test_pca], axis=1)

print("Neue Feature-Form:", X_train_pca.shape)

def cv_model(model_name, model_builder, X, y, X_test_pca, n_splits=5, random_state=42):
    """K-fold cross-validate one model family and average its test predictions.

    Parameters
    ----------
    model_name : str
        Label used in the progress output.
    model_builder : callable
        Zero-argument factory returning a fresh, unfitted regressor.
    X, y : array-like
        Training features and targets; both are indexed with integer index
        arrays (``X[idx]``), so NumPy arrays are expected.
    X_test_pca : array-like
        Test features; every fold model predicts on them.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the shuffled ``KFold`` split.

    Returns
    -------
    tuple
        ``(models, oof_pred, test_pred_mean, oof_mae)``: the fitted fold
        models, out-of-fold predictions for every training row, the mean of
        the per-fold test predictions, and the MAE of the OOF predictions.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_pred = np.zeros(len(y))
    fitted_models, per_fold_test = [], []

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y), start=1):
        print(f"\n===== {model_name} | Fold {fold} / {n_splits} =====")

        model = model_builder()
        model.fit(X[fit_rows], y[fit_rows])

        holdout_pred = model.predict(X[holdout_rows])
        fold_mae = mean_absolute_error(y[holdout_rows], holdout_pred)
        print(f"{model_name} Fold {fold} MAE: {fold_mae:.4f}")

        # Each training row is held out exactly once, so this fills oof_pred completely.
        oof_pred[holdout_rows] = holdout_pred
        per_fold_test.append(model.predict(X_test_pca))
        fitted_models.append(model)

    oof_mae = mean_absolute_error(y, oof_pred)
    print(f"\n>>> {model_name} OOF MAE: {oof_mae:.4f}")

    return fitted_models, oof_pred, np.mean(per_fold_test, axis=0), oof_mae
def build_lgbm():
    """Return a fresh, unfitted LightGBM regressor with the notebook's fixed settings."""
    return lgb.LGBMRegressor(
        objective="regression_l1",  # L1 loss, i.e. MAE
        metric="l1",
        learning_rate=0.03,
        num_leaves=64,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=3,
        min_data_in_leaf=30,
        verbosity=-1,
        n_estimators=500,
    )
def build_xgb():
    """Return a fresh, unfitted XGBoost regressor with the notebook's fixed settings."""
    settings = dict(
        objective="reg:squarederror",
        learning_rate=0.03,
        max_depth=12,
        subsample=0.8,
        colsample_bytree=0.8,
        n_estimators=600,
        reg_alpha=0.0,
        reg_lambda=1.0,
        tree_method="hist",
    )
    regressor = xgb.XGBRegressor(**settings)
    return regressor
def build_cat():
    """Return a fresh, unfitted CatBoost regressor trained on MAE.

    CatBoost is told not to write any training artefacts to disk, so no
    ``catboost_info`` directory is created by the cross-validation runs.
    """
    params = {
        "loss_function": "MAE",
        "learning_rate": 0.03,
        "depth": 12,
        "l2_leaf_reg": 3.0,
        "border_count": 128,
        "iterations": 600,
        "verbose": False,
        "random_seed": 42,
        # `train_dir=None` only selects the default output directory and still
        # writes catboost_info; `allow_writing_files=False` is the switch that
        # actually disables those files.
        "allow_writing_files": False,
    }
    return CatBoostRegressor(**params)

# --- 5-fold CV for each base learner on the same feature matrices ---
_cv_inputs = dict(X=X_train_pca, y=y, X_test_pca=X_test_pca, n_splits=5)

models_lgbm, oof_lgbm, test_lgbm, mae_lgbm = cv_model("LightGBM", build_lgbm, **_cv_inputs)
models_xgb, oof_xgb, test_xgb, mae_xgb = cv_model("XGBoost", build_xgb, **_cv_inputs)
models_cat, oof_cat, test_cat, mae_cat = cv_model("CatBoost", build_cat, **_cv_inputs)

# --- Blend the out-of-fold predictions and score both blends ---
oof_ens_simple = (oof_lgbm + oof_xgb + oof_cat) / 3.0
mae_ens_simple = mean_absolute_error(y, oof_ens_simple)
print("\nEnsemble (simple average) OOF MAE:", mae_ens_simple)

oof_ens_weighted = 0.4 * oof_lgbm + 0.3 * oof_xgb + 0.3 * oof_cat
mae_ens_weighted = mean_absolute_error(y, oof_ens_weighted)
print("Ensemble (weighted) OOF MAE:", mae_ens_weighted)

# --- Same blends applied to the test predictions ---
test_ens_simple = (test_lgbm + test_xgb + test_cat) / 3.0
test_ens_weighted = 0.4 * test_lgbm + 0.3 * test_xgb + 0.3 * test_cat

# --- Submissions: the sample file is the template, only "Tm" is replaced ---
sample = pd.read_csv("Submissions/sample_submission.csv")

# Single models
sub_lgbm = sample.assign(Tm=test_lgbm)
sub_xgb = sample.assign(Tm=test_xgb)
sub_cat = sample.assign(Tm=test_cat)

# Ensembles
sub_ens_simple = sample.assign(Tm=test_ens_simple)
sub_ens_weighted = sample.assign(Tm=test_ens_weighted)

for _frame, _path in (
    (sub_lgbm, "Submissions/submission_lgbm_rdkit.csv"),
    (sub_xgb, "Submissions/submission_xgb_rdkit.csv"),
    (sub_cat, "Submissions/submission_cat_rdkit.csv"),
    (sub_ens_simple, "Submissions/submission_ens_simple_rdkit.csv"),
    (sub_ens_weighted, "Submissions/submission_ens_weighted_rdkit.csv"),
):
    _frame.to_csv(_path, index=False)
def objective_lgbm(trial):
    """Optuna objective: 5-fold out-of-fold MAE of a LightGBM regressor.

    Samples one hyper-parameter set from ``trial`` and evaluates it on the
    module-level ``X_train_pca`` / ``y`` with a fixed, shuffled 5-fold split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameters.

    Returns
    -------
    float
        MAE of the out-of-fold predictions over all training rows (lower is better).
    """
    # Draw order is kept fixed: lr, leaves, ff, bf, bfreq, minleaf.
    learning_rate = trial.suggest_float("lr", 0.01, 0.2, log=True)
    num_leaves = trial.suggest_int("leaves", 31, 255)
    feature_fraction = trial.suggest_float("ff", 0.6, 1.0)
    bagging_fraction = trial.suggest_float("bf", 0.6, 1.0)
    bagging_freq = trial.suggest_int("bfreq", 1, 7)
    min_data_in_leaf = trial.suggest_int("minleaf", 10, 200)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_pred = np.zeros(len(y))

    for fit_rows, holdout_rows in splitter.split(X_train_pca, y):
        model = lgb.LGBMRegressor(
            objective="regression_l1",
            metric="l1",
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            feature_fraction=feature_fraction,
            bagging_fraction=bagging_fraction,
            bagging_freq=bagging_freq,
            min_data_in_leaf=min_data_in_leaf,
            n_estimators=500,
            verbosity=-1,
        )
        model.fit(X_train_pca[fit_rows], y[fit_rows])
        oof_pred[holdout_rows] = model.predict(X_train_pca[holdout_rows])

    return mean_absolute_error(y, oof_pred)

# Hyper-parameter search for LightGBM. TPE is Optuna's default sampler; seeding
# it makes the sequence of suggested parameters repeatable. The number of
# completed trials still depends on machine speed because the search stops on
# a wall-clock timeout of 3000 seconds.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_lgbm, timeout=3000)
print("Best LGBM CV MAE:", study.best_value)
print("Best params:", study.best_params)
      
def objective_weights(trial):
    """Optuna objective: MAE of a weighted blend of four prediction vectors.

    Draws four raw weights in [0, 1], rescales them to sum to (almost exactly)
    one and scores the resulting blend of the module-level arrays
    ``oof_lgbm``, ``oof_xgb``, ``oof_cat`` and ``meta_oof_pred`` against ``y``.
    ``meta_oof_pred`` is created by the stacking meta-model cell, which must
    have been executed before this objective is evaluated.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the raw weights.

    Returns
    -------
    float
        Mean absolute error of the blended predictions (lower is better).
    """
    raw_lgbm = trial.suggest_float("w_lgbm", 0.0, 1.0)
    raw_xgb = trial.suggest_float("w_xgb", 0.0, 1.0)
    raw_cat = trial.suggest_float("w_cat", 0.0, 1.0)
    raw_stack = trial.suggest_float("w_stack", 0.0, 1.0)

    # The small epsilon keeps the division defined if every raw weight is 0.
    total = raw_lgbm + raw_xgb + raw_cat + raw_stack + 1e-9
    share_lgbm = raw_lgbm / total
    share_xgb = raw_xgb / total
    share_cat = raw_cat / total
    share_stack = raw_stack / total

    blended = (
        share_lgbm * oof_lgbm
        + share_xgb * oof_xgb
        + share_cat * oof_cat
        + share_stack * meta_oof_pred
    )
    return mean_absolute_error(y, blended)

# The weight search blends in `meta_oof_pred` / `meta_test_pred`, which are
# created by the "STACKING META-MODEL" cell. Fail early with a clear message
# instead of a NameError from inside the first Optuna trial.
_missing_meta = [
    name for name in ("meta_oof_pred", "meta_test_pred") if name not in globals()
]
if _missing_meta:
    raise NameError(
        f"{', '.join(_missing_meta)} not defined - run the stacking meta-model cell first"
    )

# Seeded TPE sampler (Optuna's default algorithm) so the search is repeatable.
study_w = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_w.optimize(objective_weights, n_trials=200)

print("Best ensemble weights:", study_w.best_params)
wp = study_w.best_params
w1, w2, w3, w4 = wp["w_lgbm"], wp["w_xgb"], wp["w_cat"], wp["w_stack"]

# Normalise exactly as objective_weights does (same epsilon), so the test blend
# uses the weights that were actually scored and cannot divide by zero.
s = w1 + w2 + w3 + w4 + 1e-9
w1, w2, w3, w4 = w1/s, w2/s, w3/s, w4/s

final_preds = (
    w1 * test_lgbm +
    w2 * test_xgb +
    w3 * test_cat +
    w4 * meta_test_pred
)

sub_weighted = sample.copy()
sub_weighted["Tm"] = final_preds
sub_weighted.to_csv("Submissions/submission_stacked_optuna_weights.csv", index=False)

In [None]:
# ============================================================
# LEVEL-2 STACKING (linear meta-models on the base-model predictions)
# ============================================================

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold


def _stack_cv(make_model, label, X, y, X_test, splitter):
    """Cross-validate one meta-model on the level-1 predictions.

    Parameters
    ----------
    make_model : callable
        Zero-argument factory returning a fresh, unfitted regressor.
    label : str
        Model name used in the per-fold progress line.
    X, y : numpy.ndarray
        Level-1 out-of-fold prediction matrix and the true targets.
    X_test : numpy.ndarray
        Level-1 test prediction matrix; every fold model predicts on it.
    splitter : sklearn.model_selection.KFold
        Splitter that provides the folds.

    Returns
    -------
    tuple
        ``(oof, test_folds)``: out-of-fold predictions for every row of ``X``
        and a list holding one test-prediction array per fold.
    """
    oof = np.zeros(len(y))
    test_folds = []

    for fold, (tr_idx, val_idx) in enumerate(splitter.split(X), 1):
        model = make_model()
        model.fit(X[tr_idx], y[tr_idx])

        oof[val_idx] = model.predict(X[val_idx])
        test_folds.append(model.predict(X_test))

        print(f"{label} Stacking Fold {fold} MAE:",
              mean_absolute_error(y[val_idx], oof[val_idx]))

    return oof, test_folds


# Level-1 features: one column per base model (LightGBM, XGBoost, CatBoost).
X_stack = np.vstack([oof_lgbm, oof_xgb, oof_cat]).T
X_stack_test = np.vstack([test_lgbm, test_xgb, test_cat]).T

print("Shape Stacking train:", X_stack.shape)
print("Shape Stacking test:", X_stack_test.shape)

# Fixed integer seed: every call to kf.split() yields the same folds, so the
# Ridge and Lasso meta-models are evaluated on identical splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ------------------------------------------------------------
# 1) Ridge stacking
# ------------------------------------------------------------

oof_ridge, test_ridge_folds = _stack_cv(
    lambda: Ridge(alpha=0.1), "Ridge", X_stack, y, X_stack_test, kf
)

ridge_mae = mean_absolute_error(y, oof_ridge)
print("\nRidge Stacking OOF MAE:", ridge_mae)

test_ridge = np.mean(test_ridge_folds, axis=0)

# Save stacking submission
sub_stack_ridge = sample.copy()
sub_stack_ridge["Tm"] = test_ridge
sub_stack_ridge.to_csv("Submissions/submission_stack_ridge.csv", index=False)
print("Saved submission_stack_ridge.csv")


# ------------------------------------------------------------
# OPTIONAL: 2) Lasso stacking (tends to use fewer signals)
# ------------------------------------------------------------

oof_lasso, test_lasso_folds = _stack_cv(
    lambda: Lasso(alpha=0.001), "Lasso", X_stack, y, X_stack_test, kf
)

lasso_mae = mean_absolute_error(y, oof_lasso)
print("\nLasso Stacking OOF MAE:", lasso_mae)

test_lasso = np.mean(test_lasso_folds, axis=0)

sub_stack_lasso = sample.copy()
sub_stack_lasso["Tm"] = test_lasso
sub_stack_lasso.to_csv("Submissions/submission_stack_lasso.csv", index=False)
print("Saved submission_stack_lasso.csv")


In [None]:
# ============================
# STACKING META-MODEL (A)
# ============================

# Level-1 features: one column per base model (LightGBM, XGBoost, CatBoost).
X_meta = np.vstack([oof_lgbm, oof_xgb, oof_cat]).T
X_meta_test = np.vstack([test_lgbm, test_xgb, test_cat]).T

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_predict

meta = Ridge(alpha=1.0)

# Genuine out-of-fold predictions for the meta-model: each row is predicted by
# a Ridge that never saw it. Predicting X_meta with a model fitted on X_meta
# would be in-sample, which makes the MAE below (and any blend-weight search
# that consumes meta_oof_pred) optimistically biased.
meta_cv = KFold(n_splits=5, shuffle=True, random_state=42)
meta_oof_pred = cross_val_predict(meta, X_meta, y, cv=meta_cv)

# The model used for the test set is still fitted on all training rows.
meta.fit(X_meta, y)
meta_test_pred = meta.predict(X_meta_test)

mae_meta = mean_absolute_error(y, meta_oof_pred)
print("\nðŸš€ Stacking Meta-Model MAE:", mae_meta)

# Save prediction
sub_stack = sample.copy()
sub_stack["Tm"] = meta_test_pred
sub_stack.to_csv("Submissions/submission_stacking_ridge_meta.csv", index=False)
