In [6]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install rdkit-pypi

Note: you may need to restart the kernel to use updated packages.


In [8]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def featurize_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(2050)

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    tpsa = Descriptors.TPSA(mol)

    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    fp = np.array(fp)

    return np.concatenate([[mw, logp, hbd, hba, tpsa], fp])
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train = np.vstack(train["SMILES"].apply(featurize_smiles))
X_test = np.vstack(test["SMILES"].apply(featurize_smiles))
y = train["Tm"].values

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=42
)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train = np.vstack(train["SMILES"].apply(featurize_smiles))
X_test = np.vstack(test["SMILES"].apply(featurize_smiles))
y = train["Tm"].values

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=42
)
def objective(trial):
    params = {
        "objective": "regression_l1",
        "metric": "l1",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("lr", 0.005, 0.2, log=True),
        "num_leaves": trial.suggest_int("leaves", 16, 128),
        "feature_fraction": trial.suggest_float("ff", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bf", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bfreq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("minleaf", 10, 200),
    }

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, dtrain, valid_sets=[dval], num_boost_round=300)
    pred = model.predict(X_val)

    return mean_absolute_error(y_val, pred)
from IPython.display import clear_output

class ProgressCallback:
    def __init__(self):
        self.best = float("inf")

    def __call__(self, study, trial):
        if study.best_value < self.best:
            self.best = study.best_value
        clear_output(wait=True)
        print(f"üîÑ Optuna: Trials={len(study.trials)} | Best MAE={self.best:.4f}")
        
progress = ProgressCallback()

study = optuna.create_study(direction="minimize")
study.optimize(objective, timeout=60, callbacks=[progress])

best_params = study.best_params
best_params["objective"] = "regression_l1"
best_params["metric"] = "l1"
print("Final Params:", best_params)
dall = lgb.Dataset(X_train, label=y)
final_model = lgb.train(best_params, dall, num_boost_round=500)
pred_test = final_model.predict(X_test)

submission = pd.read_csv("Submissions/sample_submission.csv")
submission["Tm"] = pred_test
submission.to_csv("Submissions/submission_rdkit_lgbm.csv", index=False)
print("‚úîÔ∏è Saved submission_rdkit_lgbm.csv")
# -----------------------------------------
# After Optuna: print summary stats
# -----------------------------------------

print("\nüéØ Optuna finished!")
print(f"Best Validation MAE during tuning: {study.best_value:.4f}")
print("Best Parameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

# -----------------------------------------
# Evaluate BEST Optuna model on validation set
# -----------------------------------------
best_model = lgb.train(best_params, lgb.Dataset(X_train, label=y), num_boost_round=500)

# Validation predictions
val_pred_final = best_model.predict(X_val)
val_mae_final = mean_absolute_error(y_val, val_pred_final)

print(f"\nüìä Final model MAE on validation split: {val_mae_final:.4f}")

# -----------------------------------------
# Evaluate MAE on full training data
# (should be artificially low, but useful to detect underfitting)
# -----------------------------------------

train_pred_full = best_model.predict(X_train)
train_mae_full = mean_absolute_error(y, train_pred_full)

print(f"üìò MAE on full training data: {train_mae_full:.4f}")

# -----------------------------------------
# Make predictions for test.csv
# -----------------------------------------

pred_test = best_model.predict(X_test)

submission = pd.read_csv("Submissions/sample_submission.csv")
submission["Tm"] = pred_test
submission.to_csv("Submissions/submission_rdkit_lgbm.csv", index=False)

print("\n‚úîÔ∏è Saved submission_rdkit_lgbm.csv")

# -----------------------------------------
# Warn user: Test MAE is unknown
# -----------------------------------------

print("\n‚ö†Ô∏è Hinweis:")
print("Der MAE f√ºr die erzeugte submission.csv kann NICHT berechnet werden,")
print("weil Kaggle die echten Testlabels nicht ver√∂ffentlicht.")
print("Nur Kaggle selbst kann die Test-MAE nach Upload auswerten.")

üîÑ Optuna: Trials=90 | Best MAE=30.4593
Final Params: {'lr': 0.06367420563294472, 'leaves': 81, 'ff': 0.9399554701970719, 'bf': 0.8414087029933988, 'bfreq': 4, 'minleaf': 16, 'objective': 'regression_l1', 'metric': 'l1'}
‚úîÔ∏è Saved submission_rdkit_lgbm.csv

üéØ Optuna finished!
Best Validation MAE during tuning: 30.4593
Best Parameters:
  lr: 0.06367420563294472
  leaves: 81
  ff: 0.9399554701970719
  bf: 0.8414087029933988
  bfreq: 4
  minleaf: 16
  objective: regression_l1
  metric: l1

üìä Final model MAE on validation split: 17.2874
üìò MAE on full training data: 16.9838

‚úîÔ∏è Saved submission_rdkit_lgbm.csv

‚ö†Ô∏è Hinweis:
Der MAE f√ºr die erzeugte submission.csv kann NICHT berechnet werden,
weil Kaggle die echten Testlabels nicht ver√∂ffentlicht.
Nur Kaggle selbst kann die Test-MAE nach Upload auswerten.


In [9]:
pip install rdkit-pypi lightgbm xgboost catboost optuna

Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y = train["Tm"].values
smiles_train = train["SMILES"].tolist()
smiles_test = test["SMILES"].tolist()
# Liste aller RDKit-Deskriptoren
all_desc = Descriptors._descList
desc_names = [d[0] for d in all_desc]
desc_funcs = [d[1] for d in all_desc]

len(desc_names), desc_names[:10]
def featurize_mol(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fallback: 0-Features
        return np.zeros(len(desc_names) + 2048, dtype=float)

    # RDKit Deskriptoren
    desc_values = []
    for f in desc_funcs:
        try:
            v = f(mol)
        except Exception:
            v = np.nan
        desc_values.append(v)

    desc_values = np.array(desc_values, dtype=float)

    # Missing durch 0 ersetzen (oder median, je nach Geschmack)
    desc_values = np.nan_to_num(desc_values, nan=0.0)

    # Morgan Fingerprint
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    fp_arr = np.array(fp, dtype=float)

    return np.concatenate([desc_values, fp_arr])
X_train = np.vstack([featurize_mol(s) for s in smiles_train])
X_test  = np.vstack([featurize_mol(s) for s in smiles_test])

X_train.shape, X_test.shape
def cv_model(model_name, model_builder, X, y, X_test, n_splits=5, random_state=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_pred = np.zeros(len(y))
    test_pred_folds = []
    models = []

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):
        print(f"\n===== {model_name} | Fold {fold} / {n_splits} =====")
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        model = model_builder()
        model.fit(X_tr, y_tr)

        val_pred = model.predict(X_val)
        fold_mae = mean_absolute_error(y_val, val_pred)
        print(f"{model_name} Fold {fold} MAE: {fold_mae:.4f}")

        oof_pred[val_idx] = val_pred
        test_pred_folds.append(model.predict(X_test))
        models.append(model)

    oof_mae = mean_absolute_error(y, oof_pred)
    print(f"\n>>> {model_name} OOF MAE: {oof_mae:.4f}")

    test_pred_mean = np.mean(test_pred_folds, axis=0)
    return models, oof_pred, test_pred_mean, oof_mae
def build_lgbm():
    params = {
        "objective": "regression_l1",  # MAE
        "metric": "l1",
        "learning_rate": 0.03,
        "num_leaves": 64,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 3,
        "min_data_in_leaf": 30,
        "verbosity": -1,
        "n_estimators": 500,
    }
    return lgb.LGBMRegressor(**params)
def build_xgb():
    params = {
        "objective": "reg:squarederror",
        "learning_rate": 0.03,
        "max_depth": 8,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "n_estimators": 600,
        "reg_alpha": 0.0,
        "reg_lambda": 1.0,
        "tree_method": "hist",
    }
    return xgb.XGBRegressor(**params)
def build_cat():
    params = {
        "loss_function": "MAE",
        "learning_rate": 0.03,
        "depth": 8,
        "l2_leaf_reg": 3.0,
        "border_count": 128,
        "iterations": 600,
        "verbose": False,
        "random_seed": 42
    }
    return CatBoostRegressor(**params)
models_lgbm, oof_lgbm, test_lgbm, mae_lgbm = cv_model(
    "LightGBM", build_lgbm, X_train, y, X_test, n_splits=5
)
models_xgb, oof_xgb, test_xgb, mae_xgb = cv_model(
    "XGBoost", build_xgb, X_train, y, X_test, n_splits=5
)
models_cat, oof_cat, test_cat, mae_cat = cv_model(
    "CatBoost", build_cat, X_train, y, X_test, n_splits=5
)
oof_ens_simple = (oof_lgbm + oof_xgb + oof_cat) / 3.0
mae_ens_simple = mean_absolute_error(y, oof_ens_simple)
print("\nEnsemble (simple average) OOF MAE:", mae_ens_simple)
oof_ens_weighted = 0.4 * oof_lgbm + 0.3 * oof_xgb + 0.3 * oof_cat
mae_ens_weighted = mean_absolute_error(y, oof_ens_weighted)
print("Ensemble (weighted) OOF MAE:", mae_ens_weighted)
test_ens_simple = (test_lgbm + test_xgb + test_cat) / 3.0
test_ens_weighted = 0.4 * test_lgbm + 0.3 * test_xgb + 0.3 * test_cat
sample = pd.read_csv("sample_submission.csv")

# Einzelmodelle
sub_lgbm = sample.copy()
sub_lgbm["Tm"] = test_lgbm
sub_lgbm.to_csv("Submissions/submission_lgbm_rdkit.csv", index=False)

sub_xgb = sample.copy()
sub_xgb["Tm"] = test_xgb
sub_xgb.to_csv("Submissions/ssubmission_xgb_rdkit.csv", index=False)

sub_cat = sample.copy()
sub_cat["Tm"] = test_cat
sub_cat.to_csv("Submissions/ssubmission_cat_rdkit.csv", index=False)

# Ensembles
sub_ens_simple = sample.copy()
sub_ens_simple["Tm"] = test_ens_simple
sub_ens_simple.to_csv("Submissions/ssubmission_ens_simple_rdkit.csv", index=False)

sub_ens_weighted = sample.copy()
sub_ens_weighted["Tm"] = test_ens_weighted
sub_ens_weighted.to_csv("Submissions/ssubmission_ens_weighted_rdkit.csv", index=False)
def objective_lgbm(trial):
    params = {
        "objective": "regression_l1",
        "metric": "l1",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("lr", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("leaves", 31, 255),
        "feature_fraction": trial.suggest_float("ff", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bf", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bfreq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("minleaf", 10, 200),
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_pred = np.zeros(len(y))

    for tr_idx, val_idx in kf.split(X_train, y):
        X_tr_k, X_val_k = X_train[tr_idx], X_train[val_idx]
        y_tr_k, y_val_k = y[tr_idx], y[val_idx]

        model = lgb.LGBMRegressor(
            objective=params["objective"],
            metric=params["metric"],
            learning_rate=params["learning_rate"],
            num_leaves=params["num_leaves"],
            feature_fraction=params["feature_fraction"],
            bagging_fraction=params["bagging_fraction"],
            bagging_freq=params["bagging_freq"],
            min_data_in_leaf=params["min_data_in_leaf"],
            n_estimators=500,
            verbosity=-1,
        )

        model.fit(X_tr_k, y_tr_k)
        oof_pred[val_idx] = model.predict(X_val_k)

    mae_cv = mean_absolute_error(y, oof_pred)
    return mae_cv

study = optuna.create_study(direction="minimize")
study.optimize(objective_lgbm, timeout=60)
print("Best LGBM CV MAE:", study.best_value)
print("Best params:", study.best_params)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
    app.start()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", l

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
    app.start()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", l

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
    app.start()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", l

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
    app.start()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", l

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
    app.start()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", l

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
    app.start()
  File "C:\Users\linus\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel\kernelapp.py", l

AttributeError: _ARRAY_API not found