In [6]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install rdkit-pypi

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def featurize_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(2050)

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    tpsa = Descriptors.TPSA(mol)

    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    fp = np.array(fp)

    return np.concatenate([[mw, logp, hbd, hba, tpsa], fp])
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train = np.vstack(train["SMILES"].apply(featurize_smiles))
X_test = np.vstack(test["SMILES"].apply(featurize_smiles))
y = train["Tm"].values

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=42
)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train = np.vstack(train["SMILES"].apply(featurize_smiles))
X_test = np.vstack(test["SMILES"].apply(featurize_smiles))
y = train["Tm"].values

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=42
)
def objective(trial):
    params = {
        "objective": "regression_l1",
        "metric": "l1",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("lr", 0.005, 0.2, log=True),
        "num_leaves": trial.suggest_int("leaves", 16, 128),
        "feature_fraction": trial.suggest_float("ff", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bf", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bfreq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("minleaf", 10, 200),
    }

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, dtrain, valid_sets=[dval], num_boost_round=300)
    pred = model.predict(X_val)

    return mean_absolute_error(y_val, pred)
from IPython.display import clear_output

class ProgressCallback:
    def __init__(self):
        self.best = float("inf")

    def __call__(self, study, trial):
        if study.best_value < self.best:
            self.best = study.best_value
        clear_output(wait=True)
        print(f"üîÑ Optuna: Trials={len(study.trials)} | Best MAE={self.best:.4f}")
        
progress = ProgressCallback()

study = optuna.create_study(direction="minimize")
study.optimize(objective, timeout=60, callbacks=[progress])

best_params = study.best_params
best_params["objective"] = "regression_l1"
best_params["metric"] = "l1"
print("Final Params:", best_params)
dall = lgb.Dataset(X_train, label=y)
final_model = lgb.train(best_params, dall, num_boost_round=500)
pred_test = final_model.predict(X_test)

submission = pd.read_csv("Submissions/sample_submission.csv")
submission["Tm"] = pred_test
submission.to_csv("Submissions/submission_rdkit_lgbm.csv", index=False)
print("‚úîÔ∏è Saved submission_rdkit_lgbm.csv")
# -----------------------------------------
# After Optuna: print summary stats
# -----------------------------------------

print("\nüéØ Optuna finished!")
print(f"Best Validation MAE during tuning: {study.best_value:.4f}")
print("Best Parameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")

# -----------------------------------------
# Evaluate BEST Optuna model on validation set
# -----------------------------------------
best_model = lgb.train(best_params, lgb.Dataset(X_train, label=y), num_boost_round=500)

# Validation predictions
val_pred_final = best_model.predict(X_val)
val_mae_final = mean_absolute_error(y_val, val_pred_final)

print(f"\nüìä Final model MAE on validation split: {val_mae_final:.4f}")

# -----------------------------------------
# Evaluate MAE on full training data
# (should be artificially low, but useful to detect underfitting)
# -----------------------------------------

train_pred_full = best_model.predict(X_train)
train_mae_full = mean_absolute_error(y, train_pred_full)

print(f"üìò MAE on full training data: {train_mae_full:.4f}")

# -----------------------------------------
# Make predictions for test.csv
# -----------------------------------------

pred_test = best_model.predict(X_test)

submission = pd.read_csv("Submissions/sample_submission.csv")
submission["Tm"] = pred_test
submission.to_csv("Submissions/submission_rdkit_lgbm.csv", index=False)

print("\n‚úîÔ∏è Saved submission_rdkit_lgbm.csv")

# -----------------------------------------
# Warn user: Test MAE is unknown
# -----------------------------------------

print("\n‚ö†Ô∏è Hinweis:")
print("Der MAE f√ºr die erzeugte submission.csv kann NICHT berechnet werden,")
print("weil Kaggle die echten Testlabels nicht ver√∂ffentlicht.")
print("Nur Kaggle selbst kann die Test-MAE nach Upload auswerten.")

üîÑ Optuna: Trials=54 | Best MAE=30.5479
Final Params: {'lr': 0.09378091789016234, 'leaves': 68, 'ff': 0.8941347612779904, 'bf': 0.8403947088901286, 'bfreq': 1, 'minleaf': 11, 'objective': 'regression_l1', 'metric': 'l1'}
‚úîÔ∏è Saved submission_rdkit_lgbm.csv

üéØ Optuna finished!
Best Validation MAE during tuning: 30.5479
Best Parameters:
  lr: 0.09378091789016234
  leaves: 68
  ff: 0.8941347612779904
  bf: 0.8403947088901286
  bfreq: 1
  minleaf: 11
  objective: regression_l1
  metric: l1

üìä Final model MAE on validation split: 17.2874
üìò MAE on full training data: 16.9838

‚úîÔ∏è Saved submission_rdkit_lgbm.csv

‚ö†Ô∏è Hinweis:
Der MAE f√ºr die erzeugte submission.csv kann NICHT berechnet werden,
weil Kaggle die echten Testlabels nicht ver√∂ffentlicht.
Nur Kaggle selbst kann die Test-MAE nach Upload auswerten.
