In [13]:
# ================================================================
# CLEAN ENSEMBLE PIPELINE: RDKit + LGBM + XGB + CAT + STACKING
# ================================================================

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# ML / CV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

# RDKit
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors


# ================================================================
# 1) DATA LOADING
# ================================================================

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y = train["Tm"].values
smiles_train = train["SMILES"].tolist()
smiles_test  = test["SMILES"].tolist()


# ================================================================
# 2) FEATURE ENGINEERING (RDKit Descriptors + Morgan FP)
# ================================================================

# Get RDKit descriptor functions
all_desc = Descriptors._descList
desc_names = [d[0] for d in all_desc]
desc_funcs = [d[1] for d in all_desc]

NUM_FP = 2048
NUM_DESC = len(desc_names)

def featurize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(NUM_DESC + NUM_FP)

    # RDKit Descriptors
    desc_vals = []
    for f in desc_funcs:
        try:
            v = f(mol)
        except:
            v = 0.0
        if v is None or pd.isna(v):
            v = 0.0
        desc_vals.append(v)

    desc_vals = np.array(desc_vals)

    # Morgan Fingerprint
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=NUM_FP)
    fp = np.array(fp, dtype=float)

    return np.concatenate([desc_vals, fp])


print("ðŸ”„ Computing Featuresâ€¦")
X_train = np.vstack([featurize(s) for s in smiles_train])
X_test  = np.vstack([featurize(s) for s in smiles_test])

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


# ================================================================
# 3) GENERIC CROSS-VALIDATION FUNCTION
# ================================================================

def run_cv(model, X, y, X_test, name, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof = np.zeros(len(y))
    test_preds = []

    print(f"\n========== {name}: {n_splits}-Fold CV ==========")

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        # ----- LightGBM -----
        if isinstance(model, lgb.LGBMRegressor):
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                callbacks=[lgb.log_evaluation(period=0)]
            )

        # ----- XGBoost -----
        elif isinstance(model, xgb.XGBRegressor):
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False      # wichtig: XGB akzeptiert verbose
            )

        # ----- CatBoost -----
        elif isinstance(model, CatBoostRegressor):
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

        else:
            raise ValueError("Unknown model type:", model)

        # ----- Predictions -----
        pred = model.predict(X_val)
        fold_mae = mean_absolute_error(y_val, pred)
        print(f"{name} Fold {fold} MAE: {fold_mae:.4f}")

        oof[val_idx] = pred
        test_preds.append(model.predict(X_test))

    # Final CV result
    oof_mae = mean_absolute_error(y, oof)
    test_mean = np.mean(test_preds, axis=0)

    print(f">>> {name} OOF MAE: {oof_mae:.4f}\n")
    return oof, test_mean, oof_mae


# ================================================================
# 4) MODEL DEFINITIONS
# ================================================================

def build_lgbm():
    return lgb.LGBMRegressor(
        objective="regression_l1",
        metric="l1",
        learning_rate=0.03,
        num_leaves=48,
        feature_fraction=0.85,
        bagging_fraction=0.8,
        bagging_freq=3,
        min_data_in_leaf=50,
        lambda_l1=1.0,
        lambda_l2=1.0,
        n_estimators=2000,
        verbosity=-1,
    )

def build_xgb():
    return xgb.XGBRegressor(
        objective="reg:squarederror",
        learning_rate=0.03,
        max_depth=7,
        subsample=0.85,
        colsample_bytree=0.85,
        n_estimators=1500,
        reg_alpha=0.3,
        reg_lambda=1.0,
        tree_method="hist",
        gamma=0.0,
    )

def build_cat():
    return CatBoostRegressor(
        loss_function="MAE",
        learning_rate=0.03,
        depth=8,
        l2_leaf_reg=5.0,
        iterations=1500,
        verbose=False,
        random_seed=42
    )


# ================================================================
# 5) TRAIN INDIVIDUAL MODELS
# ================================================================

oof_lgb, test_lgb, mae_lgb = run_cv(build_lgbm(), X_train, y, X_test, "LightGBM")
oof_xgb, test_xgb, mae_xgb = run_cv(build_xgb(), X_train, y, X_test, "XGBoost")
oof_cat, test_cat, mae_cat = run_cv(build_cat(), X_train, y, X_test, "CatBoost")


# ================================================================
# 6) ENSEMBLES
# ================================================================

# Simple average ensemble
oof_ens_simple = (oof_lgb + oof_xgb + oof_cat) / 3
test_ens_simple = (test_lgb + test_xgb + test_cat) / 3
mae_ens_simple = mean_absolute_error(y, oof_ens_simple)

print("Simple Ensemble MAE:", mae_ens_simple)

# Weighted ensemble
oof_ens_weighted = 0.4 * oof_lgb + 0.3 * oof_xgb + 0.3 * oof_cat
test_ens_weighted = 0.4 * test_lgb + 0.3 * test_xgb + 0.3 * test_cat
mae_ens_weighted = mean_absolute_error(y, oof_ens_weighted)

print("Weighted Ensemble MAE:", mae_ens_weighted)


# ================================================================
# 7) STACKING (LEVEL-2 RIDGE)
# ================================================================

X_stack = np.vstack([oof_lgb, oof_xgb, oof_cat]).T
X_stack_test = np.vstack([test_lgb, test_xgb, test_cat]).T

stacker = Ridge(alpha=0.1)
stacker.fit(X_stack, y)

oof_stack = stacker.predict(X_stack)
test_stack = stacker.predict(X_stack_test)

mae_stack = mean_absolute_error(y, oof_stack)
print("\nStacking (Ridge) MAE:", mae_stack)


# ================================================================
# 8) EXPORT SUBMISSIONS
# ================================================================

sample = pd.read_csv("Submissions/sample_submission.csv")

# Individual models
sample.assign(Tm=test_lgb) .to_csv("Submissions/submission_lgbm_rdkit.csv", index=False)
sample.assign(Tm=test_xgb) .to_csv("Submissions/submission_xgb_rdkit.csv", index=False)
sample.assign(Tm=test_cat) .to_csv("Submissions/submission_cat_rdkit.csv", index=False)

# Ensembles
sample.assign(Tm=test_ens_simple)   .to_csv("Submissions/submission_ens_simple_rdkit.csv", index=False)
sample.assign(Tm=test_ens_weighted) .to_csv("Submissions/submission_ens_weighted_rdkit.csv", index=False)

# Stacking
sample.assign(Tm=test_stack) .to_csv("Submissions/submission_stack_ridge.csv", index=False)

print("\nâœ” Alle Submission-Dateien erfolgreich exportiert!")

ðŸ”„ Computing Featuresâ€¦
Train shape: (2662, 2256) Test shape: (666, 2256)

LightGBM Fold 1 MAE: 30.9597
LightGBM Fold 2 MAE: 28.6413
LightGBM Fold 3 MAE: 28.0686
LightGBM Fold 4 MAE: 29.4618
LightGBM Fold 5 MAE: 29.3733
>>> LightGBM OOF MAE: 29.3013


XGBoost Fold 1 MAE: 28.1343
XGBoost Fold 2 MAE: 26.6812
XGBoost Fold 3 MAE: 27.5682
XGBoost Fold 4 MAE: 28.0558
XGBoost Fold 5 MAE: 26.3206
>>> XGBoost OOF MAE: 27.3521


CatBoost Fold 1 MAE: 28.7246
CatBoost Fold 2 MAE: 27.6681
CatBoost Fold 3 MAE: 27.5868
CatBoost Fold 4 MAE: 28.8916
CatBoost Fold 5 MAE: 27.1770
>>> CatBoost OOF MAE: 28.0097

Simple Ensemble MAE: 27.56120000944658
Weighted Ensemble MAE: 27.680900190589977

Stacking (Ridge) MAE: 27.243908942831457

âœ” Alle Submission-Dateien erfolgreich exportiert!


In [15]:
# ================================================================
# XGBOOST â€“ CLEAN STANDALONE PIPELINE FOR MELTING-POINT PREDICTION
# ================================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# XGBoost
import xgboost as xgb

# RDKit
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors


# ================================================================
# 1) LOAD DATA
# ================================================================

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

y = train["Tm"].values
smiles_train = train["SMILES"].tolist()
smiles_test  = test["SMILES"].tolist()


# ================================================================
# 2) FEATURE ENGINEERING
# ================================================================

# RDKit descriptors
all_desc = Descriptors._descList
desc_names = [d[0] for d in all_desc]
desc_funcs = [d[1] for d in all_desc]

NUM_DESC = len(desc_names)
NUM_FP   = 2048  # Morgan fingerprint size


def featurize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(NUM_DESC + NUM_FP)

    # RDKit descriptors
    desc_values = []
    for f in desc_funcs:
        try:
            v = f(mol)
        except:
            v = 0.0
        if v is None or pd.isna(v):
            v = 0.0
        desc_values.append(v)

    desc_values = np.array(desc_values)

    # Morgan fingerprint
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=NUM_FP)
    fp = np.array(fp, dtype=float)

    return np.concatenate([desc_values, fp])


print("ðŸ”„ Computing Features...")
X_train = np.vstack([featurize(s) for s in smiles_train])
X_test  = np.vstack([featurize(s) for s in smiles_test])

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


# ================================================================
# 3) XGBOOST MODEL
# ================================================================

def build_xgb():
    return xgb.XGBRegressor(
        objective="reg:squarederror",
        learning_rate=0.02,
        max_depth=6,
        n_estimators=8000,
        subsample=0.7,
        colsample_bytree=0.6,
        min_child_weight=5,
        reg_alpha=1.0,
        reg_lambda=2.0,
        tree_method="hist",
        gamma=0.0,
    )

#### RESULTS: 
# default params: 
# objective="reg:squarederror", learning_rate=0.03, max_depth=6, 
# subsample=0.85, colsample_bytree=0.85, n_estimators=8000, reg_alpha=0.3, reg_lambda=1.0,
# tree_method="hist", gamma=0.0,

# TEST1: max_depth = 7 & n_estimators = 1500 -> MAE: 27.3521
# TEST2: max_depth = 6 & n_estimators = 8000 -> MAE: 26.9902
# TEST3: max_depth = 6 & n_estimators = 8000 & gamma=1.0 -> MAE: 26.9620 (but in Kaggle worse than TEST2)
# TEST4: learning_rate=0.02 & max_depth = 6 & n_estimators = 8000 & reg_alpha=1.0, & reg_lambda=2.0 -> MAE: 



# ================================================================
# 4) CROSS-VALIDATION
# ================================================================

def run_cv_xgb(X, y, X_test, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof = np.zeros(len(y))
    test_folds = []

    print(f"\n========== XGBOOST {n_splits}-Fold CV ==========")

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
        print(f"\n--- Fold {fold}/{n_splits} ---")

        X_tr, X_val = X[tr_idx], X[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        model = build_xgb()
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

        pred_val = model.predict(X_val)
        fold_mae = mean_absolute_error(y_val, pred_val)

        print(f"Fold MAE: {fold_mae:.4f}")

        oof[val_idx] = pred_val
        test_folds.append(model.predict(X_test))

    oof_mae = mean_absolute_error(y, oof)
    test_pred = np.mean(test_folds, axis=0)

    print(f"\n>>> FINAL XGB OOF MAE: {oof_mae:.4f}")

    return oof, test_pred, oof_mae


# Run CV
oof_xgb, test_xgb, mae_xgb = run_cv_xgb(X_train, y, X_test)


# ================================================================
# 5) EXPORT SUBMISSION
# ================================================================

sample = pd.read_csv("Submissions/sample_submission.csv")
sample["Tm"] = test_xgb
sample.to_csv("Submissions/submission_xgb_rdkit.csv", index=False)

print("\nâœ” Saved: Submissions/submission_xgb_rdkit.csv")

ðŸ”„ Computing Features...
Train shape: (2662, 2256) Test shape: (666, 2256)


--- Fold 1/5 ---
Fold MAE: 27.5113

--- Fold 2/5 ---
Fold MAE: 26.8771

--- Fold 3/5 ---
Fold MAE: 27.4852

--- Fold 4/5 ---
Fold MAE: 27.2596

--- Fold 5/5 ---
Fold MAE: 26.1409

>>> FINAL XGB OOF MAE: 27.0549

âœ” Saved: Submissions/submission_xgb_rdkit.csv


In [16]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

# =====================================================
# 1) Load Data
# =====================================================

train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

smiles_train = train["SMILES"].tolist()
smiles_test  = test["SMILES"].tolist()

print("Train rows:", len(train))
print("Test rows :", len(test))


# =====================================================
# 2) Build Feature Extractor
# =====================================================

# List of RDKit descriptor functions
all_desc = Descriptors._descList
desc_names = [d[0] for d in all_desc]
desc_funcs = [d[1] for d in all_desc]

def featurize_smiles(smi):
    """
    Converts a SMILES string into:
    - RDKit descriptors
    - Morgan fingerprint (2048 bits)
    """

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Molecule invalid â†’ return zero vector
        return np.zeros(len(desc_names) + 2048, dtype=float)

    # --- RDKit Descriptors ---
    desc_values = []
    for func in desc_funcs:
        try:
            val = func(mol)
        except:
            val = np.nan
        desc_values.append(val)

    desc_values = np.nan_to_num(np.array(desc_values, dtype=float))

    # --- Morgan Fingerprint ---
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    fp = np.array(fp, dtype=float)

    return np.concatenate([desc_values, fp])


# =====================================================
# 3) Compute Features
# =====================================================

print("Computing RDKit features... (this can take a few minutes)")
X_train = np.vstack([featurize_smiles(s) for s in smiles_train])
X_test  = np.vstack([featurize_smiles(s) for s in smiles_test])

print("Shapes:")
print(" â†’ Train features:", X_train.shape)
print(" â†’ Test features :", X_test.shape)


# =====================================================
# 4) Build Feature DataFrames
# =====================================================

# Column names
fp_names = [f"fp_{i}" for i in range(2048)]
feature_columns = desc_names + fp_names

# Training DF with Tm
train_features = pd.DataFrame(X_train, columns=feature_columns)
train_features["Tm"] = train["Tm"].values   # Add target

# Test DF with id
test_features = pd.DataFrame(X_test, columns=feature_columns)
test_features["id"] = test["id"].values


# =====================================================
# 5) Save Feature Files
# =====================================================

train_features.to_csv("train_features.csv", index=False)
test_features.to_csv("test_features.csv", index=False)

print("\nâœ“ Saved train_features.csv")
print("âœ“ Saved test_features.csv")
print("Done!")


Train rows: 2662
Test rows : 666
Computing RDKit features... (this can take a few minutes)
Shapes:
 â†’ Train features: (2662, 2256)
 â†’ Test features : (666, 2256)

âœ“ Saved train_features.csv
âœ“ Saved test_features.csv
Done!
