In [1]:
import os

is_kaggle_notebook = os.path.exists("/kaggle/input")

# ÂøÖË¶Å„Éë„ÉÉ„Ç±„Éº„Ç∏„Çí„Ç§„É≥„Çπ„Éà„Éº„É´
if is_kaggle_notebook:
    !pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

In [15]:
import os
import warnings
import sys
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from rdkit import rdBase

from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Turn off RDKit's 'rdApp.warning' log channel.
rdBase.DisableLog('rdApp.warning')
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/LightGBM.
warnings.filterwarnings("ignore")

In [None]:
# Presumably the pull-request number of the project build to use; it is
# zero-padded to four digits when the Kaggle dataset path is built.
pr_number = 1

In [17]:
if is_kaggle_notebook:
    module_path = f"/kaggle/input/myproject-pr-{pr_number:04}"
    !mkdir src
    !cp -r $module_path/* src/
    src_path = "./"
else:
    src_path = "../"

sys.path.append(src_path)

from src.data import load_data, add_descriptors, add_external_data
from src.utils import score, NULL_FOR_SUBMISSION

In [16]:
# ---------------------------
# LightGBM model training
# ---------------------------
def train_lgb_for_target(train, test, target_col, features, n_splits=5):
    """Fit K-fold LightGBM regressors for one target and predict the test set.

    Rows of ``train`` whose ``target_col`` is missing are dropped before
    splitting. One model is trained per fold with early stopping monitored on
    that fold's validation part. Test predictions are the mean of the fold
    models' predictions.

    Args:
        train: DataFrame holding ``features``, ``target_col`` and an ``id`` column.
        test: DataFrame holding ``features``.
        target_col: Name of the target column to fit.
        features: Column labels used as model inputs.
        n_splits: Number of KFold splits.

    Returns:
        Tuple ``(preds_test, oof, ids)`` where ``preds_test`` has one value per
        test row, ``oof`` has one out-of-fold prediction per labelled training
        row, and ``ids`` are the ``id`` values of those labelled rows, in the
        same order as ``oof``.
    """
    print(f"\n=== Training for target: {target_col} ===")

    # Only rows that actually carry a label for this target can be trained on.
    df_train = train[~train[target_col].isna()]

    X = df_train[features]
    y = df_train[target_col]
    X_test = test[features]

    preds_test = np.zeros(len(X_test))
    oof = np.zeros(len(X))

    # Hyper-parameters are identical for every fold, so build them once.
    # NOTE(review): LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq`/`bagging_freq` is > 0, which is not set here, so row
    # subsampling is presumably inactive. Confirm whether bagging was intended.
    params = {
        "objective": "regression",
        "metric": "mae",
        "verbosity": -1,
        "learning_rate": 0.01,
        "max_depth": 7,
        "seed": 42,
        "subsample": 0.7,
        "colsample_bytree": 0.6,
    }
    num_boost_round = 2000

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for tr_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        model = lgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            valid_sets=[dtrain, dval],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(200),
            ],
        )

        oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        # Each fold contributes an equal share to the averaged test prediction.
        preds_test += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

    # mean_squared_error returns the MSE; take the root so the value matches
    # the "RMSE" label printed below.
    score_rmse = np.sqrt(mean_squared_error(y, oof))
    score_mae = mean_absolute_error(y, oof)
    print(f"RMSE for {target_col}: {score_rmse:.4f}")
    print(f"MAE for {target_col}: {score_mae:.4f}")

    return preds_test, oof, df_train["id"].values

# ---------------------------
# Main processing
# ---------------------------
exp = "exp012"
debug = False

# A Kaggle run always uses the full configuration. Resolve the flag before
# deriving n_splits/output_path so they can never keep debug values.
if is_kaggle_notebook:
    # kaggle notebook
    debug = False
    data_dir = Path("/kaggle/input")
else:
    # local
    data_dir = Path("../data/raw")

n_splits = 2 if debug else 5
output_path = Path("../tmp" if debug else "../outputs") / exp

train, test = load_data(data_dir)

# Locally the descriptor-augmented training frame is cached as train.csv in
# the experiment's output directory; on Kaggle it is always recomputed.
train_cache = output_path / "train.csv"
if not is_kaggle_notebook and train_cache.exists():
    train = pd.read_csv(train_cache)
else:
    train = add_descriptors(train)
    if not is_kaggle_notebook:
        output_path.mkdir(parents=True, exist_ok=True)
        train.to_csv(train_cache, index=False)
        print("Saved train.csv")

test = add_descriptors(test)

train = add_external_data(train, data_dir)
# Assign a fresh running id after the external data has been added, so every
# training row has a unique key for joining the OOF predictions.
train["id"] = np.arange(len(train))
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
submission = pd.read_csv(data_dir / "neurips-open-polymer-prediction-2025/sample_submission.csv")

oof_dfs = []

# Every column except the targets, the id and the raw SMILES string is a feature.
features = train.drop(targets + ["id", "SMILES"], axis=1).columns
for target in targets:
    preds, oof, ids = train_lgb_for_target(train, test, target, features, n_splits)
    # NOTE(review): assumes the rows of `test` are in the same order as the
    # sample submission - confirm against load_data.
    submission[target] = preds

    oof_df = pd.DataFrame({
        "id": ids,
        target: oof
    })
    oof_dfs.append(oof_df)

if is_kaggle_notebook:
    submission.to_csv("submission.csv", index=False)
    print("‚úÖ submission saved to submission.csv")
else:
    # CV: join the per-target OOF predictions into one frame keyed by id.
    # Rows without a label for a target have no OOF value and stay NaN.
    oof_df = train[["id"]].copy()
    for i_oof in oof_dfs:
        oof_df = oof_df.merge(i_oof, on="id", how="left")
    solution = train[["id"] + targets].copy()
    # Missing labels and missing predictions are both replaced by the same
    # NULL_FOR_SUBMISSION value before being passed to the project's score helper.
    solution = solution.fillna(NULL_FOR_SUBMISSION)
    oof_df = oof_df.fillna(NULL_FOR_SUBMISSION)

    # Evaluation
    final_score = score(
        solution=solution,
        submission=oof_df,
    )
    print(f"\nüìä Final OOF Score (wMAE): {final_score:.6f}")


Generating descriptors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 52.97it/s]



=== Training for target: Tg ===
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 53.1571	valid_1's l1: 53.4513
[400]	training's l1: 46.7921	valid_1's l1: 48.6349
[600]	training's l1: 44.7979	valid_1's l1: 47.539
[800]	training's l1: 43.7485	valid_1's l1: 47.0547
[1000]	training's l1: 42.9567	valid_1's l1: 46.6756
[1200]	training's l1: 42.2864	valid_1's l1: 46.4024
[1400]	training's l1: 41.7681	valid_1's l1: 46.2104
[1600]	training's l1: 41.2031	valid_1's l1: 46.0342
[1800]	training's l1: 40.7063	valid_1's l1: 45.8924
[2000]	training's l1: 40.2375	valid_1's l1: 45.7657
Did not meet early stopping. Best iteration is:
[2000]	training's l1: 40.2375	valid_1's l1: 45.7657
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 52.1226	valid_1's l1: 57.0829
[400]	training's l1: 45.9493	valid_1's l1: 52.2681
[600]	training's l1: 44.1254	valid_1's l1: 51.1839
[800]	training's l1: 43.1183	valid_1's l1: 50.6913
[1000]	training's l1: 42.3