- -> exp041
- morgan fingerprint の実装バグってたかも (the Morgan fingerprint implementation may have been buggy)

In [1]:
# %load_ext autotime
# Load IPython's autoreload extension; the reload mode itself is selected later
# with `%autoreload 2`, right before the project modules are imported.
%load_ext autoreload

In [2]:
import os

is_kaggle_notebook = os.path.exists("/kaggle/input")

# ÂøÖË¶Å„Éë„ÉÉ„Ç±„Éº„Ç∏„Çí„Ç§„É≥„Çπ„Éà„Éº„É´
if is_kaggle_notebook:
    !pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
    !pip install /kaggle/input/torch-geometric-2-6-1/torch_geometric-2.6.1-py3-none-any.whl

In [3]:
import copy
import json
import os
import sys
import warnings
from pathlib import Path

import wandb

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from rdkit import rdBase
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GroupKFold, KFold
from tqdm.auto import tqdm

# Turn off RDKit's "rdApp.warning" log channel.
rdBase.DisableLog("rdApp.warning")
# NOTE(review): this hides every Python warning for the rest of the session,
# including deprecation / convergence warnings from pandas, sklearn and LightGBM.
# Consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

# Authenticate with Weights & Biases; the run itself is created later by wandb.init().
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mko_ya346[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Used to build the Kaggle dataset path that holds the project source
# (`/kaggle/input/myproject-pr-{pr_number:04}`).
pr_number = 1
# Debug switch; copied into config["debug"], which reduces n_splits / num_epochs
# and subsamples the training data when True.
debug = False

In [5]:
%autoreload 2

if is_kaggle_notebook:
    module_path = f"/kaggle/input/myproject-pr-{pr_number:04}"
    !mkdir src
    !cp -r $module_path/* src/
    src_path = "./"
else:
    src_path = "../"

sys.path.append(src_path)

from src.data import load_data, add_descriptors, add_external_data, make_smile_canonical, add_maccs, add_augumented_gmm, add_graph_features, add_count_atoms
from src.model import train_lgb_for_target, save_lgb_model
from src.utils import NULL_FOR_SUBMISSION, generate_scaffold, score, add_scaffold_kfold, scaffold_cv_split, get_useless_cols
from src.utils.upload_kaggle_dataset import (
    create_kaggle_dataset_metadata,
    upload_kaggle_dataset,
)

In [6]:
# Experiment identity: `exp` is used in the W&B run name, the output directory
# and the dataset title; `notes` is logged to W&B as free text.
exp = "exp042"
notes = "morgan fingerprint „ÅÆÂÆüË£Ö‰øÆÊ≠£"
model_name = "lgb"

# Run configuration; passed to wandb.init(config=...) after being merged with `params`.
# NOTE(review): "num_epochs", "batch_size" and "drop_ratio" are not read anywhere
# in the visible notebook code — presumably leftovers from a neural-net template; confirm.
config = {
    "debug": debug,
    "n_splits": 5,
    "num_epochs": 1000,
    "batch_size": 128,
    "drop_ratio": 0.5,
    "force_update_train": False,  # True -> rebuild train.csv even if a cached copy exists
    "augumented_gmm": False,  # True -> augment each training fold via add_augumented_gmm
    "is_complement": True  # forwarded to add_external_data(is_complement=...)
}

# NOTE(review): not used in the visible code — presumably consumed by the
# Kaggle dataset upload helpers imported from src.utils; verify.
dataset_title = f"model-{exp}"
dataset_id = f"koya346/{dataset_title}"

# Debug mode is always disabled on Kaggle.
if is_kaggle_notebook:
    config["debug"] = False

# Debug runs use fewer folds / epochs.
if config["debug"]:
    config["n_splits"] = 2
    config["num_epochs"] = 10

# Target columns; one set of CV models is trained per target.
targets = ["Tg", "FFV", "Tc", "Density", "Rg"]

# TODO: define the training parameters
# Parameters handed to lgb.train() for the final per-fold models.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction) only
# takes effect when `subsample_freq` / `bagging_freq` is non-zero; none is set here,
# so row subsampling is likely inactive — confirm and add `subsample_freq` if intended.
# NOTE(review): "num_boost_round" is given inside params rather than as the
# lgb.train() argument — presumably relied on as an alias for the iteration count; confirm.
params = {
    "objective": "regression",
    "metric": "mae",
    "verbosity": -1,
    "learning_rate": 0.01,
    "max_depth": 7,
    "seed": 42,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
    "num_boost_round": 20000,
}

# Merge the model parameters into the config that gets logged to W&B.
config.update(params)
# Parameters for the short preliminary model used for feature selection:
# identical to `params` except for a much smaller number of boosting rounds.
pre_params = copy.deepcopy(params)
pre_params["num_boost_round"] = 300


In [7]:
# Start the Weights & Biases run; debug runs get a "_debug" suffix in their name.
wandb_name = f"{exp}_{model_name}_debug" if config["debug"] else f"{exp}_{model_name}"
wandb.init(project="opp2025", name=wandb_name, config=config)
wandb.log({"Notes": notes})

# ---------------------------
# Main processing
# ---------------------------
# Debug runs write into a separate "debug" sub-directory of the experiment folder.
# NOTE(review): the base directories below are absolute, machine-specific paths;
# consider deriving them from an environment variable or a project-relative root.
output_root = Path("/home/kouya-takahashi/kaggle/opp2025/outputs") / exp
output_path = output_root / "debug" if config["debug"] else output_root

# exist_ok=True replaces the former exists()-then-makedirs() pair, which could
# race and was needlessly verbose.
model_output_path = output_path / "model_cv"
os.makedirs(model_output_path, exist_ok=True)

if is_kaggle_notebook:
    # kaggle notebook
    data_dir = Path("/kaggle/input")
else:
    # local
    data_dir = Path("/home/kouya-takahashi/kaggle/opp2025/data/raw")

# Prepare the training data

# Build the training frame, or reuse the cached train.csv from a previous run
# unless config["force_update_train"] asks for a rebuild.
train_cache_path = output_path / "train.csv"
use_cached_train = os.path.exists(train_cache_path) and not config["force_update_train"]

if use_cached_train:
    train = pd.read_csv(train_cache_path)
else:
    train, _ = load_data(data_dir)
    train["SMILES"] = train["SMILES"].apply(make_smile_canonical)

    if not config["debug"]:
        print(train.shape)
        supplement_dir = data_dir / "neurips-open-polymer-prediction-2025/train_supplement"
        extra_dir = data_dir / "smiles-extra-data"
        # Each entry: file to read, the target column it provides, and an
        # optional column rename that maps the file's header onto that target.
        external_sources = [
            {"ex_path": supplement_dir / "dataset3.csv", "col": "Tg"},
            {
                "ex_path": supplement_dir / "dataset1.csv",
                "col": "Tc",
                "rename_d": {"TC_mean": "Tc"},
            },
            {"ex_path": supplement_dir / "dataset4.csv", "col": "FFV"},
            {
                "ex_path": data_dir / "tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv",
                "col": "Tg",
            },
            # {
            #     "ex_path": data_dir / "smiles-extra-data/data_dnst1.xlsx",
            #     "col": "Density",
            #     "rename_d": {"density(g/cm3)": "Density"},
            # },
            {
                "ex_path": extra_dir / "data_tg3.xlsx",
                "col": "Tg",
                "rename_d": {"Tg [K]": "Tg"},
            },
            {
                "ex_path": extra_dir / "JCIM_sup_bigsmiles.csv",
                "col": "Tg",
                "rename_d": {"Tg (C)": "Tg"},
            },
        ]
        for source in external_sources:
            print(f"ex_path: {str(source['ex_path'])}")
            train = add_external_data(
                df=train,
                ex_path=source["ex_path"],
                col=source["col"],
                rename_d=source.get("rename_d"),
                is_complement=config["is_complement"],
            )
            print("after train.shape: ", train.shape)
    else:
        # Debug: keep the first 30 rows with a non-missing value for each target.
        train = pd.concat(
            [train[train[name].notnull()].iloc[:30] for name in targets]
        ).reset_index(drop=True)

    train = add_maccs(train)

    # RDKit descriptors and Morgan fingerprint
    train = add_descriptors(train, radius=2, fp_size=1024)

    # Make column names unique: the k-th repeat of a name gets the suffix "_k".
    seen_counts = {}
    unique_names = []
    for name in train.columns:
        if name not in seen_counts:
            seen_counts[name] = 0
            unique_names.append(name)
            continue
        seen_counts[name] += 1
        unique_names.append(f"{name}_{seen_counts[name]}")
    train.columns = unique_names

    # Rows without an id come from the external data sets.
    train["is_external"] = train["id"].isnull()

    # Graph features
    train = add_graph_features(train)
    train = add_count_atoms(train)

    # Re-number the ids so that every row, including external ones, has one.
    train["id"] = np.arange(len(train))

    non_feature_cols = targets + ["id", "SMILES"]
    features = train.drop(columns=non_feature_cols).columns
    # Coerce object-typed feature columns to numbers; unparsable values become NaN.
    for col in features:
        if train[col].dtype != "object":
            continue
        train[col] = pd.to_numeric(train[col], errors="coerce")

    useless_cols = get_useless_cols(train.drop(columns=non_feature_cols))
    train = train.drop(columns=useless_cols)

    os.makedirs(output_path, exist_ok=True)
    train.to_csv(train_cache_path, index=False)
    print("Saved train.csv")

# Everything except the targets, the id and the SMILES string is a model feature.
features = train.drop(columns=targets + ["id", "SMILES"]).columns
print(len(features))

# Per-target scaffold-split cross-validation with LightGBM.
# Produces: oof_dfs (one OOF frame per target), mae_dict (OOF MAE per target),
# all_models (fold models per target), all_loss_tables / loss_table_wandb (fold metrics).
oof_dfs = []

loss_table_wandb = wandb.Table(["exp", "model_name", "fold", "target", "mae", "mse"])
all_loss_tables = []
mae_dict = {}
all_models = {}

for target_col in targets:
    loss_tables = []
    print(f"\n=== Training for target: {target_col} ===")

    # Only rows that have a label for this target take part in its CV.
    df_train = train[train[target_col].notnull()].reset_index(drop=True)
    df_train = add_scaffold_kfold(df_train, n_splits=config["n_splits"])
    X = df_train[features]
    y = df_train[target_col]
    oof = np.zeros(len(X))

    models = []

    for fold, tr_idx, val_idx in scaffold_cv_split(df_train, n_splits=config["n_splits"]):
        print(f"fold: {fold + 1}")
        if not config["debug"]:
            # Feature selection: fit a short preliminary model on the training
            # fold only (the validation data is not used here).
            X_train_pre = X.iloc[tr_idx]
            y_train_pre = y.iloc[tr_idx]
            dtrain_pre = lgb.Dataset(X_train_pre, label=y_train_pre)
            pre_model = lgb.train(pre_params, dtrain_pre)

            # Keep only the features whose importance is greater than 0;
            # the printed number is the fraction of features being dropped.
            feature_importance = pre_model.feature_importance()
            print(np.sum(feature_importance == 0) / len(feature_importance))

            use_features = [
                name
                for name, importance in zip(pre_model.feature_name(), feature_importance)
                if importance > 0
            ]
        else:
            use_features = features
        print(len(use_features))

        # Train on the selected features, this time with the validation fold
        # attached for early stopping.
        X_train, X_val = X.iloc[tr_idx][use_features], X.iloc[val_idx][use_features]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        if config["augumented_gmm"]:
            X_train, y_train = add_augumented_gmm(X_train, y_train)

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dval],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(200),
            ],
        )

        save_lgb_model(model, str(model_output_path / f"model_{target_col}_{fold}.txt"))

        pred = model.predict(X_val, num_iteration=model.best_iteration)
        oof[val_idx] = pred

        loss_tables.append(
            {
                "fold": fold,
                "target": target_col,
                "mae": mean_absolute_error(y_val, pred),
                "mse": mean_squared_error(y_val, pred),
            }
        )
        models.append(model)

    # OOF metrics for this target. The line labelled "RMSE" used to print the
    # raw MSE; take the square root so the value matches its label.
    score_mse = mean_squared_error(y, oof)
    score_rmse = np.sqrt(score_mse)
    score_mae = mean_absolute_error(y, oof)
    print(f"RMSE for {target_col}: {score_rmse:.4f}")
    print(f"MAE for {target_col}: {score_mae:.4f}")
    mae_dict[target_col] = score_mae

    for loss_table in loss_tables:
        loss_table_wandb.add_data(
            exp,
            model_name,
            loss_table["fold"],
            loss_table["target"],
            loss_table["mae"],
            loss_table["mse"],
        )
    all_loss_tables.extend(loss_tables)

    oof_df = pd.DataFrame({
        "id": df_train["id"].values,
        target_col: oof,
    })
    oof_dfs.append(oof_df)

    all_models[target_col] = models

# Upload the per-fold loss table, then one "<target>_mean_mae" entry per target.
wandb.log({"fold_target_losses": loss_table_wandb})

# mae_dict holds a single OOF MAE per target, so np.mean() of it is that same value.
# (Alternative kept from the original: average the per-fold "mae" entries of
# all_loss_tables for the target instead.)
for target in targets:
    wandb.log({f"{target}_mean_mae": np.mean(mae_dict[target])})

 
# CV computation: join every target's OOF predictions onto the full id list.
oof_df = train[["id"]].copy()
for target_oof in oof_dfs:
    oof_df = oof_df.merge(target_oof, on="id", how="left")
oof_df.to_csv(output_path / "oof.csv", index=False)

# Missing labels / predictions are replaced by the submission placeholder
# before both frames are handed to the metric.
solution = train[["id"] + targets].fillna(NULL_FOR_SUBMISSION)
oof_df = oof_df.fillna(NULL_FOR_SUBMISSION)

# Evaluation
final_score = score(solution=solution, submission=oof_df)
print(f"\nüìä Final OOF Score (wMAE): {final_score:.6f}")
wandb.log({"wMAE": final_score})

# Save best_iteration for every target (one value per fold model); according to
# the original note, the saved model files did not record it.
best_iterations = {
    target: [fold_model.best_iteration for fold_model in all_models[target]]
    for target in targets
}
print(best_iterations)

with (output_path / "best_iterations.json").open("w") as out_file:
    json.dump(best_iterations, out_file)

(7973, 7)
ex_path: /home/kouya-takahashi/kaggle/opp2025/data/raw/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv
after train.shape:  (8019, 7)
ex_path: /home/kouya-takahashi/kaggle/opp2025/data/raw/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
after train.shape:  (8148, 7)
ex_path: /home/kouya-takahashi/kaggle/opp2025/data/raw/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
after train.shape:  (8972, 7)
ex_path: /home/kouya-takahashi/kaggle/opp2025/data/raw/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv
after train.shape:  (10343, 7)
ex_path: /home/kouya-takahashi/kaggle/opp2025/data/raw/smiles-extra-data/data_tg3.xlsx
after train.shape:  (10842, 7)
ex_path: /home/kouya-takahashi/kaggle/opp2025/data/raw/smiles-extra-data/JCIM_sup_bigsmiles.csv
after train.shape:  (10854, 7)


Generating maccs:   0%|          | 0/10854 [00:00<?, ?it/s]

Generating descriptors:   0%|          | 0/10854 [00:00<?, ?it/s]

Unique=1 col: ['maccs_0', 'maccs_1', 'maccs_2', 'maccs_4', 'maccs_5', 'maccs_6', 'maccs_7', 'maccs_9', 'maccs_166', 'SMR_VSA8', 'SlogP_VSA9', 'fr_isothiocyan', 'fr_nitroso', 'fr_prisulfonamd', 'fr_thiocyan']
Highly correlated cols:  ['maccs_40', 'maccs_55', 'maccs_56', 'maccs_58', 'maccs_60', 'maccs_61', 'maccs_63', 'maccs_64', 'maccs_71', 'maccs_105', 'maccs_117', 'maccs_134', 'maccs_135', 'maccs_143', 'maccs_147', 'MaxAbsEStateIndex', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'FpDensityMorgan2', 'FpDensityMorgan3', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Kappa1', 'LabuteASA', 'SlogP_VSA6', 'VSA_EState6', 'HeavyAtomCount', 'NOCount', 'NumAromaticCarbocycles', 'NumAromaticRings', 'NumHDonors', 'MolMR', 'fr_Al_OH_noTert', 'fr_COO2', 'fr_C_O_noCOO', 'fr_Nhpyrrole', 'fr_benzene', 'fr_diazo', 'fr_halogen', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_pho