- -> exp048
- 主鎖、置換基の情報

In [16]:
# Load the autoreload extension; it is switched on with `%autoreload 2` in a later cell.
# The autotime extension is left disabled.
# %load_ext autotime
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import os

is_kaggle_notebook = os.path.exists("/kaggle/input")

# 必要パッケージをインストール
if is_kaggle_notebook:
    !pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
    !pip install /kaggle/input/torch-geometric-2-6-1/torch_geometric-2.6.1-py3-none-any.whl

In [18]:
import copy
import json
import os
import sys
import warnings
from pathlib import Path

import wandb

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from rdkit import rdBase
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, ShuffleSplit
from tqdm.auto import tqdm

from sklearn.decomposition import TruncatedSVD
from scipy import sparse

# Turn off RDKit's "rdApp.warning" log channel.
rdBase.DisableLog("rdApp.warning")
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Log in to Weights & Biases before any run is created.
wandb.login()

True

In [19]:
# Experiment-level switches consumed by the cells below.
pr_number = 1  # selects the /kaggle/input/myproject-pr-XXXX directory that `src` is copied from
debug = True  # initial value of config["debug"] (forced to False on Kaggle)
exp = "exp050"  # used for the output directory, the W&B run name and the dataset title
data_exp = "exp050"  # experiment directory whose cached train.csv may be reused
notes = "主鎖のみの情報を追加"  # free-text note logged to W&B
select_feature = True  # enables the per-fold feature selection in the training loop

In [20]:
%autoreload 2

if is_kaggle_notebook:
    module_path = f"/kaggle/input/myproject-pr-{pr_number:04}"
    !mkdir src
    !cp -r $module_path/* src/
    src_path = "./"
else:
    src_path = "../"

sys.path.append(src_path)

from src.data import load_data, add_descriptors, add_descriptors_mordred, add_external_data, make_smile_canonical, add_maccs, add_augumented_gmm, add_graph_features, add_count_atoms
from src.model import train_lgb_for_target, save_lgb_model
from src.utils import NULL_FOR_SUBMISSION, generate_scaffold, score, add_scaffold_kfold, scaffold_cv_split, get_useless_cols, extract_main_chain_smiles_from_star
from src.utils.upload_kaggle_dataset import (
    create_kaggle_dataset_metadata,
    upload_kaggle_dataset,
)

In [21]:
model_name = "lgb"

# Run-level settings; this dict is also handed to wandb.init as the run config.
config = dict(
    debug=debug,
    n_splits=3,
    num_epochs=1000,
    batch_size=128,
    drop_ratio=0.5,
    force_update_train=True,
    augumented_gmm=False,
    is_complement=True,
    remove_external_cv=True,
)

dataset_title = f"model-{exp}"
dataset_id = f"koya346/{dataset_title}"

# Debug mode is never used on Kaggle.
if is_kaggle_notebook:
    config["debug"] = False

# Debug runs use fewer splits and epochs.
if config["debug"]:
    config.update(n_splits=2, num_epochs=10)

targets = ["Tg", "FFV", "Tc", "Density", "Rg"]
# Names of the columns that keep a copy of each target's original values.
org_target_cols = ["org_" + name for name in targets]

# TODO: define the training parameters
params = dict(
    objective="regression",
    metric="mae",
    verbosity=-1,
    learning_rate=0.01,
    max_depth=7,
    seed=42,
    subsample=0.7,
    colsample_bytree=0.6,
    num_boost_round=20000,
)

# Record the training parameters alongside the run-level settings.
config.update(params)

# Independent copy of the parameters with the boosting rounds capped at 300.
pre_params = {**copy.deepcopy(params), "num_boost_round": 300}


In [52]:
# Name the W&B run after the experiment and model; debug runs get a "_debug" suffix.
wandb_name = f"{exp}_{model_name}_debug" if config["debug"] else f"{exp}_{model_name}"
wandb.init(project="opp2025", name=wandb_name, config=config)
wandb.log({"Notes": notes})

# ---------------------------
# Main processing
# ---------------------------
# NOTE(review): this root is an absolute, user-specific path and is used even when
# is_kaggle_notebook is True — confirm it is valid in every environment this runs in.
outputs_root = Path("/home/kouya-takahashi/kaggle/opp2025/outputs")
# Debug runs write into a separate "debug" sub-directory of the experiment.
output_path = (outputs_root / exp / "debug") if config["debug"] else (outputs_root / exp)

# Directory for the per-fold models; exist_ok makes the cell safely re-runnable
# without a separate existence check.
model_output_path = output_path / "model_cv"
os.makedirs(model_output_path, exist_ok=True)

if is_kaggle_notebook:
    # kaggle notebook
    data_dir = Path("/kaggle/input")
else:
    # local
    data_dir = Path("/home/kouya-takahashi/kaggle/opp2025/data/raw")


# NOTE(review): the cache is read from outputs/<data_exp> but written to `output_path`
# (outputs/<exp>, or outputs/<exp>/debug in debug mode) — confirm this asymmetry is intended.
train_path = Path("/home/kouya-takahashi/kaggle/opp2025/outputs") / data_exp

# Label / identifier columns that must never be treated as model inputs.
# "main_SMILES" is listed explicitly: it holds SMILES strings, and leaving it in the
# feature set would get it coerced to NaN by the numeric conversion below.
non_feature_cols = targets + org_target_cols + [
    "id", "org_SMILES", "SMILES", "main_SMILES", "fold", "is_external",
]


def select_feature_columns(frame):
    """Return the columns of `frame` that are not label/identifier columns.

    The result is a pandas Index in the frame's column order. Columns named in
    `non_feature_cols` that are absent from `frame` are simply ignored.
    """
    return frame.columns[~frame.columns.isin(non_feature_cols)]


# Prepare the training data: reuse the cached file unless a rebuild is forced.
if os.path.exists(train_path / "train.csv") and not config["force_update_train"]:
    train = pd.read_csv(train_path / "train.csv")
else:
    train, _ = load_data(data_dir)
    # Keep a copy of the original target values so externally added rows can be
    # told apart later (per the original note, that check happens inside scaffold_cv_split).
    for target in targets:
        train[f"org_{target}"] = train[target]

    # Keep the raw SMILES and replace the working column with its canonical form.
    train["org_SMILES"] = train["SMILES"]
    train["SMILES"] = train["SMILES"].apply(make_smile_canonical)
    if config["debug"]:
        # Take the first 30 rows that have a non-null value for each target.
        # A row with several targets present can therefore appear more than once.
        tmp_dfs = []
        for target in targets:
            cond = train[target].notnull()
            tmp_dfs.append(train[cond].iloc[:30])
        train = pd.concat(tmp_dfs).reset_index(drop=True)
    else:
        print(train.shape)
        # External data sources: file path, the target column they provide and,
        # where needed, the column rename that maps them onto that target.
        external_data_dict = [
            {
                "ex_path": data_dir / "neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv",
                "col": "Tg",
            },
            {
                "ex_path": data_dir / "neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv",
                "col": "Tc",
                "rename_d": {"TC_mean": "Tc"},
            },
            {
                "ex_path": data_dir / "neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv",
                "col": "FFV",
            },
            {
                "ex_path": data_dir / "tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv",
                "col": "Tg",
            },
            {
                "ex_path": data_dir / "smiles-extra-data/data_dnst1.xlsx",
                "col": "Density",
                "rename_d": {"density(g/cm3)": "Density"},
            },
            {
                "ex_path": data_dir / "smiles-extra-data/data_tg3.xlsx",
                "col": "Tg",
                "rename_d": {"Tg [K]": "Tg"},
            },
            {
                "ex_path": data_dir / "smiles-extra-data/JCIM_sup_bigsmiles.csv",
                "col": "Tg",
                "rename_d": {"Tg (C)": "Tg"},
            },
        ]
        for d in external_data_dict:
            print(f"ex_path: {str(d['ex_path'])}")
            train = add_external_data(
                df=train,
                ex_path=d.get("ex_path"),
                col=d.get("col"),
                rename_d=d.get("rename_d"),
                is_complement=config["is_complement"]
            )
            print("after train.shape: ", train.shape)

    # Rows without an id are flagged as external; ids are then reassigned so that
    # every row has a unique one.
    train["is_external"] = train["id"].isnull()
    train["id"] = np.arange(len(train))

    # Extract the main chain of every SMILES; on failure fall back to the full
    # SMILES and count the failure instead of aborting the run.
    main_smiles = []
    cnt_error = 0
    for smiles in train["SMILES"].values:
        try:
            main_smiles.append(extract_main_chain_smiles_from_star(smiles))
        except Exception:
            cnt_error += 1
            main_smiles.append(smiles)
    print(f"cnt error extract main chain: {cnt_error}")

    train["main_SMILES"] = main_smiles

    # Generate the same feature families once from SMILES and once from main_SMILES.
    for smiles_col in ["SMILES", "main_SMILES"]:
        df = add_maccs(train[["id", smiles_col]], col=smiles_col)

        # RDKit descriptors and Morgan fingerprints (per the original note), then Mordred descriptors.
        df = add_descriptors(df, radius=2, fp_size=1024, col=smiles_col)
        df = add_descriptors_mordred(df, num_confs=10, ignore_3D=True, ignore_3d_stats=True, col=smiles_col)

        # Make column names unique: the 2nd, 3rd, ... occurrence of a name gets
        # the suffix _1, _2, ...
        new_cols = []
        seen = {}
        for col in df.columns:
            if col in seen:
                seen[col] += 1
                new_cols.append(f"{col}_{seen[col]}")
            else:
                seen[col] = 0
                new_cols.append(col)

        df.columns = new_cols

        # Graph features and atom counts.
        df = add_graph_features(df, col=smiles_col)
        df = add_count_atoms(df, col=smiles_col)

        # Columns that already exist in `train` receive the suffix _<smiles_col>.
        train = train.merge(df.drop([smiles_col], axis=1), how="left", on="id", suffixes=("", f"_{smiles_col}"))
        print(train.shape)

    # Force every feature column to numeric; unparsable values become NaN.
    features = select_feature_columns(train)
    for col in features:
        if train[col].dtype == "object":
            train[col] = pd.to_numeric(train[col], errors="coerce")
    useless_cols = get_useless_cols(train[features])

    train = train.drop(useless_cols, axis=1)

    os.makedirs(output_path, exist_ok=True)
    train.to_csv(output_path / "train.csv", index=False)
    print("Saved train.csv")

# Remove uninformative bit columns: 0/1 columns whose share of ones is
# outside the open interval (0.01, 0.99).
bit_cols = []
remove_cols = []

for col in select_feature_columns(train):
    values = train[col].unique()
    if len(values) != 2:
        continue
    # Compare as a set: unique() returns values in order of appearance, so an
    # element-wise comparison with [0, 1] would miss columns whose first value is 1.
    if set(values) == {0, 1}:
        bit_cols.append(col)
        p = train[col].mean()
        if not (p > 0.01 and p < 0.99):
            remove_cols.append(col)
print("bit cols: ", len(bit_cols))
print("remove cols: ", len(remove_cols))

train = train.drop(remove_cols, axis=1)

# Attach the precomputed fold assignment.
# NOTE(review): a left merge duplicates train rows if folds.csv has repeated SMILES — confirm it is unique per SMILES.
folds = pd.read_csv("/home/kouya-takahashi/kaggle/opp2025/data/preprocess/fold/folds.csv")
train = train.merge(folds[["SMILES", "fold"]], how="left", on="SMILES")

features = select_feature_columns(train)
print(len(features))
# Per-target OOF frames and loss bookkeeping, filled by the loop below.
oof_dfs = []

loss_table_wandb = wandb.Table(["exp", "model_name", "fold", "target", "mae", "mse"])
all_loss_tables = []
mae_dict = {}
all_models = {}

for target_col in targets:
    loss_tables = []
    print(f"\n=== Training for target: {target_col} ===")

    # Only rows that have a value for this target take part.
    df_train = train[train[target_col].notnull()].reset_index(drop=True)
    X = df_train[features]
    y = df_train[target_col]
    # Out-of-fold predictions; rows never used for validation stay NaN.
    oof = np.full(len(X), np.nan, dtype=float)

    models = []

    for fold, tr_idx, val_idx in scaffold_cv_split(df_train, target=target_col, n_splits=config["n_splits"], remove_external=config["remove_external_cv"]):
        if len(tr_idx) == 0 or len(val_idx) == 0:
            print(f"Skip fold... tr_idx: {len(tr_idx)}, val_idx: {len(val_idx)}")
            continue
        loss_table = {}
        print(f"fold: {fold + 1}")

        if select_feature:
            # Stability selection: a feature is kept when it ranks in the top 30%
            # by gain in at least 2 of 3 models fitted on random 90% subsamples
            # of this fold's training rows.
            hits = np.zeros(X.shape[1], dtype=int)
            # Positions (into X) of this fold's training rows.
            tr_positions = np.asarray(tr_idx)

            rs = ShuffleSplit(n_splits=3, train_size=0.9, random_state=42)
            for tr_in, va_in in rs.split(tr_positions):
                # split() yields positions *within* tr_positions; map them back to
                # row positions of X so the selection models never see rows outside
                # the fold's training part (in particular not its validation rows).
                inner_tr, inner_va = tr_positions[tr_in], tr_positions[va_in]
                X_tr_in, X_va_in = X.iloc[inner_tr], X.iloc[inner_va]
                y_tr_in, y_va_in = y.iloc[inner_tr], y.iloc[inner_va]
                dtrain_in = lgb.Dataset(X_tr_in, label=y_tr_in)
                dvalid_in = lgb.Dataset(X_va_in, label=y_va_in, reference=dtrain_in)
                # NOTE(review): `pre_params` (num_boost_round=300) is defined in the config
                # cell but never used; confirm whether these selection models were meant
                # to use it instead of `params`.
                model = lgb.train(
                    params,
                    dtrain_in,
                    valid_sets=[dtrain_in, dvalid_in],
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=50),
                        lgb.log_evaluation(200)
                    ]
                )
                # Feature positions ordered by descending gain importance.
                rank = model.feature_importance("gain").argsort()[::-1]
                topk = set(rank[:int(0.3 * len(rank))])
                hits[list(topk)] += 1
            keep = hits >= 2
            use_features = features[keep]
        else:
            use_features = features
        print(len(use_features))

        # Train on the selected features, using the fold's validation rows for early stopping.
        X_train, X_val = X.iloc[tr_idx][use_features], X.iloc[val_idx][use_features]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        if config["augumented_gmm"]:
            X_train, y_train = add_augumented_gmm(X_train, y_train)

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dval],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(200)
            ]
        )

        save_lgb_model(model, str(model_output_path / f"model_{target_col}_{fold}.txt"))

        pred = model.predict(X_val, num_iteration=model.best_iteration)
        oof[val_idx] = pred

        mse = mean_squared_error(y_val, pred)
        mae = mean_absolute_error(y_val, pred)
        print(f"fold: {fold}, target: {target_col}, mae: {mae}")
        loss_table["fold"] = fold
        loss_table["target"] = target_col
        loss_table["mae"] = mae
        loss_table["mse"] = mse

        loss_tables.append(loss_table)
        models.append(model)

    # Score only rows that received an OOF prediction and carry an original
    # (non-external) target value.
    cond = (~np.isnan(oof)) & (df_train[f"org_{target_col}"].notnull())
    y_true = y[cond]
    y_pred = oof[cond]
    score_mse = mean_squared_error(y_true, y_pred)
    score_mae = mean_absolute_error(y_true, y_pred)
    # The value is the mean squared error (not its root), so label it as such.
    print(f"MSE for {target_col}: {score_mse:.4f}")
    print(f"MAE for {target_col}: {score_mae:.4f}")
    mae_dict[target_col] = score_mae

    for loss_table in loss_tables:
        loss_table_wandb.add_data(exp, model_name, loss_table["fold"], loss_table["target"], loss_table["mae"], loss_table["mse"])
    all_loss_tables += loss_tables

    oof_df = pd.DataFrame({
        "id": df_train["id"].values,
        target_col: oof
    })
    oof_dfs.append(oof_df)

    all_models[target_col] = models

# Upload the per-fold loss table.
wandb.log({"fold_target_losses": loss_table_wandb})

# Record the OOF MAE of every target under "<target>_mean_mae".
for target in targets:
    wandb.log({f"{target}_mean_mae": np.mean(mae_dict[target])})

# Compute the CV score on the non-external rows only.
cond = ~train["is_external"]

# Start from the original targets, the id and the un-canonicalised SMILES
# (exposed under the name "SMILES").
oof_df = train.loc[cond, org_target_cols + ["id", "org_SMILES"]].rename(
    columns={"org_SMILES": "SMILES"}
)

# Attach each target's OOF predictions; rows without a prediction stay NaN.
for i_oof in oof_dfs:
    oof_df = oof_df.merge(i_oof, on="id", how="left")

oof_df.to_csv(output_path / "oof.csv", index=False)

# Ground truth: the original target values, renamed back to the plain target names.
solution = train.loc[cond, ["id"] + org_target_cols].rename(
    columns=dict(zip(org_target_cols, targets))
)

# Evaluation
final_score = score(solution=solution, submission=oof_df)
print(f"\n📊 Final OOF Score (wMAE): {final_score:.6f}")
wandb.log({"wMAE": final_score})

# Store the best_iteration of every fold model per target; according to the
# original note it is not recorded in the saved model files.
best_iterations = {
    target: [booster.best_iteration for booster in all_models[target]]
    for target in targets
}
print(best_iterations)

with open(output_path / "best_iterations.json", "w") as f:
    f.write(json.dumps(best_iterations))

0,1
Notes,主鎖のみの情報を追加


cnt error extract main chain: 0


Generating maccs:   0%|          | 0/150 [00:00<?, ?it/s]

Generating descriptors:   0%|          | 0/150 [00:00<?, ?it/s]

mordred desc:   0%|          | 0/150 [00:00<?, ?it/s]

(150, 3038)


Generating maccs:   0%|          | 0/150 [00:00<?, ?it/s]

Generating descriptors:   0%|          | 0/150 [00:00<?, ?it/s]

mordred desc:   0%|          | 0/150 [00:00<?, ?it/s]

(150, 6061)
Unique=1 col: ['maccs_0', 'maccs_1', 'maccs_2', 'maccs_3', 'maccs_4', 'maccs_5', 'maccs_6', 'maccs_7', 'maccs_8', 'maccs_9', 'maccs_10', 'maccs_11', 'maccs_12', 'maccs_13', 'maccs_15', 'maccs_16', 'maccs_18', 'maccs_19', 'maccs_20', 'maccs_22', 'maccs_25', 'maccs_26', 'maccs_27', 'maccs_28', 'maccs_30', 'maccs_31', 'maccs_34', 'maccs_35', 'maccs_44', 'maccs_68', 'maccs_166', 'NumRadicalElectrons', 'PEOE_VSA11', 'PEOE_VSA4', 'PEOE_VSA5', 'SMR_VSA8', 'SlogP_VSA9', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_C_S', 'fr_HOCCN', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_SH', 'fr_aldehyde', 'fr_amidine', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzodiazepine', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_guanido', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_isocyan', 'fr_isothiocyan', 'fr_lactam', 'fr_lactone', 'fr_morpholine', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_quatN', 'fr