In [4]:
import os

import warnings
from functools import reduce
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from rdkit import Chem, rdBase
from rdkit.Chem import AllChem, Descriptors, MolToSmiles, rdMolDescriptors
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

rdBase.DisableLog("rdApp.warning")
warnings.filterwarnings("ignore")

In [5]:
# Ë©ï‰æ°ÊåáÊ®ô
# https://www.kaggle.com/code/metric/open-polymer-2025
class ParticipantVisibleError(Exception):
    pass


# These values are from the train data.
MINMAX_DICT = {
    "Tg": [-148.0297376, 472.25],
    "FFV": [0.2269924, 0.77709707],
    "Tc": [0.0465, 0.524],
    "Density": [0.748691234, 1.840998909],
    "Rg": [9.7283551, 34.672905605],
}
NULL_FOR_SUBMISSION = -9999


def scaling_error(labels, preds, property):
    error = np.abs(labels - preds)
    min_val, max_val = MINMAX_DICT[property]
    label_range = max_val - min_val
    return np.mean(error / label_range)


def get_property_weights(labels):
    property_weight = []
    for property in MINMAX_DICT.keys():
        valid_num = np.sum(labels[property] != NULL_FOR_SUBMISSION)
        property_weight.append(valid_num)
    property_weight = np.array(property_weight)
    property_weight = np.sqrt(1 / property_weight)
    return (property_weight / np.sum(property_weight)) * len(property_weight)


def score(
    solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str
) -> float:
    """
    Compute weighted Mean Absolute Error (wMAE) for the Open Polymer challenge.

    Expected input:
      - solution and submission as pandas.DataFrame
      - Column 'id': unique identifier for each sequence
      - Columns 'Tg', 'FFV', 'Tc', 'Density', 'Rg' as the predicted targets

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> solution = pd.DataFrame({'id': range(4), 'Tg': [0.2]*4, 'FFV': [0.2]*4, 'Tc': [0.2]*4, 'Density': [0.2]*4, 'Rg': [0.2]*4})
    >>> submission = pd.DataFrame({'id': range(4), 'Tg': [0.5]*4, 'FFV': [0.5]*4, 'Tc': [0.5]*4, 'Density': [0.5]*4, 'Rg': [0.5]*4})
    >>> round(score(solution, submission, row_id_column_name=row_id_column_name), 4)
    0.2922
    >>> submission = pd.DataFrame({'id': range(4), 'Tg': [0.2]*4, 'FFV': [0.2]*4, 'Tc': [0.2]*4, 'Density': [0.2]*4, 'Rg': [0.2]*4} )
    >>> score(solution, submission, row_id_column_name=row_id_column_name)
    0.0
    """
    chemical_properties = list(MINMAX_DICT.keys())
    property_maes = []
    property_weights = get_property_weights(solution[chemical_properties])
    for property in chemical_properties:
        is_labeled = solution[property] != NULL_FOR_SUBMISSION
        property_maes.append(
            scaling_error(
                solution.loc[is_labeled, property],
                submission.loc[is_labeled, property],
                property,
            )
        )

    if len(property_maes) == 0:
        raise RuntimeError("No labels")
    return float(np.average(property_maes, weights=property_weights))

In [52]:
# ---------------------------
# ÂàÜÂ≠êË®òËø∞Â≠ê„ÇíÁîüÊàê„Åô„ÇãÈñ¢Êï∞
# ---------------------------
def compute_all_descriptors(mol):
    if mol is None:
        return [None] * len(desc_names)
    return [desc[1](mol) for desc in Descriptors.descList]


descriptor_names = [desc[0] for desc in Descriptors.descList]


def get_mfp(mol, radius=2, fp_size=1024):
    if mol is None:
        return np.zeros((1, fp_size))
    mfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=fp_size)
    return np.array(list(mfp.ToBitString())).astype(int)


def add_descriptors(df):
    descs = []
    radius = 2
    fp_size = 1024
    mfp_vec = np.empty((len(df), fp_size))

    for idx, smi in enumerate(tqdm(df["SMILES"], desc="Generating descriptors")):
        mol = Chem.MolFromSmiles(smi)
        descs.append(compute_all_descriptors(mol))
        mfp_vec[idx] = get_mfp(mol=mol, radius=radius, fp_size=fp_size)

    desc_df = pd.DataFrame(descs)
    mfp_df = pd.DataFrame(mfp_vec)
    mfp_df.columns = [f"mfp_vec{i}" for i in range(fp_size)]
    df[descriptor_names] = desc_df
    df = pd.concat([df, mfp_df], axis=1).reset_index(drop=True)
    return df


def generate_random_smiles(smiles: str, num_augments: int = 3) -> list:
    """
    Âêå„ÅòÊÑèÂë≥„ÅßË°®Ë®ò„ÅåÁï∞„Å™„Çã SMILES „ÇíÁîüÊàê
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []
    return [MolToSmiles(mol, doRandom=True) for _ in range(num_augments)]


def augment_smiles_df(df: pd.DataFrame, num_augments: int = 3) -> pd.DataFrame:
    augmented_rows = []

    for _, row in tqdm(
        df.iterrows(), total=len(df), desc=f"Augmenting SMILES x{num_augments}"
    ):
        smiles = row["SMILES"]
        augmented_smiles = generate_random_smiles(smiles, num_augments=num_augments)

        for aug_smi in augmented_smiles:
            new_row = row.copy()
            new_row["SMILES"] = aug_smi
            augmented_rows.append(new_row)

    # ÂÖÉ„ÅÆ„Éá„Éº„Çø„Å®ÁµêÂêà
    augmented_df = pd.DataFrame(augmented_rows)
    return pd.concat([df, augmented_df], ignore_index=True)


# ---------------------------
# „Éá„Éº„ÇøË™≠„ÅøËæº„Åø„Å®ÁâπÂæ¥ÈáèÁîüÊàê
# ---------------------------
def load_data(data_dir):
    train = pd.read_csv(data_dir / "neurips-open-polymer-prediction-2025/train.csv")
    test = pd.read_csv(data_dir / "neurips-open-polymer-prediction-2025/test.csv")

    return train, test


def add_external_data(train, data_path):
    """
    Â≠¶Áøí„Éá„Éº„Çø„Å´Â§ñÈÉ®„Éá„Éº„Çø„ÇíÁ™ÅÂêà„Åô„Çã
    """
    ex_tg_df = pd.read_csv(
        data_path / "smiles-tg/Tg_SMILES_class_pid_polyinfo_median.csv"
    )[["SMILES", "Tg"]]
    ex_tc_df = pd.read_csv(data_path / "tc-smiles/Tc_SMILES.csv")
    ex_tc_df.columns = ["Tc", "SMILES"]

    # „Çπ„Ç≥„Ç¢ÊÇ™Âåñ
    # train_merged = pd.concat([train, ex_tg_df, ex_tc_df])

    # ex „Éá„Éº„Çø„Çí SMILES ÊØé„Å´‰∏ÄÊÑèÂåñ
    ex_tg_df = ex_tg_df.groupby("SMILES")["Tg"].min().reset_index()
    ex_tc_df = ex_tc_df.groupby("SMILES")["Tc"].min().reset_index()

    # Tg, Tc Â§ñÈÉ®„Éá„Éº„Çø
    # train „Å´Âê´„Åæ„Çå„Å¶„ÅÑ„Çã SMILES „Åå„ÅÇ„Çå„Å∞„ÄÅTg, Tc „Çí‰∏äÊõ∏„Åç„Åô„Çã
    # TODO: test „ÅßÂê´„Åæ„Çå„Å¶„Åü„Çâ‰Ωø„ÅÜÔºü
    train_merged = train.merge(ex_tg_df, how="left", on="SMILES", suffixes=("", "_ex"))
    train_merged = train_merged.merge(
        ex_tc_df, how="left", on="SMILES", suffixes=("", "_ex")
    )
    # train_merged["org_Tg"] = train_merged["Tg"]
    # train_merged["org_Tc"] = train_merged["Tc"]

    # train_merged["Tg"] = np.where(~train_merged["Tg_ex"].isnull(), train_merged["Tg_ex"], train_merged["Tg"])
    # train_merged["Tc"] = np.where(~train_merged["Tc_ex"].isnull(), train_merged["Tc_ex"], train_merged["Tc"])
    train_merged["Tg"] = np.where(
        ~train_merged["Tg"].isnull(), train_merged["Tg"], train_merged["Tg_ex"]
    )
    train_merged["Tc"] = np.where(
        ~train_merged["Tc"].isnull(), train_merged["Tc"], train_merged["Tc_ex"]
    )

    # train „Å´Âê´„Åæ„Çå„Å¶„ÅÑ„Å™„Åë„Çå„Å∞ concat „ÅßËøΩÂä†„Åô„Çã
    cond_include_ex_tg = train["SMILES"].isin(ex_tg_df["SMILES"].values)
    cond_include_ex_tc = train["SMILES"].isin(ex_tc_df["SMILES"].values)

    train_merged = pd.concat([train_merged, ex_tg_df[~cond_include_ex_tg]])
    train_merged = pd.concat([train_merged, ex_tc_df[~cond_include_ex_tc]])
    train_merged.drop(["Tg_ex", "Tc_ex"], axis=1, inplace=True)

    return train_merged.reset_index(drop=True)


def add_external_data2(train, data_path):
    # https://www.kaggle.com/datasets/minatoyukinaxlisa/tc-smiles
    data_tc = pd.read_csv(data_path / "tc-smiles/Tc_SMILES.csv")
    data_tc = data_tc.rename(columns={"TC_mean": "Tc"})

    # https://springernature.figshare.com/articles/dataset/dataset_with_glass_transition_temperature/24219958?file=42507037
    data_tg2 = pd.read_csv(
        data_path / "smiles-extra-data/JCIM_sup_bigsmiles.csv",
        usecols=["SMILES", "Tg (C)"],
    )
    data_tg2 = data_tg2.rename(columns={"Tg (C)": "Tg"})

    # https://www.sciencedirect.com/science/article/pii/S2590159123000377#ec0005
    data_tg3 = pd.read_excel(data_path / "smiles-extra-data/data_tg3.xlsx")
    data_tg3 = data_tg3.rename(columns={"Tg [K]": "Tg"})
    data_tg3["Tg"] = data_tg3["Tg"] - 273.15

    # https://github.com/Duke-MatSci/ChemProps
    data_dnst = pd.read_excel(data_path / "smiles-extra-data/data_dnst1.xlsx")
    data_dnst = data_dnst.rename(columns={"density(g/cm3)": "Density"})[
        ["SMILES", "Density"]
    ]
    data_dnst["SMILES"] = data_dnst["SMILES"].apply(lambda s: make_smile_canonical(s))
    data_dnst = data_dnst[
        (data_dnst["SMILES"].notnull())
        & (data_dnst["Density"].notnull())
        & (data_dnst["Density"] != "nylon")
    ]
    data_dnst["Density"] = data_dnst["Density"].astype("float64")
    data_dnst["Density"] -= 0.118

    def add_extra_data(df_train, df_extra, target):
        n_samples_before = len(df_train[df_train[target].notnull()])

        df_extra["SMILES"] = df_extra["SMILES"].apply(lambda s: make_smile_canonical(s))
        df_extra = df_extra.groupby("SMILES", as_index=False)[target].mean()
        cross_smiles = set(df_extra["SMILES"]) & set(df_train["SMILES"])
        unique_smiles_extra = set(df_extra["SMILES"]) - set(df_train["SMILES"])

        # Make priority target value from competition's df
        for smile in df_train[df_train[target].notnull()]["SMILES"].tolist():
            if smile in cross_smiles:
                cross_smiles.remove(smile)

        # Imput missing values for competition's SMILES
        for smile in cross_smiles:
            df_train.loc[df_train["SMILES"] == smile, target] = df_extra[
                df_extra["SMILES"] == smile
            ][target].values[0]

        df_train = pd.concat(
            [df_train, df_extra[df_extra["SMILES"].isin(unique_smiles_extra)]], axis=0
        ).reset_index(drop=True)

        n_samples_after = len(df_train[df_train[target].notnull()])
        print(
            f'\nFor target "{target}" added {n_samples_after-n_samples_before} new samples!'
        )
        print(f"New unique SMILES: {len(unique_smiles_extra)}")
        return df_train

    train = add_extra_data(train, data_tc, "Tc")
    train = add_extra_data(train, data_tg2, "Tg")
    train = add_extra_data(train, data_tg3, "Tg")
    train = add_extra_data(train, data_dnst, "Density")
    return train


def preprocess(df):
    f32_max = np.finfo(np.float32).max
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df[df > f32_max] = np.nan
    df[df < -f32_max] = np.nan

    # impute_values = df.mean()
    # df.fillna(impute_values, inplace=True)

    return df


def make_smile_canonical(smile): # To avoid duplicates, for example: canonical '*C=C(*)C' == '*C(=C*)C'
    try:
        mol = Chem.MolFromSmiles(smile)
        canon_smile = Chem.MolToSmiles(mol, canonical=True)
        return canon_smile
    except:
        return np.nan
        
# ---------------------------
# LightGBM „É¢„Éá„É´„ÅÆÂ≠¶Áøí
# ---------------------------
def train_lgb_for_target(train, test, target_col, features, n_splits=5):
    print(f"\n=== Training for target: {target_col} ===")

    df_train = train[train[target_col].notnull()].reset_index(drop=True)
    df_test = test.copy()

    X = df_train[features]
    y = df_train[target_col]
    X_test = df_test[features]

    # X = preprocess(X)
    # X_test = preprocess(X_test)

    preds_test = np.zeros(len(X_test))
    oof = np.zeros(len(X))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        # Scale features
        # scaler = StandardScaler()
        # X_train = scaler.fit_transform(X_train)
        # X_val = scaler.transform(X_val)
        # X_test = scaler.transform(X_test)

        dtrain = lgb.Dataset(X_train, label=y_train)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
        params = {
            "objective": "regression",
            "metric": "mae",
            "verbosity": -1,
            "learning_rate": 0.01,
            "max_depth": 7,
            "seed": 42,
            "subsample": 0.7,
            "colsample_bytree": 0.6,
            "num_boost_round": 10_000,
        }

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dval],
            callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(200)],
        )

        oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
        preds_test += (
            model.predict(X_test, num_iteration=model.best_iteration) / n_splits
        )

    score_mse = mean_squared_error(y, oof)
    score_mae = mean_absolute_error(y, oof)
    print(f"RMSE for {target_col}: {score_mse:.4f}")
    print(f"MAE for {target_col}: {score_mae:.4f}")

    return preds_test, oof, df_train["id"].values

useless_cols = [    
    # Nan data
    'BCUT2D_MWHI',
    'BCUT2D_MWLOW',
    'BCUT2D_CHGHI',
    'BCUT2D_CHGLO',
    'BCUT2D_LOGPHI',
    'BCUT2D_LOGPLOW',
    'BCUT2D_MRHI',
    'BCUT2D_MRLOW',

    # Constant data
    'NumRadicalElectrons',
    'SMR_VSA8',
    'SlogP_VSA9',
    'fr_barbitur',
    'fr_benzodiazepine',
    'fr_dihydropyridine',
    'fr_epoxide',
    'fr_isothiocyan',
    'fr_lactam',
    'fr_nitroso',
    'fr_prisulfonamd',
    'fr_thiocyan',

    # High correlated data >0.95
    'MaxEStateIndex',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumValenceElectrons',
    'Chi0',
    'Chi0n',
    'Chi0v',
    'Chi1',
    'Chi1n',
    'Chi1v',
    'Chi2n',
    'Kappa1',
    'LabuteASA',
    'HeavyAtomCount',
    'MolMR',
    'Chi3n',
    'BertzCT',
    'Chi2v',
    'Chi4n',
    'HallKierAlpha',
    'Chi3v',
    'Chi4v',
    'MinAbsPartialCharge',
    'MinPartialCharge',
    'MaxAbsPartialCharge',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'Phi',
    'Kappa3',
    'fr_nitrile',
    'SlogP_VSA6',
    'NumAromaticCarbocycles',
    'NumAromaticRings',
    'fr_benzene',
    'VSA_EState6',
    'NOCount',
    'fr_C_O',
    'fr_C_O_noCOO',
    'NumHDonors',
    'fr_amide',
    'fr_Nhpyrrole',
    'fr_phenol',
    'fr_phenol_noOrthoHbond',
    'fr_COO2',
    'fr_halogen',
    'fr_diazo',
    'fr_nitro_arom',
    'fr_phos_ester'
]

# ---------------------------
# „É°„Ç§„É≥Âá¶ÁêÜ
# ---------------------------
exp = "exp011"
debug = False
n_splits = 5 if not debug else 2

if debug:
    output_path = Path("../tmp") / exp
else:
    output_path = Path("../outputs") / exp

is_kaggle_notebook = os.path.exists("/kaggle/input")
if is_kaggle_notebook:
    # kaggle notebook
    data_dir = Path("/kaggle/input")
    debug = False
else:
    # local
    data_dir = Path("../data/raw")

if is_kaggle_notebook:
    train, test = load_data(data_dir)

    train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
    test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))
    train = add_external_data2(train, data_dir)
    train = add_descriptors(train)
    test = add_descriptors(test)
else:
    train, test = load_data(data_dir)

    if os.path.exists(output_path / "train.csv"):
        train = pd.read_csv(output_path / "train.csv")
    else:
        train['SMILES'] = train['SMILES'].apply(lambda s: make_smile_canonical(s))
        train = add_external_data2(train, data_dir)
        train = add_descriptors(train)

        os.makedirs(output_path, exist_ok=True)
        train.to_csv(output_path / "train.csv", index=False)
        print("Saved train.csv")
        
    test['SMILES'] = test['SMILES'].apply(lambda s: make_smile_canonical(s))
    test = add_descriptors(test)

# „É©„É≥„ÉÄ„É† SMILES „Çí3ÂÄç„Å´Êã°ÂºµÔºà= ÂÖÉ„ÅÆ + 3ÂÄç = 4ÂÄç„ÅÆ„Éá„Éº„ÇøÈáèÔºâ
# train = augment_smiles_df(train, num_augments=3)
# train["org_id"] = train["id"]
train["id"] = np.arange(len(train))
targets = ["Tg", "FFV", "Tc", "Density", "Rg"]

oof_dfs = []

all_features = [col for col in train.columns if col not in ["id", "SMILES"] + targets + useless_cols]
features = {}
for target in targets:
    t_features = []
    for col in all_features:
        if train[train[target].notnull()][col].nunique() == 1:
            continue
        t_features.append(col)
    features[target] = t_features

for idx, target in enumerate(targets):
    preds, oof, ids = train_lgb_for_target(train, test, target, features[target], n_splits)
    test[target] = preds

    oof_df = pd.DataFrame({"id": ids, target: oof})
    oof_dfs.append(oof_df)

if is_kaggle_notebook:
    # Â§ñÈÉ®„Éá„Éº„Çø„Å´ test „Éá„Éº„Çø„ÅåÂê´„Åæ„Çå„Å¶„ÅÑ„Åü„Çâ„ÄÅ„Åù„Çå„Çí‰Ωø„ÅÜÔºàLB 0.01 „Åè„Çâ„ÅÑÂêë‰∏äÔºâ
    for target in targets:
        for s in train[train[target].notnull()]['SMILES']:
            if s in test['SMILES'].tolist():
                test.loc[test['SMILES']==s, target] = train[train['SMILES']==s][target].values[0]
    
    test[["id"] + targets].to_csv("submission.csv", index=False)
    print("‚úÖ submission saved to submission.csv")
else:
    # CV Ë®àÁÆó
    oof_df = pd.DataFrame()
    oof_df["id"] = train["id"]
    for i_oof in oof_dfs:
        oof_df = oof_df.merge(i_oof, on="id", how="left")
    solution = train[["id"] + targets].copy()
    # solution = solution.fillna(NULL_FOR_SUBMISSION)
    # oof_df = oof_df.fillna(NULL_FOR_SUBMISSION)

    # Ë©ï‰æ°
    final_score = score(solution=solution, submission=oof_df, row_id_column_name="id")
    print(f"\nüìä Final OOF Score (wMAE): {final_score:.6f}")

Generating descriptors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 61.69it/s]



=== Training for target: Tg ===
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 31.1379	valid_1's l1: 43.0041
[400]	training's l1: 20.4549	valid_1's l1: 37.5619
[600]	training's l1: 17.3725	valid_1's l1: 36.9699
Early stopping, best iteration is:
[637]	training's l1: 16.8384	valid_1's l1: 36.8987
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 31.2702	valid_1's l1: 40.0346
[400]	training's l1: 20.0924	valid_1's l1: 36.2922
[600]	training's l1: 16.8349	valid_1's l1: 35.7204
Early stopping, best iteration is:
[614]	training's l1: 16.7055	valid_1's l1: 35.7009
Training until validation scores don't improve for 50 rounds
[200]	training's l1: 31.3675	valid_1's l1: 40.5559
[400]	training's l1: 21.0025	valid_1's l1: 36.0889
[600]	training's l1: 17.7453	valid_1's l1: 35.5029
[800]	training's l1: 15.5776	valid_1's l1: 35.258
Early stopping, best iteration is:
[880]	training's l1: 14.7937	valid_1's l1: 35.1438
Training until va

### 7/1 10:00
üìä Final OOF Score (wMAE): 0.047197

In [4]:
tmp = train.merge(oof_df, how="left", on="id", suffixes=("", "_oof"))
for col in targets:
    print(f"====================={col}=====================")
    tmp[f"{col}_absolute_error"] = np.abs(tmp[col] - tmp[f"{col}_oof"])
    cond = ~tmp[col].isnull()
    tmp2 = tmp.loc[cond, ["id", "SMILES", col, f"{col}_oof", f"{col}_absolute_error"]]
    display(tmp2.sort_values(f"{col}_absolute_error").head(10))
    display(tmp2.sort_values(f"{col}_absolute_error").tail(10))



Unnamed: 0,id,SMILES,Tg,Tg_oof,Tg_absolute_error
6084,6084,*CC(O)COc1ccc(C(C)(C)c2ccc(OCC(O)COc3c(C)cc(S(...,137.5,137.497224,0.002776
6967,6967,*CC(*)OC(=O)c1cccc(OC)c1,44.0,44.003061,0.003061
1726,1726,*c1ccc(Oc2c(C)cc(C(c3cccnc3)c3cc(C)c(Oc4ccc(N5...,217.0,217.010185,0.010185
5330,5330,*O[Si](C)(CCC(F)(F)F)CCC(F)(F)C(F)(F)C(F)(F)C(...,-25.0,-25.014546,0.014546
4493,4493,*C(=O)Nc1ccc(C(=O)c2ccc(NC(=O)c3ccc4c(c3)C(=O)...,212.0,212.028602,0.028602
3731,3731,*c1nc(-c2ccccc2)nc(N(C)CCCCCCN(*)C)n1,57.0,56.955632,0.044368
1665,1665,*C1CCC(CC2CCC(N3C(=O)C4C5C=CC(C6C(=O)N(*)C(=O)...,241.0,240.952732,0.047268
7843,7843,*CCOCCOc1cccc(NC(=O)c2cccc(C(=O)Nc3cccc(O*)c3)...,128.5,128.433899,0.066101
117,117,*c1ccc2c(c1)C(=O)N(c1cccc(C(=O)Nc3ccc(Oc4ccc(C...,266.0,265.927772,0.072228
5325,5325,*Oc1ccc(C(C)(C)c2ccc(Oc3nc(*)nc(OC)n3)cc2)cc1,121.209457,121.281848,0.072391


Unnamed: 0,id,SMILES,Tg,Tg_oof,Tg_absolute_error
9890,9890,*c1c(C)cc(C)c(N2C(=O)c3ccc(S(=O)(=O)c4ccc5c(c4...,418.0,144.887023,273.112977
9894,9894,*c1c(C)cc(Cc2cc(C)c(N3C(=O)c4ccc(Oc5c(C)cc(-c6...,420.0,145.373408,274.626592
9886,9886,*c1c(C)cc(-c2cc(C)c(N3C(=O)c4ccc(Oc5c(C)cc(Cc6...,420.0,144.617846,275.382154
9966,9966,*c1ccc(-c2ccc(-c3nc4ccc(-c5ccc6nc(*)c(-c7ccccc...,420.0,144.617846,275.382154
9374,9374,*O[Si](*)(C)CC,-133.0,143.798298,276.798298
10279,10279,*c1ccc(S(=O)(=O)c2ccc(-n3nc(-c4ccccc4)c4ccc(S(...,422.0,144.918799,277.081201
9994,9994,*c1ccc(-c2nc3cc(-c4ccc5oc(*)nc5c4)ccc3o2)cc1,430.0,144.887023,285.112977
7376,7376,*CC(CC)(CO)CO*,331.9,21.784434,310.115566
10081,10081,*c1ccc(N2C(=O)c3c(c(-c4ccccc4)c(-c4ccc(-c5c(-c...,466.0,144.918799,321.081201
3035,3035,*C(=O)Oc1ccc(OC(=O)c2ccc3c(c2)C(=O)N(c2ccc(N4C...,-71.681071,268.546038,340.227109




Unnamed: 0,id,SMILES,FFV,FFV_oof,FFV_absolute_error
7451,7451,*CCOCCOCCOc1cccc(NC(=O)c2cccc(C(=O)Nc3cccc(O*)...,0.337858,0.337858,6.823045e-07
2458,2458,*CCc1ccc(NC(=O)c2cccc(C(=O)Nc3ccc(CCOC(=O)c4cc...,0.343401,0.343399,1.807577e-06
2370,2370,*CC(*)C(=O)Oc1c(F)c(F)c(F)c(F)c1F,0.339735,0.339738,2.858998e-06
6378,6378,*Oc1ccc(NC(=O)CCCCCCCCCC(=O)Nc2ccc(*)cc2)cc1,0.349997,0.349993,3.805442e-06
5572,5572,*CCc1ccc(*)c(CC)c1,0.394865,0.394861,4.098588e-06
6029,6029,*c1ccc(Oc2ccc(Sc3ccc(Oc4ccc(N5C(=O)c6ccc(Oc7cc...,0.366622,0.366617,4.942816e-06
4639,4639,*Nc1ccc(-c2c3ccccc3c(-c3ccc(N*)cc3)c3ccccc23)cc1,0.358288,0.358293,5.054246e-06
7043,7043,*Nc1ccc2cccc3c2c1C(CC)=C(CC)[C@@H]3N*,0.352788,0.352795,6.9843e-06
1174,1174,*CC(=O)Nc1ccc(Oc2ccc(-c3ccc(Oc4ccc(NC(=O)CN5C(...,0.34625,0.346242,7.787461e-06
6251,6251,*Oc1ccc(S(=O)(=O)c2ccc(Oc3c(C)cc(-c4cc(C)c(*)c...,0.393397,0.393405,7.86231e-06


Unnamed: 0,id,SMILES,FFV,FFV_oof,FFV_absolute_error
4223,4223,*CC(*)CNc1ccc([N+](=O)[O-])cc1[N+](=O)[O-],0.484211,0.383736,0.100475
4565,4565,*CC(*)NC=O,0.274937,0.384265,0.109328
7551,7551,*CC(*)(C)c1nc(N(CCOCCOCCOC)CCOCCOCCOC)nc(N(CCO...,0.359632,0.47259,0.112958
6689,6689,*CC(*)(C)C(=O)OCCO[N+](=O)[O-],0.495595,0.336967,0.158628
633,633,*c1ccc(*)[nH]1,0.563897,0.367668,0.196229
2338,2338,*CC(*)C(=O)NCCC[N+](C)(C)C,0.687057,0.409319,0.277738
2626,2626,*CC(*)C(=O)N1CC[NH+](CC)CC1,0.650907,0.353361,0.297546
645,645,*CCCCCCCCCC[N+](*)(C)C,0.769315,0.439579,0.329736
6064,6064,*CCC1C[N+](C)(C)CC1*,0.777097,0.408812,0.368286
2966,2966,*CC(*)O[N+](=O)[O-],0.774066,0.330291,0.443775




Unnamed: 0,id,SMILES,Tc,Tc_oof,Tc_absolute_error
7181,7181,*CCCCCCCCCCSCCCCS*,0.282,0.282123,0.000123
1494,1494,*CCCCCCCCC(=O)NCCCCCCNC(=O)CCCCO*,0.3185,0.31836,0.00014
4911,4911,*CCC(C(=O)OCC)C(*)C(=O)OCC,0.2075,0.207654,0.000154
131,131,*/C(F)=C(\F)C(F)(C(*)(F)F)C(F)(F)F,0.102,0.101815,0.000185
7572,7572,*CCCCCCSCCCCCS*,0.241,0.241227,0.000227
6935,6935,*CC(*)c1ccc(CCCCCCCCCCCCCC)cc1,0.34,0.339768,0.000232
2860,2860,*CCc1ccc(CCNC(=O)CCCCCCCCCCCCCCCC(=O)N*)cc1,0.3515,0.351744,0.000244
511,511,*CC(*)(C)C,0.2125,0.212779,0.000279
10960,10960,*CCCCCCCCCCCCCCOC(=O)c1ccc(C(=O)NCCCCCNC(=O)c2...,0.258,0.257717,0.000283
11072,11072,*CCCCNC(=O)CC/C=C/CCC(=O)N*,0.258,0.257717,0.000283


Unnamed: 0,id,SMILES,Tc,Tc_oof,Tc_absolute_error
11080,11080,*CCCCOC(=O)CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,0.49475,0.25963,0.23512
5836,5836,*/C=C/c1cc(OCCCCCC)c(*)cc1OC,0.524,0.282577,0.241423
11223,11223,*c1ccc2c(c1)SC1=Nc3cc(-c4ccc5c(c4)N=C4Sc6cc(*)...,0.506,0.258633,0.247367
10931,10931,*CCCCCCCCCCCCCCCCCCCCOC(=O)CCCCCCCC(=O)O*,0.507,0.256747,0.250253
780,780,*c1ccc2c(c1)SC1=Nc3cc(-c4ccc5c(c4)N=C4Sc6cc(*)...,0.506,0.241195,0.264805
10492,10492,*/C=C/c1cc(OCCCCCC)c(*)cc1OC,0.524,0.257717,0.266283
11180,11180,*c1ccc(*)c2ccccc12,0.571,0.258633,0.312367
11226,11226,*c1ccc2ccc3c(*)cc(C#C)c4ccc1c2c34,0.582,0.256747,0.325253
11229,11229,*c1cccc2c(*)cccc12,0.685,0.255155,0.429845
11224,11224,*c1ccc2cc(*)ccc2c1,1.59,0.255155,1.334845




Unnamed: 0,id,SMILES,Density,Density_oof,Density_absolute_error
2949,2949,*CC(*)(C)C(=O)Oc1ccccc1,1.049796,1.049759,3.7e-05
1426,1426,*CCCCCNC(=O)CCCCCCCCCCCCC(=O)N*,0.933437,0.933486,4.9e-05
331,331,*CC(*)C(=O)Oc1ccc(C)cc1,1.051609,1.051555,5.5e-05
7636,7636,*CC(*)OC(=O)CCCCCCCCCCC,0.878322,0.878209,0.000113
5864,5864,*CCCCCC(*)CCCCCCCCCCCCCC,0.804432,0.804552,0.00012
3314,3314,*CC(*)(C)C(C)=O,0.887509,0.887363,0.000146
779,779,*CCCCCCCCCOC(=O)CCCCCCCC(=O)O*,0.94187,0.942046,0.000176
2882,2882,*C1CCC(*)C1,0.903469,0.903287,0.000182
6131,6131,*CCCCCCCCCCCCOC(=O)CCCCCCC(=O)O*,0.931066,0.931257,0.00019
4838,4838,*CC(*)(C)C(=O)OCCCCCCCCCCCC,0.876334,0.876123,0.000211


Unnamed: 0,id,SMILES,Density,Density_oof,Density_absolute_error
4442,4442,*C*,0.811251,1.061682,0.25043
1646,1646,*CC(*)(C)C(=O)OCCBr,1.475127,1.176734,0.298393
1857,1857,*CC(*)(C)C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(...,1.482039,1.08953,0.39251
2209,2209,*CC(*)(C)C(=O)NC(=O)OC(C)COc1c(Br)cc(S(=O)(=O)...,1.587308,1.181387,0.40592
918,918,*CC(*)(C)C(=O)OCCN(CC)S(=O)(=O)C(F)(F)C(F)(F)C...,1.499458,1.085635,0.413823
5096,5096,*CC(*)(C)C(=O)OC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,1.517768,1.099401,0.418367
2583,2583,*ON(C(F)(F)F)C(F)(F)C(*)(F)F,1.732296,1.292714,0.439582
1283,1283,*CC(*)(F)C(=O)OCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,1.608016,1.072822,0.535194
7805,7805,*SC(*)(F)F,1.840999,1.266041,0.574958
4778,4778,*CC(*)OC(=O)C1(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,0.811906,1.4976,0.685695




Unnamed: 0,id,SMILES,Rg,Rg_oof,Rg_absolute_error
7359,7359,*CCCCCCCCCCCCCNC(=O)CCCCCCCCCCCC(=O)N*,22.203731,22.19618,0.007551
7713,7713,*CC(*)C(=O)Oc1ccc(Cl)cc1,14.079623,14.090437,0.010814
6392,6392,*CC(*)c1ccc(COCCOCCCCCCCC)cc1,11.549205,11.535456,0.013749
3010,3010,*CCCCCCCCC(=O)NCCCCCOCCCCCNC(=O)CCCCO*,19.449975,19.464275,0.0143
6912,6912,*CC(*)OC(=O)CCCCCCCCCCCCCCC,12.089689,12.073633,0.016056
2500,2500,*CCCC(*)(C)C,15.783712,15.767582,0.01613
1715,1715,*CC(*)(C)C(=O)OCCOC,13.618715,13.638315,0.019599
3587,3587,*CC(*)C(=O)OCCCCCCOc1ccc(C(=O)Oc2ccc(C#N)cc2)cc1,13.394951,13.365481,0.02947
7218,7218,*c1cccc(*)c1,20.590231,20.623191,0.032959
64,64,*CC(*)C(=O)Oc1ccccc1,13.435339,13.401671,0.033668


Unnamed: 0,id,SMILES,Rg,Rg_oof,Rg_absolute_error
4611,4611,*c1cc(O)c(O)cc1*,13.851205,21.636435,7.78523
885,885,*C(=O)C(*)(C)C,25.849714,17.566739,8.282975
1281,1281,*c1ccc(-c2ccc(C3(*)CCCCC3)cc2)cc1,16.14335,24.435253,8.291903
690,690,*CCc1ccc(-c2ccc(*)cc2)cc1,30.036626,21.086579,8.950047
5624,5624,*Oc1ccc(-c2ccc(-c3cc(-c4ccccc4)c(-c4ccc(-c5ccc...,27.638529,17.892384,9.746145
5872,5872,*CCC1CCC(*)C1,25.786697,15.994365,9.792332
6866,6866,*c1c(-c2ccccc2)c(-c2ccccc2)c(*)c2cc(C3(c4ccc(C...,11.549762,22.029627,10.479866
1027,1027,*=C=C=C(Cn1c2ccccc2c2ccccc21)C(=*)Cn1c2ccccc2c...,12.215031,23.106943,10.891912
3407,3407,*/C=C/*,34.672906,20.053114,14.619791
5897,5897,*c1ccc2ccc3c(*)cc(C#CC=C)c4ccc1c2c34,34.487303,18.161678,16.325625
