# Peek at data

In [1]:
import polars as pl

In [3]:
train_df = pl.read_csv('train.csv')
train_df.head()

id,SMILES,Tg,FFV,Tc,Density,Rg
i64,str,f64,f64,f64,f64,f64
87817,"""*CC(*)c1ccccc1C(=O)OCCCCCC""",,0.374645,0.205667,,
106919,"""*Nc1ccc([C@H](CCC)c2ccc(C3(c4c…",,0.3704102,,,
388772,"""*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(…",,0.37886,,,
519416,"""*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c…",,0.3873239,,,
539187,"""*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCO…",,0.35547,,,


In [5]:
extra_tc_df = pl.read_csv('train_supplement/dataset1.csv')
extra_tc_df.head()

SMILES,TC_mean
str,f64
"""*/C(=C(\c1ccccc1)c1ccc(*)cc1)c…",0.338
"""*/C(F)=C(\F)C(F)(C(*)(F)F)C(F)…",0.102
"""*/C=C(/*)C#CCCCCCCCCCCCCCCCCCC…",0.4105
"""*/C=C(/*)CCCCCCCCCCCCCCCCCCCCC…",0.403
"""*/C=C/*""",0.526


In [6]:
extra_tg_df = pl.read_csv('train_supplement/dataset3.csv')
extra_tg_df.head()

SMILES,Tg
str,f64
"""*=Nc1ccc(N=C(C)Nc2ccc(-c3ccc(N…",89.380459
"""*C(=O)OC(=O)COc1ccc(OCC(=O)OC(…",155.970957
"""*C(=O)c1ccc(C(=O)c2ccc(C=C3CCC…",192.209684
"""*C=C(*)c1ccc(OCCCCCC(=O)Oc2c(F…",73.831985
"""*C=CC1C=CC(*)c2ccc(CCCCCC)cc21""",9.704073


In [7]:
extra_ffv_df = pl.read_csv('train_supplement/dataset4.csv')
extra_ffv_df.head()

SMILES,FFV
str,f64
"""*C(=O)NNC(=O)c1ccc([Si](c2cccc…",0.372725
"""*C(=O)NNC(=O)c1ccc([Si](c2cccc…",0.365478
"""*C(=O)Nc1cc(NC(=O)c2ccc3[nH]c(…",0.376377
"""*C(=O)Nc1ccc(-c2cc(-c3ccccc3)c…",0.376939
"""*C(=O)Nc1ccc(-c2ccc(NC(=O)c3cc…",0.355235


# Deduplicate & merge "extra" data

In [12]:
import polars as pl
from rdkit import Chem
from typing import Optional
from functools import reduce

# ---------- helpers ----------
def canonicalise_smiles(smiles: str) -> Optional[str]:
    """Return RDKit canonical SMILES or None if the string is not parseable."""
    molecule = Chem.MolFromSmiles(smiles)
    return None if molecule is None else Chem.MolToSmiles(
        molecule, canonical=True, isomericSmiles=True
    )

def load_and_canonicalise(path: str, smiles_column: str = "SMILES") -> pl.DataFrame:
    """
    Read a CSV with a SMILES column, canonicalise every entry, drop unparsable rows,
    and de-duplicate within the file.
    """
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col(smiles_column)
            .map_elements(canonicalise_smiles, return_dtype=pl.String)
            .alias("SMILES")
        )
        .drop_nulls("SMILES")          # remove rows RDKit failed to parse
        .unique(subset=["SMILES"])     # remove duplicates inside this file
    )

def outer_join_on_smiles(frames: list[pl.DataFrame]) -> pl.DataFrame:
    """Outer-join a list of data-frames on the SMILES column, coalescing duplicates."""
    return reduce(
        lambda left, right: left.join(
            right,
            on="SMILES",
            how="outer",
            coalesce=True          # <- keep only one copy of the join key
        ),
        frames,
    )

# ---------- main training data ----------
train_df = load_and_canonicalise("train.csv")
train_smiles_set: set[str] = set(train_df.get_column("SMILES"))  # for fast membership checks

# ---------- supplemental datasets ----------
extra_tc_df  = load_and_canonicalise("train_supplement/dataset1.csv").rename({"TC_mean": "Tc"})
extra_tg_df  = load_and_canonicalise("train_supplement/dataset3.csv")          # already has 'Tg'
extra_ffv_df = load_and_canonicalise("train_supplement/dataset4.csv")          # already has 'FFV'

# ---------- outer-join the three supplemental sources ----------
# extra_df = outer_join_on_smiles([extra_tc_df, extra_tg_df, extra_ffv_df])
extra_df = outer_join_on_smiles([extra_tg_df, extra_ffv_df])

# ---------- drop rows that overlap the main training set ----------
extra_df = extra_df.filter(~pl.col("SMILES").is_in(train_smiles_set))

# ---------- add the remaining label columns, filled with null ----------
for missing_column in ["Density", "Rg"]:
    extra_df = extra_df.with_columns(
        pl.lit(None).cast(pl.Float64).alias(missing_column)
    )

# ---------- reorder columns to mirror the training data ----------
desired_column_order = ["SMILES", "Tg", "FFV", "Tc", "Density", "Rg"]
extra_df = extra_df.select(desired_column_order)

# ---------- done ----------
print(extra_df.head())

shape: (5, 6)
┌─────────────────────────────────┬──────┬──────────┬───────┬─────────┬──────┐
│ SMILES                          ┆ Tg   ┆ FFV      ┆ Tc    ┆ Density ┆ Rg   │
│ ---                             ┆ ---  ┆ ---      ┆ ---   ┆ ---     ┆ ---  │
│ str                             ┆ f64  ┆ f64      ┆ f64   ┆ f64     ┆ f64  │
╞═════════════════════════════════╪══════╪══════════╪═══════╪═════════╪══════╡
│ *Oc1ccc(/N=C/C=N/c2ccc(OC(=O)N… ┆ null ┆ null     ┆ 0.349 ┆ null    ┆ null │
│ *CC(F)(F)C1(F)CC(C(O)(C(F)(F)F… ┆ null ┆ 0.320846 ┆ 0.109 ┆ null    ┆ null │
│ *CCCCOC(=O)CCCCCCCC(=O)O*       ┆ null ┆ null     ┆ 0.241 ┆ null    ┆ null │
│ *CCCCCCCCCCc1ccc(-c2c(-c3ccccc… ┆ null ┆ 0.398266 ┆ 0.21  ┆ null    ┆ null │
│ *c1ccc(-c2ccc(-c3sc(*)c(CCCCCC… ┆ null ┆ null     ┆ 0.299 ┆ null    ┆ null │
└─────────────────────────────────┴──────┴──────────┴───────┴─────────┴──────┘


(Deprecated in version 0.20.29)
  lambda left, right: left.join(


In [14]:
extra_df.describe()

statistic,SMILES,Tg,FFV,Tc,Density,Rg
str,str,f64,f64,f64,f64,f64
"""count""","""999""",46.0,862.0,129.0,0.0,0.0
"""null_count""","""0""",953.0,137.0,870.0,999.0,999.0
"""mean""",,135.695015,0.365336,0.258497,,
"""std""",,103.846834,0.024155,0.152331,,
"""min""","""*/C(=C(\c1ccccc1)c1ccc(*)cc1)c…",-7.212269,0.28114,0.091,,
"""25%""",,54.907283,0.350098,0.198,,
"""50%""",,118.836135,0.361871,0.235,,
"""75%""",,198.902674,0.375607,0.294,,
"""max""","""*c1sc(-c2cc(CCCCCCCCCC)c(*)s2)…",421.982243,0.525164,1.59,,


In [15]:
extra_df.describe()

statistic,SMILES,Tg,FFV,Tc,Density,Rg
str,str,f64,f64,f64,f64,f64
"""count""","""999""",46.0,862.0,129.0,0.0,0.0
"""null_count""","""0""",953.0,137.0,870.0,999.0,999.0
"""mean""",,135.695015,0.365336,0.258497,,
"""std""",,103.846834,0.024155,0.152331,,
"""min""","""*/C(=C(\c1ccccc1)c1ccc(*)cc1)c…",-7.212269,0.28114,0.091,,
"""25%""",,54.907283,0.350098,0.198,,
"""50%""",,118.836135,0.361871,0.235,,
"""75%""",,198.902674,0.375607,0.294,,
"""max""","""*c1sc(-c2cc(CCCCCCCCCC)c(*)s2)…",421.982243,0.525164,1.59,,


In [13]:
extra_df.write_csv('extra_train.csv')

# Extract extra data from previous merge

In [40]:
import polars as pl
from rdkit import Chem
from typing import Optional


def canonicalise_smiles(smiles: str) -> Optional[str]:
    """Return RDKit canonical SMILES (isomeric) or None on parse failure."""
    molecule = Chem.MolFromSmiles(smiles)
    return None if molecule is None else Chem.MolToSmiles(
        molecule, canonical=True, isomericSmiles=True
    )


# def load_with_canonical(path: str) -> pl.DataFrame:
#     """
#     Read a CSV that has a 'SMILES' column, add a canonicalised version,
#     and drop any rows RDKit could not parse.
#     """
#     return (
#         pl.read_csv(path)
#         .with_columns(
#             pl.col("SMILES")
#             .map_elements(canonicalise_smiles, return_dtype=pl.String)
#             .alias("SMILES_CAN")
#         )
#         .drop_nulls("SMILES_CAN")
#     )


# ---------- load the two datasets ----------
train_df = load_and_canonicalise("train.csv")
tc_only_df = load_and_canonicalise("../from_dmitry/host_tc-natsume_full-dmitry.csv")

# ---------- filter out rows whose canonical SMILES already appear in train ----------
filtered_tc_only_df = tc_only_df.filter(
    ~pl.col("SMILES").is_in(train_df.get_column("SMILES"))
)

# ---------- drop the helper column and save ----------
filtered_tc_only_df = filtered_tc_only_df.select(['SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg'])
filtered_tc_only_df.write_csv("train_tc-natsume_full-dmitry_extra.csv")

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  filtered_tc_only_df = tc_only_df.filter(


In [41]:
filtered_tc_only_df.head()

SMILES,Tg,FFV,Tc,Density,Rg
str,f64,f64,f64,f64,f64
"""Cc1ccc(NC=S)cc1""",,,,1.342,
"""FC(F)(F)C(Cl)Cl""",-214.15,,,,
"""*CCC(C)(C)CC(C)CNC(=O)CCCCC(=O…",54.82,,,,
"""*c1ccc(-c2ccc(-c3ccc(C(*)c4c(F…",,,0.291,,
"""c1ccc(C2=C(c3ccccc3)CCCC2)cc1""",-43.15,,,,


In [42]:
filtered_tc_only_df.describe()

statistic,SMILES,Tg,FFV,Tc,Density,Rg
str,str,f64,f64,f64,f64,f64
"""count""","""1288""",644.0,0.0,129.0,534.0,0.0
"""null_count""","""0""",644.0,1288.0,1159.0,754.0,1288.0
"""mean""",,18.681626,,0.258497,1.181622,
"""std""",,121.725003,,0.152331,0.223516,
"""min""","""*.*C/C=C/CC.*CC(*)C#N.*CCC(C*)…",-255.15,,0.091,0.752,
"""25%""",,-74.65,,0.198,1.002,
"""50%""",,29.0,,0.235,1.152,
"""75%""",,78.110617,,0.294,1.342,
"""max""","""c1csc(-c2cccs2)c1""",442.63,,1.59,1.982,


# Check for host supplement vs. leak overlap

In [43]:
host_smiles = set(extra_df['SMILES'].to_list())
leak_smiles = set(filtered_tc_only_df['SMILES'].to_list())

print('Host supplemental SMILES count:', len(host_smiles))
print('Leak SMILES count:', len(leak_smiles))
print('Shared SMILES count:', len(host_smiles.intersection(leak_smiles)))

Host supplemental SMILES count: 999
Leak SMILES count: 1288
Shared SMILES count: 180


In [44]:
extra_df.join(filtered_tc_only_df, on='SMILES', how='inner')

SMILES,Tg,FFV,Tc,Density,Rg,Tg_right,FFV_right,Tc_right,Density_right,Rg_right
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""*c1ccc(-c2ccc(-c3ccc(C(*)c4c(F…",,,0.291,,,,,0.291,,
"""*CCCCCc1ccc(CCCCCOC(=O)c2ccc(C…",,,0.276,,,,,0.276,,
"""*c1ccc(Oc2ccc(-c3csc(/N=C/c4cc…",,,0.253,,,,,0.253,,
"""*CCCCc1nc2cc(NC(=NO)C(=NO)Nc3c…",170.113033,,,,,170.113033,,,,
"""*CCCCCCCCCCCCCCOC(=O)c1ccc(C(=…",,,0.258,,,,,0.258,,
…,…,…,…,…,…,…,…,…,…,…
"""*CC(O)COc1c(Cl)cc(C(C)(C)c2cc(…",,0.36747,0.1485,,,,,0.1485,,
"""*CCCCOC(=O)c1cccc(-c2cccc(C(=O…",,,0.203,,,,,0.203,,
"""*CCCCCCCCCCCCCCCCOC(=O)CCCCCCC…",,0.375447,0.308,,,,,0.308,,
"""*CCc1ccc(*)c(C(=O)OC)c1""",,,0.245,,,,,0.245,,


In [45]:
display(filtered_tc_only_df.head())
extra_df.head()

SMILES,Tg,FFV,Tc,Density,Rg
str,f64,f64,f64,f64,f64
"""Cc1ccc(NC=S)cc1""",,,,1.342,
"""FC(F)(F)C(Cl)Cl""",-214.15,,,,
"""*CCC(C)(C)CC(C)CNC(=O)CCCCC(=O…",54.82,,,,
"""*c1ccc(-c2ccc(-c3ccc(C(*)c4c(F…",,,0.291,,
"""c1ccc(C2=C(c3ccccc3)CCCC2)cc1""",-43.15,,,,


SMILES,Tg,FFV,Tc,Density,Rg
str,f64,f64,f64,f64,f64
"""*Oc1ccc(/N=C/C=N/c2ccc(OC(=O)N…",,,0.349,,
"""*CC(F)(F)C1(F)CC(C(O)(C(F)(F)F…",,0.320846,0.109,,
"""*CCCCOC(=O)CCCCCCCC(=O)O*""",,,0.241,,
"""*CCCCCCCCCCc1ccc(-c2c(-c3ccccc…",,0.398266,0.21,,
"""*c1ccc(-c2ccc(-c3sc(*)c(CCCCCC…",,,0.299,,


In [46]:
combined_dataframe = pl.concat(
    [extra_df, filtered_tc_only_df],
    how="vertical",
    rechunk=True
)

# ---------- step 2: identify all numeric/label columns ----------
numeric_column_names = [
    column_name for column_name in combined_dataframe.columns if column_name != "SMILES"
]

# ---------- step 3: group by SMILES and aggregate ----------
# - `mean()` skips nulls, so…
#   * if *both* data-frames supply a value → the mean is stored
#   * if only one data-frame supplies a value → that value is kept
#   * if neither supplies a value → result is null
deduplicated_dataframe = (
    combined_dataframe
    .group_by("SMILES", maintain_order=False)
    .agg(
        [pl.col(column_name).mean().alias(column_name) for column_name in numeric_column_names]
    )
)


display(deduplicated_dataframe.head())
deduplicated_dataframe.describe()

SMILES,Tg,FFV,Tc,Density,Rg
str,f64,f64,f64,f64,f64
"""*Oc1cccc(NC(=O)c2ccc(P(=O)(c3c…",,0.36862,,,
"""*Oc1cc(OC(=O)c2ccc(OCC(C)CC)cc…",,0.357302,,,
"""CCCCc1ccccc1""",-144.15,,,,
"""*C(=O)Nc1ccc(Oc2cccc(NC(=O)c3c…",,0.362688,,,
"""*OS(=O)(=O)c1ccc(*)cc1""",140.64,,,,


statistic,SMILES,Tg,FFV,Tc,Density,Rg
str,str,f64,f64,f64,f64,f64
"""count""","""2107""",644.0,862.0,129.0,534.0,0.0
"""null_count""","""0""",1463.0,1245.0,1978.0,1573.0,2107.0
"""mean""",,18.681626,0.365336,0.258497,1.181622,
"""std""",,121.725003,0.024155,0.152331,0.223516,
"""min""","""*.*C/C=C/CC.*CC(*)C#N.*CCC(C*)…",-255.15,0.28114,0.091,0.752,
"""25%""",,-74.65,0.350098,0.198,1.002,
"""50%""",,29.0,0.361871,0.235,1.152,
"""75%""",,78.110617,0.375607,0.294,1.342,
"""max""","""c1csc(-c2cccs2)c1""",442.63,0.525164,1.59,1.982,


In [48]:
deduplicated_dataframe.shape

(2107, 6)

In [47]:
deduplicated_dataframe.write_csv('train_host_plus_leaks_extra.csv')