# Load host data

In [1]:
import pandas as pd

comp_train_df = pd.read_csv('../from_host/train.csv')
comp_train_df.head(3)

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,


In [2]:
# Define all target properties
targets = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Count of non-NaN values for each target column
comp_train_df[targets].count()

Tg          511
FFV        7030
Tc          737
Density     613
Rg          614
dtype: int64

In [3]:
len(comp_train_df['SMILES'].unique()), len(comp_train_df)

(7973, 7973)

# Load extra data

In [4]:
extra_tg_df = pd.read_csv('Tg_SMILES_class_pid_polyinfo_median.csv')
display(extra_tg_df.head(3))

extra_tc_df = pd.read_csv('Tc_SMILES.csv')
display(extra_tc_df.head(3))

Unnamed: 0,SMILES,PID,Polymer Class,Tg
0,*C*,P010001,Polyolefins,-54.0
1,*CC(*)C,P010002,Polyolefins,-3.0
2,*CC(*)CC,P010003,Polyolefins,-24.1


Unnamed: 0,TC_mean,SMILES
0,0.2445,*CC(*)C
1,0.225333,*CC(*)CC
2,0.246333,*CC(*)CCC


# Merge

In [5]:
# Prepare extra_tg_df
extra_tg_clean = extra_tg_df[['SMILES', 'PID', 'Tg']].rename(columns={'PID': 'id'})
extra_tg_clean[['FFV', 'Tc', 'Density', 'Rg']] = float('nan')

# Prepare extra_tc_df  
extra_tc_clean = extra_tc_df[['SMILES', 'TC_mean']].rename(columns={'TC_mean': 'Tc'})
extra_tc_clean['id'] = range(len(comp_train_df) + len(extra_tg_df), len(comp_train_df) + len(extra_tg_df) + len(extra_tc_df))
extra_tc_clean[['Tg', 'FFV', 'Density', 'Rg']] = float('nan')

# Reorder columns to match train_df
extra_tg_clean = extra_tg_clean[['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']]
extra_tc_clean = extra_tc_clean[['id', 'SMILES', 'Tg', 'FFV', 'Tc', 'Density', 'Rg']]

# Combine all datasets into train_df
# train_df = pd.concat([comp_train_df, extra_tg_clean, extra_tc_clean], ignore_index=True)
train_df = pd.concat([comp_train_df, extra_tc_clean], ignore_index=True)

print(train_df[targets].count())

Tg          511
FFV        7030
Tc         1611
Density     613
Rg          614
dtype: int64


In [6]:
display(train_df.head())
display(train_df.tail())

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
8842,16050,FC(F)(F)C(C1=CC2=C(OC(=N2)C2=CC=C(OC3=CC=C(OC4...,,,0.155,,
8843,16051,FC(F)(F)C(C1=CC2=C(OC(=N2)C2=CC=C(OC3=C4C5C6=C...,,,0.109,,
8844,16052,FC(F)(F)C(C1=CC2=C(OC(=N2)C2=CC=C(OC3=C4C5C6=C...,,,0.176,,
8845,16053,*C1=NC2=CC(=CC=C2N1)C1=CC2=C(NC(O2)C2=CC=C(OC3...,,,0.252,,
8846,16054,*C1=NC2=CC(=CC=C2N1)C1=CC2=C(NC(O2)C2=CC=C(OC3...,,,0.18,,


In [7]:
len(train_df['SMILES'].unique()), len(train_df)

(8103, 8847)

In [8]:
import numpy as np

def first_non_missing(series: pd.Series) -> float | str | np.number | None:
    """Return the first non-NaN value in *series*, else NaN."""
    for value in series:
        if pd.notna(value):
            return float(value)
    return np.nan

# --- build the aggregation dictionary automatically -----------------------
columns_to_aggregate: list[str] = [                   # every column except the key
    column_name
    for column_name in train_df.columns
    if column_name != "SMILES"
]

aggregation_rules: dict[str, callable] = {
    "id": "first",                                    # keep the first id (or change as needed)
    **{column_name: first_non_missing for column_name in columns_to_aggregate
       if column_name != "id"}
}

# --- perform the merge ----------------------------------------------------
deduplicated_train_df: pd.DataFrame = (
    train_df
    .groupby(by="SMILES", dropna=False)               # one group per distinct SMILES
    .agg(aggregation_rules)
    .reset_index()                                    # restore SMILES as a normal column
)

print(len(deduplicated_train_df))          # → 10027  (matches len(train_df['SMILES'].unique()))
deduplicated_train_df.sample(10)

8103


Unnamed: 0,SMILES,id,Tg,FFV,Tc,Density,Rg
4896,*Nc1ccc([C@H](CCC)c2ccc(C(CCCC)c3ccc([C@@H](CC...,84183521,,0.369242,,,
683,*C(Cl)=C(*)c1ccccc1,1003416473,,0.41584,,,
5503,*O[Si](*)(CCCOCCOCCOC)CCCOCCOCCOC,895302963,,0.371665,,,
794,*C1CCC(CC2CCC(N3C(=O)C4C5C=CC(C6C(=O)N(*)C(=O)...,1012615960,,0.378573,,,
6514,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C=Nc4ccc(Oc5ccc(...,114257024,,0.373381,,,
6798,*c1cc(CCCCCCCCCCCCCCCCC)c(*)s1,1066470436,,,0.396,,
6915,*c1ccc(-c2ccc(C(*)(C)C(F)(F)F)cc2)cc1,1033278937,,,0.194,1.086916,21.060561
8003,*c1cccc(NC(=O)CCCCCCC(=O)Nc2cccc(S(*)(=O)=O)c2)c1,1476946894,,0.344559,,,
5969,*Oc1ccc(C2(c3ccc(OC(=O)CCC(*)=O)cc3)c3cc(OC)cc...,1815199784,,0.362618,,,
3767,*Cc1ccc(CN(C)C(=O)CCCCCCCCCCCCCCC(=O)N(*)C)cc1,1504708152,-14.323625,,,,


In [9]:
deduplicated_train_df.count()

SMILES     8103
id         8103
Tg          511
FFV        7030
Tc          867
Density     613
Rg          614
dtype: int64

In [10]:
import polars as pl

# pl.from_pandas(deduplicated_train_df).write_csv('train_merged.csv')
deduplicated_train_df.to_csv('train_Tc-only_merged.csv', index=False)

# Check types

In [11]:
import polars as pl

reloaded_df = pl.read_csv('train_merged.csv', infer_schema_length=10000)
display(reloaded_df.head())
display(reloaded_df.describe())

SMILES,id,Tg,FFV,Tc,Density,Rg
str,str,f64,f64,f64,f64,f64
"""*/C(=C(/*)c1ccc(C(C)(C)C)cc1)c…","""P332014""",200.0,,,,
"""*/C(=C(/*)c1ccc(CCCC)cc1)c1ccc…","""P332013""",200.0,,,,
"""*/C(=C(/*)c1ccc(Oc2ccccc2)cc1)…","""P332018""",200.0,,,,
"""*/C(=C(/*)c1ccc([Si](C(C)C)(C(…","""P332113""",250.0,,,,
"""*/C(=C(/*)c1ccc([Si](C)(C)C)cc…","""P332015""",200.0,,,,


statistic,SMILES,id,Tg,FFV,Tc,Density,Rg
str,str,str,f64,f64,f64,f64,f64
"""count""","""10027""","""10027""",7679.0,7030.0,867.0,613.0,614.0
"""null_count""","""0""","""0""",2348.0,2997.0,9160.0,9414.0,9413.0
"""mean""",,,138.980965,0.367212,0.256539,0.985484,16.419787
"""std""",,,112.687531,0.029609,0.101271,0.146189,4.60864
"""min""","""*/C(=C(/*)c1ccc(C(C)(C)C)cc1)c…","""100028701""",-148.029738,0.2269924,0.0465,0.748691,9.7283551
"""25%""",,,52.0,0.349546,0.186333,0.890243,12.531907
"""50%""",,,130.0,0.36427,0.236,0.948193,15.064002
"""75%""",,,229.0,0.380796,0.325,1.062096,20.429383
"""max""","""FC(F)(F)C(C1=CC2=C(OC(=N2)C2=C…","""P522048""",495.0,0.777097,1.59,1.840999,34.672906


In [12]:
reloaded_df['FFV'].unique()

FFV
f64
""
0.2269924
0.237637
0.245613
0.245654
…
0.6509068
0.687057
0.769315
0.774066


In [13]:
reloaded_df.tail()

SMILES,id,Tg,FFV,Tc,Density,Rg
str,str,f64,f64,f64,f64,f64
"""CCNC(=O)OCCCCC(*)=C=C=C(*)CCCC…","""15861""",,,0.26,,
"""Cc1ccc(cc1)S(=O)(=O)OCCCCC(*)=…","""15860""",,,0.21,,
"""FC(F)(F)C(C1=CC2=C(OC(=N2)C2=C…","""16051""",,,0.109,,
"""FC(F)(F)C(C1=CC2=C(OC(=N2)C2=C…","""16052""",,,0.176,,
"""FC(F)(F)C(C1=CC2=C(OC(=N2)C2=C…","""16050""",,,0.155,,
