In [1]:
import polars as pl

# Load the parquet file(s) matching the glob pattern into a single DataFrame,
# then preview the first five rows.
smiles_df = pl.read_parquet(source='realistic/*.parquet')
smiles_df.head(n=5)

smiles,Egc,Egb,Eib,CED,Ei,Eea,nc,ne,epse_6.0,epsc,epse_3.0,epse_1.78,epse_15.0,epse_4.0,epse_5.0,epse_2.0,epse_9.0,epse_7.0,TSb,TSy,epsb,YM,permCH4,permCO2,permH2,permO2,permN2,permHe,Eat,rho,LOI,Xc,Xe,Cp,Td,Tg,Tm,predicted_probability
str,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64
"""[*]C1=CC=C(c2sc(NC(=O)c3ccc(-c…",2.289044,1.959191,2.962373,122.920807,5.214754,2.572867,2.184524,1.748937,5.008382,5.741158,5.555241,5.886546,3.354288,5.365005,5.191479,5.805032,4.382617,4.820214,64.46814,50.633636,9.441549,1583.442139,2.403615,22.144337,34.048382,5.849916,1.944478,30.31716,-6.069649,1.281736,25.752581,24.282017,27.356392,1.202939,588.338684,406.228516,487.551941,0.00669
"""[*]Nc1ccc(C(=O)NNC(=O)CCCCCNC(…",4.457177,4.093316,3.778311,103.913528,6.053351,1.554214,1.823484,1.566183,3.434718,3.940835,3.667161,3.783423,2.577688,3.600048,3.528842,3.756659,3.094108,3.331819,80.766747,69.99189,29.903227,1695.704712,0.508363,2.040609,5.583228,0.762137,0.364979,5.948467,-6.238371,1.119977,20.957157,19.040171,20.143164,1.185691,680.781311,445.641022,567.730408,0.036418
"""[*]N=P([*])(Oc1ccc(OC)cc1)Oc1c…",4.002379,3.684302,4.006617,103.345268,5.51034,1.498786,1.839561,1.581623,3.275388,4.063346,3.572173,3.773438,2.502095,3.463676,3.373869,3.731973,2.937604,3.167282,42.379631,46.022163,7.68101,1776.890137,3.701518,21.418957,25.401615,5.782984,2.249864,21.666817,-5.825336,1.240532,30.226385,48.125187,46.957031,1.444203,603.768005,263.517029,467.632233,0.116284
"""[*]C(=O)Oc1ccc(S(=O)(=O)c2ccc(…",3.804986,3.403906,3.643408,126.647659,5.926083,2.314558,2.055537,1.688373,3.765182,4.938564,4.138022,4.46568,2.743561,3.994742,3.884886,4.39158,3.401808,3.658372,71.267509,68.444504,7.920873,2435.362305,0.192803,4.101837,11.470147,1.13232,0.203957,11.223465,-6.07989,1.297282,31.084461,43.101723,36.364712,1.190731,753.817505,532.404602,620.747925,0.010928
"""[*]OCC(O)CSCCCCCOC(=O)O[*]""",6.015775,5.958481,4.267887,115.884056,7.164084,1.192576,1.653251,1.466067,2.853437,3.398266,3.191202,3.260525,2.2721,3.088587,2.972714,3.254953,2.556636,2.738564,21.46876,16.394579,66.769371,469.900146,0.052352,0.551606,1.280889,0.132076,-0.032265,1.589315,-5.744719,1.135415,20.910528,32.21822,31.965536,1.426685,586.757935,249.554382,366.177521,0.035381


In [2]:
# (row count, column count) of the loaded frame.
(smiles_df.height, smiles_df.width)

(157923, 39)

In [5]:
from rdkit import Chem
from tqdm import tqdm

# Draw a reproducible 50k-row sample of the raw SMILES strings.
smiles_subset = smiles_df.sample(50_000, seed=42)['smiles'].to_list()

# Round-trip every string through RDKit so all rows share the same SMILES
# formatting. MolFromSmiles returns None for input it cannot parse, and passing
# that None on to MolToSmiles raises, so null/unparseable entries are skipped
# and counted instead of aborting the whole export.
reformatted_smiles = []
n_unparsed = 0
for smi in tqdm(smiles_subset):
    mol = Chem.MolFromSmiles(smi) if smi is not None else None
    if mol is None:
        n_unparsed += 1
        continue
    reformatted_smiles.append(Chem.MolToSmiles(mol))

# Surface dropped rows so a shorter-than-expected file is never a silent surprise.
if n_unparsed:
    print(f'Skipped {n_unparsed} SMILES that RDKit could not parse')

# Single-column CSV: header line followed by one SMILES per line.
with open('realistic_50k.csv', 'w') as f:
    f.write('SMILES\n' + '\n'.join(reformatted_smiles))

100%|██████████| 50000/50000 [00:22<00:00, 2207.73it/s]


In [7]:
def standardize_smiles(smi: str) -> str | None:
    if smi is None:
        return None
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)

# Expression that replaces each SMILES value with its RDKit round-tripped form.
_standardized_smiles = pl.col("SMILES").map_elements(
    standardize_smiles, return_dtype=pl.String
)

# Seeded 50k-row sample, without the probability column, with the SMILES
# column renamed and its values standardized.
reformatted_subset_df = (
    smiles_df
    .sample(n=50_000, seed=42)
    .drop("predicted_probability")
    .rename({"smiles": "SMILES"})
    .with_columns(_standardized_smiles)
)

# Persist the full property table (header row included) as CSV.
reformatted_subset_df.write_csv('realistic_50k_full.csv', include_header=True)