# Map RefChemDB to standardized InChIKeys

In [1]:
# Use the "smiles" environment (Python 3.11)

import polars as pl
import pandas as pd
from smiles.standardize_smiles import StandardizeMolecule

In [2]:
# Load the RefChemDB DTXSID table and split it into CSV files of at most
# `rows_per_file` rows each, numbered from 1 in row order.
dat = pl.read_csv("./data/RefChemDB_DTXSID.csv")

rows_per_file = 10000
nrow = dat.height

chunk_starts = range(0, nrow, rows_per_file)
for file_number, offset in enumerate(chunk_starts, start=1):
    # The final slice is simply shorter when nrow is not a multiple of
    # rows_per_file.
    chunk = dat.slice(offset, rows_per_file)
    chunk.write_csv(f"./output/output_chunk_{file_number}.csv")

In [12]:
# Read the four SMILES chunk files (smiles_chunk_1 .. smiles_chunk_4) and
# stack them into a single table.
smiles_parts = [
    pl.read_csv(f"./output/smiles_chunk_{chunk_id}.csv")
    for chunk_id in range(1, 5)
]
stacked = pl.concat(smiles_parts, how="vertical")

# Normalise the column name, then drop rows whose SMILES is the "N/A"
# placeholder.
renamed = stacked.rename({"SMILES": "smiles"})
smiles = renamed.filter(pl.col("smiles") != "N/A")

# Persist the compiled table, then reload it with pandas: the code that
# follows slices `smiles` with `.iloc`, which is a pandas API.
smiles.write_csv("./output/compiled_smiles.csv")
smiles = pd.read_csv("./output/compiled_smiles.csv")

In [None]:
# Standardize the compiled SMILES in fixed-size batches. Each batch is written
# to its own numbered CSV; a batch that raises is reported and its input rows
# are saved to a matching "failed" CSV so the loop can move on to the next one.
batch_size = 250

batch_starts = range(0, len(smiles), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    chunk = smiles.iloc[start:start + batch_size]

    try:
        standardizer = StandardizeMolecule(input=chunk, num_cpu=70, augment=True)
        batch = standardizer.run()
        batch.to_csv(f"./output/standardized_smiles_{batch_number}.csv")
    except Exception as e:
        # Best-effort: keep the failing input so it can be inspected or
        # re-run later.
        print(f"An error occurred: {e}")
        chunk.to_csv(f"./output/failed_smiles_{batch_number}.csv")