In [2]:
import os
import pandas as pd

In [3]:
# Load the standardized Tox21 results workbook into a DataFrame.
data_dir, workbook_name = "data", "StandardizedTox21Results.xlsx"
file_path = os.path.join(data_dir, workbook_name)
df = pd.read_excel(file_path)

In [4]:
# 3. Give every row a sequential, zero-padded ligand code: lig0001, lig0002, ...
# The index is reset first so the codes follow the current row order (1-based).
df = df.reset_index(drop=True)
df["LigandCode"] = [f"lig{position:04d}" for position in range(1, len(df) + 1)]

In [5]:
from rdkit import Chem
from rdkit.Chem import Descriptors


def _mol_weight(smiles):
    """Return ``Descriptors.MolWt`` for the molecule parsed from one SMILES string.

    Args:
        smiles: A single value taken from the ``std_SMILES`` column.

    Returns:
        The molecular weight computed by RDKit for the parsed molecule.

    Raises:
        ValueError: If ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Chem.MolFromSmiles signals a parse failure by returning None instead of
    # raising; passing that None on to Descriptors.MolWt produces an opaque
    # argument error that does not say which row was bad. Fail here instead,
    # with the offending value in the message.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(f"Cannot parse std_SMILES value: {smiles!r}")
    return Descriptors.MolWt(mol)


# Molecular weight per ligand, followed by its distribution summary
# (the last expression is the cell's displayed output).
df["MolWt"] = df["std_SMILES"].apply(_mol_weight)
df["MolWt"].quantile([0.1, 0.25, 0.5, 0.75, 0.9])

0.10    122.16900
0.25    162.22900
0.50    232.30500
0.75    330.46875
0.90    424.46800
Name: MolWt, dtype: float64

In [6]:
# Molecular-weight cut-offs: the 10th and 90th percentiles of the MolWt column.
mw_cutoffs = df["MolWt"].quantile([0.1, 0.9])
lower_bound, upper_bound = mw_cutoffs.iloc[0], mw_cutoffs.iloc[1]

In [8]:
out_dir = "AutodockGPU"
os.makedirs(out_dir, exist_ok=True)

# 5. Write ligands.smi: each line = "<SMILES> <LigandCode>"
smi_path = os.path.join(out_dir, "ligands.smi")

# A ligand is skipped only when its weight is strictly below lower_bound or
# strictly above upper_bound; everything else is exported.
outside_window = (df["MolWt"] < lower_bound) | (df["MolWt"] > upper_bound)
selected = df.loc[~outside_window, ["std_SMILES", "LigandCode"]]

with open(smi_path, "w") as fh:
    fh.writelines(
        f"{smiles} {ligand_code}\n"
        for smiles, ligand_code in zip(selected["std_SMILES"], selected["LigandCode"])
    )
n_entries = len(selected)

print(f"Wrote {n_entries} entries to {smi_path}")

# 6. Write out the ligand info Excel with the new code column.
# Note: this saves the full DataFrame, not just the weight-filtered subset.
info_path = os.path.join(out_dir, "LigandInfo.xlsx")
df.to_excel(info_path, index=False)

print(f"Wrote entries to {info_path}")

Wrote 6028 entries to AutodockGPU/ligands.smi
Wrote entries to AutodockGPU/LigandInfo.xlsx


- Of the 6,028 ligands in `ligands.smi`, 6,022 were written out as `.pdbqt` files.