### Calculate Morgan and MACCS keys fingerprints for the PRIME virtual library and save to CSV

In [None]:
import random

import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors, PandasTools

In [None]:
# read the Data S4 table into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../data/Data S4.csv")
df.head(n=5)

In [None]:
# Morgan fingerprints (radius 3, 1024 bits) for every product SMILES,
# stored as bit strings so they can be written to CSV
fps = [
    rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=1024
    ).ToBitString()
    for mol in map(Chem.MolFromSmiles, df["product"])
]
df["MorganFP"] = fps

In [None]:
# MACCS keys fingerprints for every product SMILES, stored as bit strings
product_mols = (Chem.MolFromSmiles(smi) for smi in df["product"])  # parsed lazily
maccs_keys = (rdMolDescriptors.GetMACCSKeysFingerprint(mol) for mol in product_mols)
fps = [keys.ToBitString() for keys in maccs_keys]
df["MACCSKeysFP"] = fps

In [None]:
# preview the first five rows, now including the fingerprint columns
df.head(n=5)

In [None]:
# write the table without the index column; the compression is stated
# explicitly and matches the .bz2 suffix of the target file
df.to_csv(
    path_or_buf="../data/dataS4_with_fps.csv.bz2",
    index=False,
    compression="bz2",
)

### Calculate MorganFP for the Enamine hit locator library and save to CSV

In [None]:
# load the Enamine hit locator library SD file into a DataFrame
# and preview its first five rows
df = PandasTools.LoadSDF(
    filename="../data/Enamine_Hit_Locator_Library_HLL-460_460160cmpds_20220221.sdf"
)
df.head(n=5)

In [None]:
# Morgan fingerprints (radius 3, 1024 bits) for every molecule in the
# "ROMol" column, stored as bit strings
bit_vectors = (
    rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024)
    for mol in df["ROMol"]
)
fps = [bit_vector.ToBitString() for bit_vector in bit_vectors]
df["MorganFP"] = fps

In [None]:
# export only the catalog identifier and the fingerprint
# NOTE(review): unlike the other exports in this notebook, this one keeps the
# DataFrame index as the first CSV column and is not compressed -- confirm
# this is what downstream readers expect
df.loc[:, ["Catalog ID", "MorganFP"]].to_csv(
    path_or_buf="../data/Enamine_Hit_Locator_Library_HLL-460_460160cmpds_20220221_with_fps.csv"
)

### Calculate MorganFP for FDA approved drugs and save to CSV

In [None]:
# load the Enamine FDA-approved drugs SD file into a DataFrame
# and preview its first five rows
df = PandasTools.LoadSDF(
    filename="../data/Enamine_FDA_approved_Drugs_1123cmpds_20231109.sdf"
)
df.head(n=5)

In [None]:
# Morgan fingerprints for every molecule in the "ROMol" column,
# stored as bit strings
morgan_kwargs = {"radius": 3, "nBits": 1024}  # fingerprint settings
fps = [
    rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **morgan_kwargs).ToBitString()
    for mol in df["ROMol"]
]
df["MorganFP"] = fps

In [None]:
# write every column of the table without the index; the compression is
# stated explicitly and matches the .bz2 suffix of the target file
# NOTE(review): this includes the "ROMol" molecule-object column -- confirm it
# serialises usefully to CSV or should be dropped
df.to_csv(
    path_or_buf="../data/Enamine_FDA_with_fps.csv.bz2",
    index=False,
    compression="bz2",
)

### Calculate Morgan and MACCS keys fingerprints for ChEMBL and save to CSV

In [None]:
# ChEMBL is huge, so only 50k random molecules are loaded later on;
# first pass: count the records that parse into a molecule
supplier = Chem.ForwardSDMolSupplier("../data/chembl_34.sdf")
chembl_count = sum(mol is not None for mol in supplier)


In [None]:
# Second pass over the SD file: collect a uniform random subset of molecules.
#
# `chembl_count` is the number of records that parsed successfully, so the
# sampled positions must index that same sequence of valid molecules.
# Enumerating the raw supplier would also number the unparsable records, which
# shifts the indices: samples landing on invalid records are lost and the tail
# of the file can never be drawn.
sample_size = min(50000, chembl_count)  # random.sample raises if asked for more than exist
random_indices = set(random.sample(range(chembl_count), sample_size))
supplier = Chem.ForwardSDMolSupplier("../data/chembl_34.sdf")
valid_molecules = (mol for mol in supplier if mol is not None)  # skip invalid molecules
selected_molecules = []
for idx, mol in enumerate(valid_molecules):
    if idx in random_indices:
        selected_molecules.append(mol)
        # stop reading the file as soon as every sampled molecule is collected
        if len(selected_molecules) == sample_size:
            break

In [None]:
# Morgan fingerprints (radius 3, 1024 bits) of the sampled molecules,
# stored as bit strings
fps = []
for mol in selected_molecules:
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024)
    fps.append(bit_vector.ToBitString())

In [None]:
# MACCS keys fingerprints of the sampled molecules, stored as bit strings
maccs_fps = [
    keys.ToBitString()
    for keys in map(rdMolDescriptors.GetMACCSKeysFingerprint, selected_molecules)
]

In [None]:
# combine both fingerprint lists into one table and write it without the index
chembl_fp_table = pd.DataFrame({"MorganFP": fps, "MACCSKeysFP": maccs_fps})
chembl_fp_table.to_csv("../data/chembl_34_50k-random_with_FP.csv", index=False)