In [5]:
import pandas as pd
from rdkit.Chem import PandasTools
import os

## 1. Download hits from Fragalysis
## 2. Prepare the `templates` folder by moving the `*apo-desolv.pdb` files there
## 3. Prepare master CSV from SDF

In [4]:
# Build the master CSV: one row per SDF record, holding its SMILES, compound-set
# ID, template PDB and one `hitN` column per referenced hit.
orig = PandasTools.LoadSDF('scaffolds_354.sdf')
rows = []
for idx, row in orig.iterrows():
    # Skip the record whose index label is 0.
    # NOTE(review): presumably a header/metadata entry rather than a real
    # compound -- confirm against the SDF.
    if idx == 0:
        continue
    # `ref_mols` is a comma-separated string of hit names. Trim whitespace and
    # drop empty entries so "a, b" or a trailing comma cannot produce a padded
    # or blank hit name.
    hits: list[str] = [name.strip() for name in row['ref_mols'].split(',') if name.strip()]
    if len(hits) > 2:
        # Warning only: every hit is still written below (hit3, hit4, ...).
        print(f'{idx} has more than 2 hits')
    new_row = {
        'smiles': row['original SMILES'],
        'compound_set': row['ID'],
        'template': row['ref_pdb'],
    }
    # Use a separate counter so the row index `idx` is never overwritten.
    for hit_number, hit in enumerate(hits, start=1):
        new_row[f'hit{hit_number}'] = hit
    rows.append(new_row)
# Build the frame once from the collected records; rows with fewer hits get NaN
# in the hit columns they lack.
df = pd.DataFrame(rows)
df.to_csv('syndirella_master.csv', index=False)
df

Unnamed: 0,smiles,compound_set,template,hit1,hit2
0,Cn1nccc1C(=O)NCc1csc(-c2ncn[nH]2)n1,CHIKVMac-DLS-JA001,cx0281a,cx0892a,cx0281a
1,Cc1nsc(N[C@@H]2CCC[C@H]2c2csc(-c3nc[nH]n3)n2)n1,CHIKVMac-DLS-JA002,cx0281a,cx0406a,cx0281a
2,CCc1nc(NC(=O)c2ccnc(NC3CC=CC3)c2)no1,CHIKVMac-DLS-JA003,cx0300a,cx0441a,cx0316a
3,O=C1CC[C@H](Cn2cnccc2=O)N1,CHIKVMac-DLS-JA004,cx0300a,cx0314a,cx1114a
4,O=c1ccncn1C[C@H]1CCS(=O)(=O)C1,CHIKVMac-DLS-JA005,cx0300a,cx0314a,cx1182a
...,...,...,...,...,...
349,O=C(Nc1ccccc1)c1c(CO)cc2c(c1O)OCC2,cx0969f-cx0692a,cx0270a,cx0692a,cx0969f
350,OCc1cc2c(c(Nc3ccc(O)cc3)c1O)OCC2,cx0969f-cx0935a,cx0270a,cx0935a,cx0969f
351,COCC(=O)Nc1cc(Br)cc(N)n1,cx1075a-cx0692a,cx0270a,cx0692a,cx1075a
352,COCC(=O)N[C@H]1CCCC[C@]12C(=O)NCC[C@@H]2Br,cx1076a-cx0692a,cx0270a,cx0692a,cx1076a


## 5. Save each row to a separate CSV

In [None]:
# Write each row of the master table to its own single-row CSV under
# `syndirella_input/`, named after the row's index label.
# Create the output directory once up front instead of once per row
# (it is therefore also created when `df` has no rows).
os.makedirs('syndirella_input', exist_ok=True)
for i, row in df.iterrows():
    # Series -> one-row DataFrame; the transpose turns the former column names
    # back into columns and keeps the row label as the index.
    row_df = row.to_frame().T
    # index=True writes that row label as the first CSV column.
    row_df.to_csv(f'syndirella_input/syndirella_input{i}.csv', index=True)