In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

from rdkit import Chem
from rdkit import DataStructs
# shut off warnings
from rdkit import RDLogger                                                                                                                                                               
RDLogger.DisableLog('rdApp.*')   

# Helper Functions

In [2]:
def call_hits_from_df(df, thresh):
    confirmvals = []
    for _, odvals in df.items():
        confirmed_hit = False
        for x in list(odvals):
            if x < thresh:
                confirmed_hit = True
        confirmvals.append(confirmed_hit)
    return(confirmvals)

def call_hits_from_concentration_df(rep1, rep2, thresh):
    # call a hit if highest concentration causes low OD
    rep1_od = []
    rep2_od = []
    confirmvals = []
    for i in range(len(rep1.iloc[0,:])):
        confirmed_hit = False
        highestconc_od1 = rep1.iloc[0,i]
        highestconc_od2 = rep2.iloc[0,i]
        rep1_od.append(highestconc_od1)
        rep2_od.append(highestconc_od2)
        if highestconc_od1 < thresh and highestconc_od2 < thresh:
            confirmed_hit = True
        confirmvals.append(confirmed_hit)
    return(confirmvals, rep1_od, rep2_od)

def match_mols_with_frags(moldf, moldf_smi_col, fragdf, fragdf_smi_col):
    molsmis = list(moldf[moldf_smi_col])
    mols = [Chem.MolFromSmiles(smi) for smi in molsmis]
    fragsmis = list(fragdf[fragdf_smi_col])
    frags = [Chem.MolFromSmiles(smi) for smi in fragsmis]

    matched_frag_smis = []
    matched_mol_smis = []
    # find the fragments they match to
    for i, mol in enumerate(mols):
        for j, frag in enumerate(frags):
            if mol.HasSubstructMatch(frag):
                matched_frag_smis.append(fragsmis[j]) # only works bc all smiles make valid mols
                matched_mol_smis.append(molsmis[i]) 
        
    matchdf = pd.DataFrame()
    matchdf['fragment_SMILES'] = matched_frag_smis
    matchdf['SMILES'] = matched_mol_smis
    return(matchdf)

def report_stats(df, smi_col = 'SMILES', hit_col = 'Confirmed-Hit'):
    # smi_col is mol smi col
    dedup = df.drop_duplicates(smi_col)
    length = len(dedup)
    hits = sum([1.0 if x == True else 0.0 for x in list(dedup[hit_col])])
    print('Number of compounds tested: ' + str(length))
    print('Number of confirmed hits: ' + str(int(hits)))
    print('Hit rate: ' + str(100 * np.round(float(hits)/length,4)) + '%')

def match_molport_mols_with_diff_smis(smilist1, smilist2):
    # find the fragments they match to
    corr_smilist1 = [] # expect these are molport smis
    corr_smilist2 = [] # expect these are our smis
    for i, smi1 in enumerate(smilist1):
        for j, smi2 in enumerate(smilist2):
            if type(smi2) == float:
                continue
            mol1 = Chem.MolFromSmiles(smi1)
            mol2 = Chem.MolFromSmiles(smi2)
            if mol2 is None:
                continue
            if smi2 in ['FC(F)(F)c1cccc(c1)C(=O)c1c[nH]c(c1)C(=O)NC[C@H]1CCCO1', 'CC1=NOC(=C1NC(=O)NC1=CC=C(Cl)C=C1)C1=CC=C(C=C1)C(F)(F)F', 'OC(=O)C[C@H]1CC[C@@H](CC1)C1=CC=C(NC(=O)C2=NN=C(NC3=CC(F)=C(F)C=C3)O2)C=C1']:
                continue # these are compounds w 2 matching compounds from our metadata list so we looked at the stereochemistry and picked the most similar
                        
            tan = DataStructs.TanimotoSimilarity(Chem.RDKFingerprint(mol1), Chem.RDKFingerprint(mol2))
            if tan >= 1.0:
                corr_smilist1.append(smi1)
                corr_smilist2.append(smi2)
    return(corr_smilist1, corr_smilist2)

# SA-1

In [3]:
# from the first experiment - all hits surveyed
rd1_first_exps = pd.read_excel('../data/experimental_validation/sa_rd1.xlsx') # aarti did not save MIC data for these
rd1_first_exps = rd1_first_exps[['Broad ID', 'SMILES', 'Name', 'Rep1', 'Rep2']]
rd1_first_exps = rd1_first_exps.iloc[0:42,:]
rd1_first_exps.columns = ['Broad_ID', 'SMILES', 'Name', 'Rep1-OD', 'Rep2-OD']
rd1_first_exps['Hit-OD'] = [x < 0.3 and y < 0.3 for x,y in zip(rd1_first_exps['Rep1-OD'], rd1_first_exps['Rep2-OD'])]

# hardcode the extraction of the CONFIRMED or NOT ODs with second round of validation
rd1 = pd.read_excel('../data/experimental_validation/sa_rd1_addtnl.xlsx')
cpd_smiles = list(rd1.iloc[4,2:26]) # first get names / labels
sa_rep = rd1.iloc[6:14,2:26].reset_index(drop = True) # get OD values
    
# call a hit if any concentration causes OD < 0.3, using this second round validaton
confirmdf = pd.DataFrame()
confirmdf['SMILES'] = cpd_smiles
confirmdf['Confirmed-Hit'] = call_hits_from_df(sa_rep, thresh = 0.3)
rd1_first_exps = rd1_first_exps.merge(confirmdf, on = 'SMILES', how = 'left')

# finally add in the fragment that we used
predfrags = pd.read_csv('../out/fragment_algorithm_pipeline_runs/06_SA_rd1/candidates_after_matching_and_filtering.csv')
matchdf = match_mols_with_frags(rd1_first_exps, 'SMILES', predfrags, 'fragment_SMILES')
rd1_first_exps = rd1_first_exps.merge(matchdf, on = 'SMILES', how = 'left')

# now add data to a running dataframe
rd1_first_exps['Round'] = ['Round1-Pilot'] * len(rd1_first_exps)
rd1_first_exps['ID'] = rd1_first_exps['Broad_ID']
columns = ['Round', 'ID', 'Name', 'SMILES', 'fragment_SMILES','Rep1-OD', 'Rep2-OD', 'Hit-OD', 'Confirmed-Hit']
rd1_first_exps = rd1_first_exps[columns]
fulldf = pd.DataFrame(rd1_first_exps)

print('Round 1 Statistics')
report_stats(rd1_first_exps)

Round 1 Statistics
Number of compounds tested: 42
Number of confirmed hits: 19
Hit rate: 45.24%


# SA-2

In [4]:
# annoying our smiles don't match the smiles from molport, so have to merge the data
molport_order_manual = pd.read_excel('../data/experimental_validation/sa_rd2.xlsx')
molport_order_manual = molport_order_manual.iloc[0:16,:]
metadata = pd.read_excel('../../generativeML/out/pipeline_v5_script/frag_0.05_17atom_800K_and_5mil/final_proposed_molecules_to_order_ANNOTATED_09_29_2022.xlsx')
molportsmis = list(molport_order_manual['SMILES'])
oursmis = list(metadata['SMILES'])
match_molportsmis, match_oursmis = match_molport_mols_with_diff_smis(molportsmis, oursmis)
metadata = metadata[[x in match_oursmis for x in oursmis]]

# hardcode the extraction of the ODs and HEP/HEK growth from the second round concentration data
rd2 = pd.read_excel('../data/experimental_validation/sa_rd2_addtnl.xlsx')
cpd_smiles = list(rd2.iloc[2,2:10]) # first get names / labels
cpd_names = list(rd2.iloc[3,2:10])
sa_rep1 = rd2.iloc[5:13,2:10].reset_index(drop = True) # get OD values
sa_rep2 = rd2.iloc[5:13,12:20].reset_index(drop = True)

# call a hit if any concentration causes OD < 0.3, using this second round validaton
confirmdf = pd.DataFrame()
confirm_hits, rep1_od, rep2_od = call_hits_from_concentration_df(sa_rep1, sa_rep2, thresh = 0.3)
confirmdf['Name'] = cpd_names
confirmdf['Rep1-OD'] = rep1_od
confirmdf['Rep2-OD'] = rep2_od
confirmdf['Confirmed-Hit'] = confirm_hits
molport_order_manual = molport_order_manual.merge(confirmdf, on = 'Name', how = 'left')
molport_order_manual['Hit-OD'] = [bool(x) for x in list(molport_order_manual['OD-Hit'])]

# finally add in the fragment that we used
predfrags = pd.read_csv('../../generativeML/out/pipeline_v5_script/frag_0.05_17atom_800K_and_5mil/more_info_candidates_v1_08_16_2022.csv')
matchdf = match_mols_with_frags(molport_order_manual, 'SMILES', predfrags, 'fragment_SMILES')
molport_order_manual = molport_order_manual.merge(matchdf, on = 'SMILES', how = 'left')

# now add data to a running dataframe
molport_order_manual['Round'] = ['Round2-17atom'] * len(molport_order_manual)
molport_order_manual['ID'] = molport_order_manual['Molport_ID']
molport_order_manual = molport_order_manual[columns]
fulldf = fulldf.append(molport_order_manual)

print('Round 2 Statistics')
report_stats(molport_order_manual)

Round 2 Statistics
Number of compounds tested: 16
Number of confirmed hits: 8
Hit rate: 50.0%


  fulldf = fulldf.append(molport_order_manual)


# SA-3

In [5]:
# we do not have OD data on this - only have MIC on the hits from here
rd3 = pd.read_csv('../../generativeML/out/pipeline_v5_script/frag_0.05_enamine18mil_mol_0.15_800K/final_proposed_and_annotated_mols_to_order_10_21_2022.csv')
rd3['ID'] = rd3['Name']

# positive hits reported by aarti
confirmedhits = ['BRD-A29973139', 'BRD-A19217117', 'BRD-A80110716', 'BRD-K32533226']
rd3['Confirmed-Hit'] = [x in confirmedhits for x in list(rd3['Name'])]

# combine with fragments data
predfrags = pd.read_csv('../../generativeML/out/pipeline_v5_script/frag_0.05_enamine18mil_mol_0.15_800K/more_info_candidates_v1_10_13_2022.csv')
matchdf = match_mols_with_frags(rd3, 'SMILES', predfrags, 'fragment_SMILES')
rd3 = rd3.merge(matchdf, on = 'SMILES', how = 'left')

# now add data to a running dataframe
rd3['Round'] = ['Round3-Enamine'] * len(rd3)
pared_down_cols = [col for col in columns if col not in ['Rep1-OD', 'Rep2-OD', 'Hit-OD']]
rd3 = rd3[pared_down_cols]
fulldf = fulldf.append(rd3)

print('Round 3 Statistics')
report_stats(rd3)

Round 3 Statistics
Number of compounds tested: 51
Number of confirmed hits: 4
Hit rate: 7.84%


  fulldf = fulldf.append(rd3)


# SA-4

In [6]:
# get combined data for round 4 and round 5
rd45 = pd.read_excel('../data/experimental_validation/sa_rd4_rd5_addtnl.xlsx')
labels = ['Round4-Enamine Fragments+Broad'] * 14
labels.extend(['Round5-GDB11,17+Broad'] * 17)
rd45['Round'] = labels
rd45['Rep1-OD'] = rd45['SA at 100uM']
rd45['ID'] = rd45['Name']
rd45 = rd45[['Round', 'ID', 'Name', 'SMILES', 'Rep1-OD']]
rd45['Hit-OD'] = [x < 0.3 for x in list(rd45['Rep1-OD'])]

# combine with fragments data - round 4
round4 = rd45[rd45['Round'] == 'Round4-Enamine Fragments+Broad']
rd4preds = pd.read_csv('../../generativeML/out/pipeline_v6_script/frag_0.05_allfrags_mol_0.15_800K/more_info_candidates_v1_12_16_2022.csv')
matchdf = match_mols_with_frags(round4, 'SMILES', rd4preds, 'fragment_SMILES')
round4 = round4.merge(matchdf, on = 'SMILES', how = 'left')

# add to running df
fulldf = fulldf.append(round4)
print('Round 4 Statistics')
report_stats(round4, hit_col = 'Hit-OD')

Round 4 Statistics
Number of compounds tested: 14
Number of confirmed hits: 4
Hit rate: 28.57%


  fulldf = fulldf.append(round4)


# SA-5

In [7]:
round5 = rd45[rd45['Round'] == 'Round5-GDB11,17+Broad']
# combine with fragments data - round 5
rd5preds = pd.read_csv('../../generativeML/out/pipeline_v6_script/frag_0.05_allfrags_mol_0.15_800K_redo_with_gdb11_and_17_fix_mistake/more_info_candidates_v1_12_20_2022.csv')
matchdf = match_mols_with_frags(round5, 'SMILES', rd5preds, 'fragment_SMILES')
round5 = round5.merge(matchdf, on = 'SMILES', how = 'left')

# add to running df
fulldf = fulldf.append(round5)
print('Round 5 Statistics')
report_stats(round5, hit_col = 'Hit-OD')

Round 5 Statistics
Number of compounds tested: 17
Number of confirmed hits: 6
Hit rate: 35.29%


  fulldf = fulldf.append(round5)


In [8]:
fulldf.to_csv('../data/experimental_validation/combined_cpd_and_frag_results_up_through_round5_sa.csv', index = False)

# SA-6

In [9]:
# combine with fragments data - round 6
rd6 = pd.read_excel('../data/experimental_validation/sa_rd6.xlsx', header = 1)
rd6.columns = ['', 'Compound', 'Rep1-OD', 'Rep1-MRSA', 'Rep2-OD', 'Rep1-MRSA', 'ID', 'Formula', 'MW', 'SMILES']
rd6['Round'] = ['Round6-Enamine_frags_no_cpd'] * len(rd6)
rd6['Name'] = ['mol' + str(i) for i in list(rd6['Compound'])]
rd6['Confirmed-Hit'] = [1.0 if x < 0.2 and y < 0.2 else 0.0 for x,y in zip(rd6['Rep1-OD'], rd6['Rep2-OD'])]

# combine with fragments data - round 6
rd6preds = pd.read_csv('../../generativeML/out/pipeline_v5_script/enamine_fragments_alone_v1/fragment_clusters_over_0.3/finalmols.csv')
matchdf = match_mols_with_frags(rd6, 'SMILES', rd6preds, 'fragment_SMILES')
rd6 = rd6.merge(matchdf, on = 'SMILES', how = 'left')
rd6 = rd6[['Round', 'ID', 'Name', 'SMILES', 'fragment_SMILES', 'Rep1-OD', 'Rep2-OD', 'Confirmed-Hit']]

# save it to its own dataframe
rd6.to_csv('../data/experimental_validation/round6_cleaned_results.csv', index=False)

print('Round 6 Statistics')
report_stats(rd6)

Round 6 Statistics
Number of compounds tested: 31
Number of confirmed hits: 5
Hit rate: 16.13%


# Get just active SA fragments

In [10]:
# through round 5
df = pd.read_csv('../data/experimental_validation/combined_cpd_and_frag_results_up_through_round5_sa.csv')

# keep if only have one "Hit-OD" or "Confirmed-Hit" and that one is positive; if we have both data, only keep if Confirmed-Hit = True
keep_indices = []
index = 0
for hit, confirmed in zip(list(df['Hit-OD']), list(df['Confirmed-Hit'])):
    if hit is True and type(confirmed) is float:
        keep_indices.append(index)
    elif type(hit) is float and confirmed is True:
        keep_indices.append(index)
    elif hit is True and confirmed is True:
        keep_indices.append(index)
    index = index + 1

df = df.iloc[keep_indices]
df = df.reset_index(drop = True)
df['compound_SMILES'] = list(df['SMILES'])
df = df[['Round','ID','Name','compound_SMILES','fragment_SMILES']]
df['Compound_of_Interest'] = ['Y' if n == '8M-713' else 'N' for n in list(df['Name'])]
df.to_csv('../data/experimental_validation/successful_combined_cpd_and_frag_results_up_through_round5_sa.csv')

# NG-1

In [11]:
# get metadata for round 1 ng
rd1_first_exps = pd.read_excel('../data/experimental_validation/ng_rd1.xlsx', header = 1) # aarti did not save MIC data for these

# asked melis to do a third/fourth rep of BRD-K54582152 because of two reps being wonky
cpd_names = []
binary_hits = []
for cpd, smalldf in rd1_first_exps.groupby('Broad Sample ID'):
    dftopconc = smalldf.dropna(subset = 'Graver Wade\n_1')
    dftopconc = dftopconc.sort_values('umol/L final', ascending = False)
    dftopconc = dftopconc.iloc[0,:]
    try: 
        float(dftopconc['Graver Wade\n_1'])
    except Exception as e:
        print(e)
        display(smalldf)
    avg_GW = np.mean([dftopconc['Graver Wade\n_1'], dftopconc['Graver Wade\n_2']])
    cpd_names.append(cpd)
    binary_hits.append(1.0 if avg_GW < 60 else 0.0)

# get the SMILES metadata
df = pd.DataFrame()
df['Broad_ID'] = cpd_names
df['hit'] = binary_hits
meta = pd.read_csv('../out/fragment_algorithm_pipeline_runs/12_NG_rd1/candidate_compounds_after_matching_and_filtering_with_metadata.csv')
meta = meta[['Name', 'smiles']]
meta.columns = ['Name', 'SMILES']
df['Name'] = [x.split('-')[0] + '-' + x.split('-')[1] for x in list(df['Broad_ID'])]
df = df.merge(meta, on = 'Name', how = 'left')

# finally add in the fragment that we used
predfrags = pd.read_csv('../out/fragment_algorithm_pipeline_runs/12_NG_rd1/candidates_after_matching_and_filtering.csv')
matchdf = match_mols_with_frags(df, 'SMILES', rd6preds, 'fragment_SMILES')
df = df.merge(matchdf, on = 'SMILES', how = 'left')

# add metadata and save
df['Round'] = ['Round1-NG'] * len(df)
df['ID'] = df['Broad_ID']
df = df[['Round', 'Name', 'hit', 'SMILES', 'fragment_SMILES']]
df.to_csv('../data/experimental_validation/ng_rd1_experimental.csv', index = False)

print('NG Round 1 Statistics')
report_stats(df, hit_col = 'hit')

NG Round 1 Statistics
Number of compounds tested: 70
Number of confirmed hits: 7
Hit rate: 10.0%
