In [1]:
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'

# Helper functions

In [2]:
def class_balance_random_split_smiles(
    df, smiles_col, hit_col, perc_train=0.8, random_state=None
):
    """Randomly split ``df`` into train/test sets, sampling each class separately.

    Rows with ``hit_col == 1.0`` (positives) and ``hit_col == 0.0`` (negatives)
    are each sampled with ``frac=perc_train`` to form the training set. The
    test set of a class is every row of that class whose SMILES does not
    appear among that class's training SMILES, so a SMILES never ends up on
    both sides of the split within a class. Rows whose label is neither 1.0
    nor 0.0 (e.g. NaN) are left out of all three returned frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least ``smiles_col`` and ``hit_col``.
    smiles_col : str
        Name of the column holding the SMILES strings.
    hit_col : str
        Name of the column holding the 1.0 / 0.0 labels.
    perc_train : float, default 0.8
        Fraction of each class sampled into the training set.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both classes. The default
        ``None`` keeps the previous, unseeded behaviour; pass an int to make
        the split reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(traindf, testdf, fulldf)`` where ``fulldf`` is ``traindf`` followed
        by ``testdf``. Training rows keep their original index; test rows are
        renumbered from 0 within each class.
    """

    def _split_one_class(label, tag):
        # One class at a time so both sets keep the class ratio of the input.
        subset = df[df[hit_col] == label]
        train = subset.sample(frac=perc_train, random_state=random_state)
        # Vectorized membership test instead of growing a frame row by row;
        # this also keeps the original column dtypes in the test set.
        held_out = ~subset[smiles_col].isin(train[smiles_col])
        test = subset[held_out].reset_index(drop=True)
        print(tag + " train: " + str(len(train)))
        print(tag + " test: " + str(len(test)))
        return train, test

    postrain, postest = _split_one_class(1.0, "pos")
    negtrain, negtest = _split_one_class(0.0, "neg")

    traindf = pd.concat([postrain, negtrain])
    testdf = pd.concat([postest, negtest])
    print("train size: " + str(len(traindf)))
    print("test size: " + str(len(testdf)))

    fulldf = pd.concat([traindf, testdf])
    print("fulldf size: " + str(len(fulldf)))

    return (traindf, testdf, fulldf)


def trim_df(df, name_col="Name", smiles_col="SMILES", hit_col="NG_hit"):
    """Return a standardized three-column copy of ``df``.

    Selects the name, SMILES and hit columns, renames them to ``"Name"``,
    ``"SMILES"`` and ``"hit"``, and converts the labels to float. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    name_col, smiles_col, hit_col : str
        Names of the source columns to keep.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``["Name", "SMILES", "hit"]`` and a float
        ``hit`` column.
    """
    # Explicit copy: the selection is written to below, and this no longer
    # relies on the chained-assignment warning being silenced.
    trimmed = df[[name_col, smiles_col, hit_col]].copy()
    trimmed.columns = ["Name", "SMILES", "hit"]
    # astype replaces the column, so the result is float even when the source
    # labels are integers (an in-place .loc[:, "hit"] assignment can keep the
    # original integer dtype on recent pandas).
    trimmed["hit"] = trimmed["hit"].astype(float)
    return trimmed


def correct_problematic_smiles(df, smiles_dict, name_col="Name", smiles_col="SMILES"):
    """Fill missing SMILES from a manually curated ``{Name: SMILES}`` mapping.

    For every row whose ``smiles_col`` value is missing (NaN, None or pd.NA)
    and whose ``name_col`` value is a key of ``smiles_dict``, the SMILES is
    replaced by the mapped value. Rows with a SMILES already present, and
    missing rows whose name is not in the mapping, are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to correct. It is modified in place.
    smiles_dict : dict
        Mapping of compound name to SMILES string.
    name_col : str, default "Name"
        Column holding the compound names.
    smiles_col : str, default "SMILES"
        Column holding the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    smiles_pos = df.columns.get_loc(smiles_col)
    names = df[name_col].tolist()
    # isna() is the explicit missing-value test; checking for the Python
    # float type would skip None and would overwrite a genuine float value.
    missing = df[smiles_col].isna().tolist()
    for row_pos, (name, is_missing) in enumerate(zip(names, missing)):
        if is_missing and name in smiles_dict:
            # Positional write, so a non-unique index cannot hit extra rows.
            df.iloc[row_pos, smiles_pos] = smiles_dict[name]
    return df


def check_differing_performance(df):
    """Return the rows of duplicated SMILES whose ``hit`` labels disagree.

    ``df`` must have ``"SMILES"`` and ``"hit"`` columns (the frames in this
    notebook are standardized to ``Name`` / ``SMILES`` / ``hit``). A SMILES is
    reported when it occurs more than once and its rows carry more than one
    distinct ``hit`` value; a missing label counts as its own value.

    Each conflicting SMILES is reported exactly once, with all of its rows in
    their original order and with their original index labels. Groups are
    ordered by where the SMILES is first seen repeated in ``df``.

    Returns
    -------
    pandas.DataFrame
        The conflicting rows, or an empty frame with columns
        ``["Name", "SMILES", "hit"]`` when there are none.
    """
    base_cols = ["Name", "SMILES", "hit"]

    # Number of distinct labels per SMILES (rows with a missing SMILES are
    # not grouped and therefore never reported).
    labels_per_smiles = df.groupby("SMILES")["hit"].nunique(dropna=False)
    pending = set(labels_per_smiles.index[labels_per_smiles > 1])

    # Single pass to order the conflicting SMILES by their first repeat.
    seen = set()
    ordered = []
    for smi in df["SMILES"]:
        if smi in seen:
            if smi in pending:
                # Report each group once, even if the SMILES occurs 3+ times.
                pending.discard(smi)
                ordered.append(smi)
        else:
            seen.add(smi)

    if not ordered:
        return pd.DataFrame(columns=base_cols)

    frames = [df[df["SMILES"] == smi] for smi in ordered]
    # Standard columns first, followed by any extra columns of ``df``.
    out_cols = base_cols + [col for col in df.columns if col not in base_cols]
    return pd.concat(frames).reindex(columns=out_cols)

In [3]:
# Structures (SMILES) for the Pharmakon library

# Library manifest, from Broad
pk = pd.read_excel("../data/library_info/PK180301.xls")

# Name -> SMILES mapping from Felix's manual validation; only the first two columns are kept
smilesmap = pd.read_csv(
    "../data/library_info/pk_np_smiles_mapping_manual.csv"
).iloc[:, 0:2]

# Join on compound name, because the name is what labels the data in every PK screen,
# then keep a single row per MOLENAME
pk = pk.merge(
    smilesmap, how="left", left_on="MOLENAME", right_on="NAME"
).drop_duplicates("MOLENAME")

# Part 1: Pharmakon screen alone

In [4]:
# Load the Pharmakon screen, keeping only the compound name and the hit call
df = pd.read_csv("../data/screening_data/NeisseriaGonorrhoeaePharmakonScreen.csv")[
    ["Name", "NG_hit"]
]

# Attach the library structures by name. The inner join removes 3 cpds with NaN SMILES
# (DEFERASIROX, TETRABENAZINE, D-(-)-FRUCTOSE) - fixed on second data round
df = trim_df(df.merge(pk, how="inner", left_on="Name", right_on="NAME"))
df.to_csv("../data/cleaned_screening_data/pk_screen_cleaned.csv", index=False)
df

Unnamed: 0,Name,SMILES,hit
0,GRISEOFULVIN,COC1=CC(=O)CC(C)C12Oc1c(Cl)c(OC)cc(OC)c1C2=O,0.0
1,SALSALATE,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,0.0
2,DANTHRON,O=C1c2cccc(O)c2C(=O)c2c(O)cccc21,0.0
3,MEQUINOL,COc1ccc(O)cc1,0.0
4,HYDROCORTISONE,CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO,0.0
...,...,...,...
1758,METICRANE,Cc1cc2c(cc1S(N)(=O)=O)S(=O)(=O)CCC2,0.0
1759,MEPARFYLON,C#CC(C)(O)CC,0.0
1760,PICONOL,OCc1ccccn1,0.0
1761,DESLORATIDINE,Clc1ccc2c(c1)CCc1cccnc1C2=C1CCNCC1,0.0


In [5]:
# Class-wise random train/test split of the cleaned Pharmakon screen
# (perc_train is left at the helper's default of 0.8).
out_dir = "../data/data_prep_for_ml/data_prep_for_ml_pk_screen/"
traindf, testdf, fulldf = class_balance_random_split_smiles(df, "SMILES", "hit")
# Writing the split to out_dir is disabled; uncomment the three lines below to regenerate the files.
# traindf.to_csv(out_dir + 'TRAIN_11_15_2021.csv', index = False)
# testdf.to_csv(out_dir + 'TEST_11_15_2021.csv', index = False)
# fulldf.to_csv(out_dir + 'FULL_11_15_2021.csv', index = False)
# Counts recorded by the author, presumably from the run that produced the 11_15_2021 files.
# NOTE(review): the split is called without a seed, and the output captured below this cell
# differs from these numbers (neg test 284, test size 349, fulldf size 1760).
# pos train: 262
# pos test: 65
# neg train: 1149
# neg test: 283
# train size: 1411
# test size: 348
# fulldf size: 1759

pos train: 262
pos test: 65
neg train: 1149
neg test: 284
train size: 1411
test size: 349
fulldf size: 1760


# Part 2: Pharmakon + 37K Screen

In [6]:
# Combined Pharmakon + 37K screening table
df = pd.read_csv(
    "../data/screening_data/Ngonorrhoeae_PharmakonAnd37K_all_forJackie.csv"
)

# 37K rows are standardized directly from the screening table
collins37kdf = trim_df(df[df["Library"] == "37K"])

# Pharmakon rows get their structures from the library table. This is a left merge,
# so compounds without a match are kept and corrected manually below.
pharmdf = trim_df(
    df.loc[df["Library"] == "Pharmakon", ["Name", "NG_hit"]].merge(
        pk, how="left", left_on="Name", right_on="NAME"
    )
)

# Manually supplied SMILES for the problematic compounds
missing_dict = {
    "DEFERASIROX": "C1=CC=C(C(=C1)C2=NN(C(=N2)C3=CC=CC=C3O)C4=CC=C(C=C4)C(=O)O)O",
    "TETRABENAZINE": "CC(C)CC1CN2CCC3=CC(=C(C=C3C2CC1=O)OC)OC",
    "D-(-)-FRUCTOSE": "C(C(C(C(C(=O)CO)O)O)O)O",
}
pharmdf = correct_problematic_smiles(pharmdf, missing_dict)

# Pharmakon rows first, then the 37K rows
df = pd.concat([pharmdf, collins37kdf])
df.to_csv("../data/cleaned_screening_data/pk_37k_screen_cleaned.csv", index=False)
df

Unnamed: 0,Name,SMILES,hit
0,GRISEOFULVIN,COC1=CC(=O)CC(C)C12Oc1c(Cl)c(OC)cc(OC)c1C2=O,0.0
1,SALSALATE,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,0.0
2,DANTHRON,O=C1c2cccc(O)c2C(=O)c2c(O)cccc21,0.0
3,MEQUINOL,COc1ccc(O)cc1,0.0
4,HYDROCORTISONE,CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO,0.0
...,...,...,...
38729,BRD-K55781443,C[C@H]1CN([C@@H](C)CO)C(=O)c2cccc(c2O[C@H]1CN(...,0.0
38730,BRD-K84613978,C[C@H]1CN([C@@H](C)CO)C(=O)Cc2cc(ccc2O[C@H]1CN...,0.0
38731,BRD-K44892556,C[C@@H]1CN([C@H](C)CO)C(=O)Cc2cc(ccc2O[C@H]1CN...,0.0
38732,BRD-K93515420,C[C@H]1CN([C@H](C)CO)C(=O)Cc2cc(ccc2O[C@H]1CN(...,0.0


In [7]:
# Class-wise random train/test split of the cleaned Pharmakon + 37K table
# (perc_train is left at the helper's default of 0.8).
out_dir = "../data/data_prep_for_ml/data_prep_for_ml_pk_37k_screen/"
traindf, testdf, fulldf = class_balance_random_split_smiles(df, "SMILES", "hit")
# Writing the split to out_dir is disabled; uncomment the three lines below to regenerate the files.
# traindf.to_csv(out_dir + 'TRAIN_03_19_2022.csv', index = False)
# testdf.to_csv(out_dir + 'TEST_03_19_2022.csv', index = False)
# fulldf.to_csv(out_dir + 'FULL_03_19_2022.csv', index = False)
# Counts recorded by the author, presumably from the run that produced the 03_19_2022 files.
# NOTE(review): the split is called without a seed, and the output captured below this cell
# differs from these numbers (neg test 7460, test size 7712, fulldf size 38679).
# pos train: 1017
# pos test: 252
# neg train: 29950
# neg test: 7461
# train size: 30967
# test size: 7713
# fulldf size: 38680

pos train: 1017
pos test: 252
neg train: 29950
neg test: 7460
train size: 30967
test size: 7712
fulldf size: 38679


# Part 3: Pharmakon, 37K Screen, and predicted+validated compounds

In [3]:
# Previously cleaned Pharmakon + 37K table
pk_37k = pd.read_csv("../data/cleaned_screening_data/pk_37k_screen_cleaned.csv")

# Validated model predictions; the column names are on the second row of the sheet
new_mols = trim_df(
    pd.read_excel(
        "../data/validated_model_predictions/2022-10-21 Broad800K validation with MP order 1 data summary Jackie.xlsx",
        header=1,
    ),
    name_col="Full Name",
)
df = pd.concat([pk_37k, new_mols])

# Some duplicate cpds had differing performance in the PK library and 37K library;
# list them so they can be resolved by hand
questiondf = check_differing_performance(df)
questiondf

Unnamed: 0,Name,SMILES,hit
476,TRAMIPROSATE,NCCCS(=O)(=O)O,1.0
9433,BRD-K82234479,NCCCS(=O)(=O)O,0.0
1009,TRIMETHADIONE,CN1C(=O)OC(C)(C)C1=O,1.0
11316,BRD-K16606819,CN1C(=O)OC(C)(C)C1=O,0.0
11368,BRD-K00004542,OS(=O)(=O)c1ccc(cc1)/C/1=C\2/N/C(=C(\C3=N/C(=C...,0.0
11380,BRD-K00004542,OS(=O)(=O)c1ccc(cc1)/C/1=C\2/N/C(=C(\C3=N/C(=C...,1.0
28972,BRD-K24871708,C/C=C/C1=C(N2[C@H](SC1)[C@H](NC(=O)[C@H](N)c1c...,1.0
29243,BRD-K24871708,C/C=C/C1=C(N2[C@H](SC1)[C@H](NC(=O)[C@H](N)c1c...,0.0
11245,BRD-K55044200,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(cc3)O)...,0.0
29395,BRD-K55044200,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(cc3)O)...,1.0


All four of the compounds that were discrepant in the Pharmakon and 37K libraries robustly inhibited growth in the Pharmakon set and did not inhibit growth in the 37K. I assume what happened is the compounds degraded or were lower purity in the 37K library, and the growth inhibition seen in the Pharmakon screen was real. There is some literature to support the killing ability of some of these compounds. So can we make these consistently "1"?

These four compounds robustly inhibited growth in the Pharmakon set but did not inhibit growth in the 37K library. The compounds may have degraded or been of lower purity in the 37K library. There is some literature to support the killing ability of these compounds, so we will consistently label them as hits:

- TRAMIPROSATE / BRD-K82234479
- TRIMETHADIONE / BRD-K16606819
- HYDROCHLOROTHIAZIDE / BRD-K13078532
- ZONISAMIDE / BRD-K48300629

For BRD-K00004542, there was visible growth in the well, so this should be a non-hit. BRD-K24871708 and BRD-K55044200 should both be hits.

In [4]:
# Manually resolve the conflicting labels listed in questiondf
df = pd.concat([pk_37k, new_mols])

# Keep one row per conflicting SMILES, starting from the rows that were called hits.
# .copy() makes this an independent frame, so the edit below is not a write into a
# slice of questiondf.
correctdf = questiondf[questiondf["hit"] == 1.0].copy()

# BRD-K00004542 is relabelled as a non-hit. The row is selected by name with a positional
# boolean mask rather than by index label, because the concatenation above does not reset
# the index and labels may repeat.
is_brd_k00004542 = (correctdf["Name"] == "BRD-K00004542").to_numpy()
assert is_brd_k00004542.any(), "BRD-K00004542 not found among the conflicting hits"
correctdf.loc[is_brd_k00004542, "hit"] = 0.0

# Drop every row of a conflicting SMILES from the main table ...
question_smis = list(set(list(questiondf["SMILES"])))
df = df[~df["SMILES"].isin(question_smis)]

# ... de-duplicate what is left, then add back the corrected rows
df = df.drop_duplicates("SMILES")
df = pd.concat([df, correctdf])
df.to_csv(
    "../data/cleaned_screening_data/pk_37k_first_round_val_cleaned.csv", index=False
)
df

Unnamed: 0,Name,SMILES,hit
0,GRISEOFULVIN,COC1=CC(=O)CC(C)C12Oc1c(Cl)c(OC)cc(OC)c1C2=O,0.0
1,SALSALATE,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,0.0
2,DANTHRON,O=C1c2cccc(O)c2C(=O)c2c(O)cccc21,0.0
3,MEQUINOL,COc1ccc(O)cc1,0.0
4,HYDROCORTISONE,CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO,0.0
...,...,...,...
11380,BRD-K00004542,OS(=O)(=O)c1ccc(cc1)/C/1=C\2/N/C(=C(\C3=N/C(=C...,0.0
28972,BRD-K24871708,C/C=C/C1=C(N2[C@H](SC1)[C@H](NC(=O)[C@H](N)c1c...,1.0
29395,BRD-K55044200,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(cc3)O)...,1.0
191,HYDROCHLOROTHIAZIDE,NS(=O)(=O)c1cc2c(cc1Cl)NCNS2(=O)=O,1.0


In [10]:
# Class-wise random train/test split of the table that includes the first validation round
# (perc_train is left at the helper's default of 0.8).
out_dir = "../data/data_prep_for_ml/data_prep_for_ml_pk_37k_first_round_val_screen/"
traindf, testdf, fulldf = class_balance_random_split_smiles(df, "SMILES", "hit")

# Writing the split to out_dir is disabled; uncomment the three lines below to regenerate the files.
# NOTE(review): the split is called without a seed, so a rerun may not reproduce the same rows.
# traindf.to_csv(out_dir + 'TRAIN_10_26_2022.csv', index = False)
# testdf.to_csv(out_dir + 'TEST_10_26_2022.csv', index = False)
# fulldf.to_csv(out_dir + 'FULL_10_26_2022.csv', index = False)
# Counts recorded by the author; they match the output captured below this cell.
# pos train: 1069
# pos test: 267
# neg train: 29943
# neg test: 7486
# train size: 31012
# test size: 7753

pos train: 1069
pos test: 267
neg train: 29943
neg test: 7486
train size: 31012
test size: 7753
fulldf size: 38765


# Part 4: Pharmakon, 37K Screen, and two additional rounds of model prediction validation

In [22]:
# Cleaned table from the previous part, plus the new validation data
pk_37k_1strd = pd.read_csv(
    "../data/cleaned_screening_data/pk_37k_first_round_val_cleaned.csv"
)
new_mols = pd.read_excel(
    "../data/validated_model_predictions/2023-03-27 Data update for Jackie.xlsx",
    header=0,
)

# SMILES for compounds that come from the Broad800K, matched on compound ID
broad800k = pd.read_csv("../data/library_info/PublicStructures.txt", sep="\t")[
    ["Name", "SMILES"]
]
new_mols = new_mols.merge(
    broad800k, how="left", left_on="Compound ID", right_on="Name"
)

# SMILES for external compounds, matched on batch ID
new_mol_metadata = pd.read_excel(
    "../data/library_info/Broad Compound Registration Melis to date.xlsx"
)[["Assigned Broad Name", "SMILES"]]
new_mols = new_mols.merge(
    new_mol_metadata, how="left", left_on="Batch ID", right_on="Assigned Broad Name"
)

# Collapse the two merged SMILES columns into one: take the Broad800K value when it is a
# string, otherwise fall back to the registration value
new_mols["SMILES"] = [
    broad_smi if type(broad_smi) is str else registered_smi
    for broad_smi, registered_smi in zip(new_mols["SMILES_x"], new_mols["SMILES_y"])
]
new_mols = new_mols[["Compound ID", "NG_hit", "SMILES"]].rename(
    columns={"Compound ID": "Name", "NG_hit": "hit"}
)

# Append to the previous round's data; on duplicate SMILES the first occurrence is kept
print(len(new_mols), len(pk_37k_1strd))
df = pd.concat([pk_37k_1strd, new_mols]).reset_index(drop=True)
df = df.drop_duplicates("SMILES")
df.to_csv(
    "../data/cleaned_screening_data/pk_37k_three_rounds_val_cleaned.csv", index=False
)
df

98 38791


  warn(msg)


Unnamed: 0,Name,SMILES,hit
0,GRISEOFULVIN,COC1=CC(=O)CC(C)C12Oc1c(Cl)c(OC)cc(OC)c1C2=O,0.0
1,SALSALATE,O=C(Oc1ccccc1C(=O)O)c1ccccc1O,0.0
2,DANTHRON,O=C1c2cccc(O)c2C(=O)c2c(O)cccc21,0.0
3,MEQUINOL,COc1ccc(O)cc1,0.0
4,HYDROCORTISONE,CC12CCC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO,0.0
...,...,...,...
38884,BRD-K34988597,Cc1cc(cc(c1-c1n[nH]c(c1Oc1ccccc1)C)O)O,0.0
38885,BRD-K34224013,OC(=O)c1cccc(c1)Nc1nc(cs1)-c1ccccn1,0.0
38886,BRD-K00126774,COC(=O)c1ccc(C[n+]2cc(-c3ccccc3)n3CCCc23)cc1.Br,0.0
38887,BRD-K00127413,COc1ccc(cc1)C(=O)C[n+]1cc(-c2ccc(OC)cc2)n2CCCc...,0.0


In [23]:
# Class-wise random train/test split of the three-round table
# (perc_train is left at the helper's default of 0.8).
out_dir = "../data/data_prep_for_ml/data_prep_for_ml_pk_37k_three_rounds_val/"
traindf, testdf, fulldf = class_balance_random_split_smiles(df, "SMILES", "hit")

# Unlike the earlier parts, this cell writes the split to disk every time it runs.
# NOTE(review): the split is called without a seed, so rerunning this cell overwrites the
# files with a different random split.
traindf.to_csv(out_dir + "TRAIN_03_31_2023.csv", index=False)
testdf.to_csv(out_dir + "TEST_03_31_2023.csv", index=False)
fulldf.to_csv(out_dir + "FULL_03_31_2023.csv", index=False)
# Counts recorded by the author; they match the output captured below this cell.
# pos train: 1085
# pos test: 271
# neg train: 30003
# neg test: 7501
# train size: 31088
# test size: 7772
# fulldf size: 38860 - rows whose hit label is NaN match neither class in the split helper
# and are therefore always dropped (26 such rows, per the author's note)

pos train: 1085
pos test: 271
neg train: 30003
neg test: 7501
train size: 31088
test size: 7772
fulldf size: 38860
