## Prepare data for analysis

In [122]:
import pandas as pd
from glob import glob
from scipy.stats import spearmanr, pearsonr
import numpy as np
import itertools
from fuzzywuzzy import fuzz # req. python-Levenshtein

### L1000

In [4]:
glob("../LINCS-Pilot1/L1000/*")

['../LINCS-Pilot1/L1000/level_5_rank.csv',
 '../LINCS-Pilot1/L1000/level_4.csv',
 '../LINCS-Pilot1/L1000/collapsed_replicates.csv',
 '../LINCS-Pilot1/L1000/replicate_level_l1k.csv',
 '../LINCS-Pilot1/L1000/level_4W.csv',
 '../LINCS-Pilot1/L1000/level_3.csv',
 '../LINCS-Pilot1/L1000/level_5_modz.csv']

In [6]:
level_4 = pd.read_csv('../LINCS-Pilot1/L1000/replicate_level_l1k.csv')
level_4

Unnamed: 0,cid,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,...,nearest_dose,brew_prefix,group_id,nsig,cc_q75,tas_q75,pert_iname_y,pert_type_y,moa,pert_id_dose
0,REP.A001_A549_24H_X1_B27:A03,0.3547,-0.4940,-0.1721,-0.0339,-0.4355,1.8263,-0.1316,0.0853,-0.4660,...,-666.00,REP.A001_A549_24H,REP.A001_A549_24H:A03,500,0.1800,0.165592,DMSO,ctl_vehicle,Control vehicle,DMSO
1,REP.A001_A549_24H_X1_B27:A04,-0.2130,0.4931,-0.8768,-0.6968,-1.7018,-0.3779,-0.6745,-1.9799,-1.1429,...,-666.00,REP.A001_A549_24H,REP.A001_A549_24H:A04,500,0.1800,0.165592,DMSO,ctl_vehicle,Control vehicle,DMSO
2,REP.A001_A549_24H_X1_B27:A05,-0.3280,-0.4555,0.3819,-1.1447,0.0951,-0.5498,-0.6745,-0.0950,-0.2946,...,-666.00,REP.A001_A549_24H,REP.A001_A549_24H:A05,500,0.1800,0.165592,DMSO,ctl_vehicle,Control vehicle,DMSO
3,REP.A001_A549_24H_X1_B27:A06,0.1175,-0.9678,-0.3185,0.4623,-0.2842,-0.9202,-0.8413,0.7160,0.3384,...,-666.00,REP.A001_A549_24H,REP.A001_A549_24H:A06,500,0.1800,0.165592,DMSO,ctl_vehicle,Control vehicle,DMSO
4,REP.A001_A549_24H_X1_B27:A07,0.5254,0.0000,-0.4691,0.3652,-1.0688,0.0793,-1.1123,0.1908,-0.8249,...,10.00,REP.A001_A549_24H,REP.A001_A549_24H:A07,6,0.0850,0.086878,aminoguanidine,trt_cp,Nitric oxide synthase inhibitor,BRD-K25114078_10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27832,REP.A028_A549_24H_X3_B29:P20,-1.4560,-1.0966,0.8633,-0.4426,2.3494,-0.0464,0.3133,0.0838,-0.3306,...,3.33,REP.A028_A549_24H,REP.A028_A549_24H:P20,6,0.1275,0.127916,cyanocobalamin,trt_cp,Methylmalonyl CoA mutase stimulant|Vitamin B,BRD-A97502381_3.33
27833,REP.A028_A549_24H_X3_B29:P21,0.5672,0.2946,-1.9837,-0.6680,0.5777,0.0080,-0.8276,0.5831,0.5613,...,1.11,REP.A028_A549_24H,REP.A028_A549_24H:P21,6,0.1275,0.127916,cyanocobalamin,trt_cp,Methylmalonyl CoA mutase stimulant|Vitamin B,BRD-A97502381_1.11
27834,REP.A028_A549_24H_X3_B29:P22,-0.9993,-0.6982,0.1973,-0.6394,0.1946,-0.2364,-0.4217,-0.4877,-1.8497,...,0.37,REP.A028_A549_24H,REP.A028_A549_24H:P22,6,0.1275,0.127916,cyanocobalamin,trt_cp,Methylmalonyl CoA mutase stimulant|Vitamin B,BRD-A97502381_0.37
27835,REP.A028_A549_24H_X3_B29:P23,0.3532,-0.7262,-0.1420,0.6304,1.0234,0.1308,-0.4872,0.7299,0.4346,...,0.12,REP.A028_A549_24H,REP.A028_A549_24H:P23,6,0.1275,0.127916,cyanocobalamin,trt_cp,Methylmalonyl CoA mutase stimulant|Vitamin B,BRD-A97502381_0.12


In [7]:
# define columns with data, and columns with metadata
gex_cols =  [col for col in level_4.columns if '_at' in col] # get cols for gex data
other_cols = [col for col in level_4.columns if '_at' not in col] # get metadata cols

In [None]:
# Collapse replicates
progress = progressbar.ProgressBar() # pbar
group_ids = list(set(level_4['pert_id_dose'].tolist())) # get group ids for subsetting

df_list = [] # init list of rows
for groupid in progress(group_ids):
    # subset replicates
    subset = level_4[level_4['pert_id_dose']==groupid]
    
    if len(subset) == 1: # if only one replicate just keep it
        consensus_sig = subset[gex_cols]
        other_data = subset[other_cols]
        other_data_df = pd.DataFrame(other_data.iloc[0,:]).T
        other_data_df.reset_index(inplace=True)
        consensus_sig[other_data_df.columns]=other_data_df

    if len(subset) ==2: # get median if 2 replicates
        gex_data = subset[gex_cols]
        consensus_sig = pd.DataFrame(gex_data.median()).T 
        other_data = subset[other_cols]
        other_data_df = pd.DataFrame(other_data.iloc[0,:]).T
        other_data_df.reset_index(inplace=True)
        consensus_sig[other_data_df.columns]=other_data_df
     
    if len(subset) > 2:
        # extract gex data
        gex_data = subset[gex_cols]
        gex_data_t = gex_data.T

        # get correlation matrix
        corr_reps = pd.DataFrame(gex_data_t.corr(method='spearman'))
        a = np.array(corr_reps)
        np.fill_diagonal(a,'NaN')
        corr_reps_nan = pd.DataFrame(a)

        # turn correlation into weights
        col_weights = []
        for col in corr_reps_nan:
            col_weight = corr_reps_nan[col].sum()
            col_weights.append(col_weight)
        try:
            col_weights_norm = [float(i)/sum(col_weights) for i in col_weights]
        except RuntimeWarning:
            break

        # linear combination of weighted signatures
        linear_list = []
        for idx, weight in enumerate(col_weights_norm):
            linear = gex_data_t.iloc[:,idx]*col_weights_norm[idx]
            linear_list.append(linear)
        combined_sig = sum(linear_list)
        combined_sig = list(map(str, combined_sig))
        consensus_sig = pd.DataFrame(combined_sig).T
        consensus_sig.columns = gex_data.columns

        # merge with other cols
        other_data = subset[other_cols]
        other_data_df = pd.DataFrame(other_data.iloc[0,:]).T
        other_data_df.reset_index(inplace=True)
        consensus_sig[other_data_df.columns]=other_data_df

    # apppend
    df_list.append(consensus_sig)

In [None]:
# concat rows to get final df of collapsed replicates
collapsed_replicates = pd.concat(df_list)

collapsed_replicates.to_csv("../LINCS-Pilot1/L1000/collapsed_replicates.csv",index=False)

In [12]:
collapsed_replicates_l1000 = pd.read_csv("../LINCS-Pilot1/L1000/collapsed_replicates.csv")
collapsed_replicates_l1000

Unnamed: 0,200814_at,222103_at,201453_x_at,204131_s_at,200059_s_at,205067_at,213702_x_at,214435_x_at,201334_s_at,213721_at,...,nearest_dose,brew_prefix,group_id,nsig,cc_q75,tas_q75,pert_iname_y,pert_type_y,moa,pert_id_dose
0,0.127162,-0.294365,0.137961,-1.109703,0.656604,0.292215,0.311967,0.447751,0.037203,-0.340880,...,0.37,REP.A003_A549_24H,REP.A003_A549_24H:C10,6.0,0.1150,0.123915,pemirolast,trt_cp,Mediator release inhibitor,BRD-K31731454_0.37
1,1.118993,0.227189,-0.331341,0.945928,1.006607,-0.650126,0.684648,-0.326644,0.527731,-0.207947,...,1.11,REP.A027_A549_24H,REP.A027_A549_24H:B09,6.0,0.2100,0.185869,oglemilast,trt_cp,Phosphodiesterase inhibitor,BRD-K92303087_1.11
2,0.245591,0.282655,0.218872,0.684023,0.714155,-0.664439,-0.010907,-0.899365,0.231252,-0.909358,...,3.33,REP.A022_A549_24H,REP.A022_A549_24H:O14,6.0,0.2175,0.188351,elesclomol,trt_cp,Oxidative stress inducer,BRD-K82135108_3.33
3,-0.335456,0.586743,-0.591421,0.264353,0.540535,-1.283997,0.211161,0.778915,1.353914,-0.230887,...,0.37,REP.A011_A549_24H,REP.A011_A549_24H:M16,6.0,0.2175,0.177029,esmolol,trt_cp,Adrenergic receptor antagonist,BRD-A07395371_0.37
4,0.080376,0.158033,-0.015609,0.048414,0.541595,0.090768,0.163494,0.014289,0.136539,-0.274216,...,0.12,REP.A016_A549_24H,REP.A016_A549_24H:D05,6.0,0.1950,0.144414,ilomastat,trt_cp,Matrix metalloprotease inhibitor,BRD-K51662849_0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8365,-0.025300,-0.622900,-0.100200,-0.095050,0.631850,-0.189400,-1.136650,0.164750,0.486000,1.446050,...,0.37,REP.A024_A549_24H,REP.A024_A549_24H:O16,6.0,0.0775,0.092266,bavisant,trt_cp,Histamine receptor antagonist,BRD-K79595931_0.37
8366,-0.613532,0.487759,-0.340595,-1.060928,-0.477456,1.896124,0.069570,0.890457,-0.556072,1.651810,...,3.33,REP.A004_A549_24H,REP.A004_A549_24H:A20,6.0,0.2150,0.188559,trimipramine,trt_cp,Norepinephrine reputake inhibitor|Tricyclic an...,BRD-A19195498_3.33
8367,-3.688331,-1.325052,-4.263548,-0.278700,-0.476889,3.471311,-6.102801,7.646394,0.513694,1.647447,...,0.37,REP.A023_A549_24H,REP.A023_A549_24H:F22,6.0,0.1075,0.088196,diacerein,trt_cp,Interleukin inhibitor,BRD-K69122748_0.37
8368,0.419450,-0.291217,0.388290,-1.011088,-0.436046,-0.783107,0.523110,0.540916,0.025963,-0.513424,...,0.37,REP.A026_A549_24H,REP.A026_A549_24H:B16,6.0,0.2700,0.213961,CYT387,trt_cp,,BRD-K87737963_0.37


### Cell Morphology

In [10]:
cell_painting = pd.read_csv('../LINCS-Pilot1/CellPainting/replicate_level_cp_normalized.csv.gz')
cell_painting.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Metadata_plate_map_name,Metadata_broad_sample,Metadata_mg_per_ml,Metadata_mmoles_per_liter,Metadata_solvent,Metadata_pert_id,Metadata_pert_mfc_id,Metadata_pert_well,Metadata_pert_id_vendor,Metadata_cell_id,...,pert_iname,InChIKey14,moa,target,broad_date,clinical_phase,alternative_moa,alternative_target,Metadata_dose_recode,Metadata_pert_id_dose
0,C-7161-01-LM6-018,DMSO,0.0,0.0,DMSO,,,A01,,A549,...,,,,,,,,,0.04,DMSO
1,C-7161-01-LM6-018,DMSO,0.0,0.0,DMSO,,,A02,,A549,...,,,,,,,,,0.04,DMSO
2,C-7161-01-LM6-018,DMSO,0.0,0.0,DMSO,,,A03,,A549,...,,,,,,,,,0.04,DMSO
3,C-7161-01-LM6-018,DMSO,0.0,0.0,DMSO,,,A04,,A549,...,,,,,,,,,0.04,DMSO
4,C-7161-01-LM6-018,DMSO,0.0,0.0,DMSO,,,A05,,A549,...,,,,,,,,,0.04,DMSO


In [None]:
# Collapse replicates
progress = progressbar.ProgressBar() # pbar
feature_cols = [col for col in cell_painting.columns if 'Metadata' not in col][:-13] # get cols for cp data
other_cols = list(set(cell_painting.columns) - set(feature_cols)) # get metadata cols
group_ids = list(set(cell_painting['Metadata_pert_id_dose'].tolist())) # get group ids for subsetting

df_list = [] # init list of rows
for groupid in progress(group_ids):
    # subset replicates
    subset = cell_painting[cell_painting['Metadata_pert_id_dose']==groupid]
    
    if len(subset) == 1: # if only one replicate just keep it
        consensus_sig = subset[feature_cols]
        other_data = subset[other_cols]
        other_data_df = pd.DataFrame(other_data.iloc[0,:]).T
        other_data_df.reset_index(inplace=True)
        consensus_sig[other_data_df.columns]=other_data_df

    if len(subset) ==2: # get median if 2 replicates
        cp_data = subset[feature_cols]
        consensus_sig = pd.DataFrame(cp_data.median()).T 
        other_data = subset[other_cols]
        other_data_df = pd.DataFrame(other_data.iloc[0,:]).T
        other_data_df.reset_index(inplace=True)
        consensus_sig[other_data_df.columns]=other_data_df
     
    if len(subset) > 2:
        # extract cp data
        cp_data = subset[feature_cols]
        cp_data_t = cp_data.T

        # get correlation matrix
        corr_reps = pd.DataFrame(cp_data_t.corr(method='spearman'))
        a = np.array(corr_reps)
        np.fill_diagonal(a,'NaN')
        corr_reps_nan = pd.DataFrame(a)

        # turn correlation into weights
        col_weights = []
        for col in corr_reps_nan:
            col_weight = corr_reps_nan[col].sum()
            col_weights.append(col_weight)
        try:
            col_weights_norm = [float(i)/sum(col_weights) for i in col_weights]
        except RuntimeWarning:
            break

        # linear combination of weighted signatures
        linear_list = []
        for idx, weight in enumerate(col_weights_norm):
            linear = cp_data_t.iloc[:,idx]*col_weights_norm[idx]
            linear_list.append(linear)
        combined_sig = sum(linear_list)
        combined_sig = list(map(str, combined_sig))
        consensus_sig = pd.DataFrame(combined_sig).T
        consensus_sig.columns = cp_data.columns

        # merge with other cols
        other_data = subset[other_cols]
        other_data_df = pd.DataFrame(other_data.iloc[0,:]).T
        other_data_df.reset_index(inplace=True)
        consensus_sig[other_data_df.columns]=other_data_df

    # apppend
    df_list.append(consensus_sig)

In [None]:
# concat rows to get final df of collapsed replicates
collapsed_replicates = pd.concat(df_list)

In [None]:
collapsed_replicates.to_csv("../LINCS-Pilot1/CellPainting/collapsed_replicates.csv",index=False)

In [13]:
collapsed_replicates_image = pd.read_csv("../LINCS-Pilot1/CellPainting/collapsed_replicates.csv")
collapsed_replicates_image

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,Cells_AreaShape_Extent,Cells_AreaShape_FormFactor,Cells_AreaShape_MajorAxisLength,Cells_AreaShape_MaxFeretDiameter,...,broad_id,Metadata_broad_sample,Metadata_alternative_moa,alternative_target,Metadata_dose_recode,Metadata_InChIKey14,Metadata_pert_id_vendor,Metadata_broad_sample_type,Metadata_mmoles_per_liter,Batch_Date
0,0.930886,-0.005492,0.469358,2.018657,1.426170,0.0,-1.348686,0.043441,0.975899,0.929508,...,BRD-K93123848,BRD-K93123848-001-04-1,,,0.12,YABJJWZLRMPFSI,,trt,0.12346,2016-03-22
1,-0.885841,-0.361874,-0.011127,8.049175,4.271658,0.0,-6.700333,-0.471099,0.389882,0.568959,...,BRD-K72215350,BRD-K72215350-001-06-5,,,10.00,WDSCBUNMANHPFH,,trt,10.00000,2016-03-22
2,4.563952,0.024727,-0.004964,0.496530,0.066990,0.0,-2.239999,-8.065502,4.028972,4.260322,...,BRD-A62025033,BRD-A62025033-001-01-8,,,1.11,,,trt,1.11110,2016-03-22
3,0.689370,0.361192,0.230801,-0.151581,0.641382,0.0,0.301334,-3.074680,0.628350,0.743517,...,BRD-K31627533,BRD-K31627533-001-09-5,,,0.12,QTTRZHGPGKRAFB,,trt,0.12346,2016-03-22
4,0.891200,0.362412,0.558520,0.341306,0.597724,0.0,-0.846607,-0.214280,1.211368,1.318890,...,BRD-K04956647,BRD-K04956647-003-02-5,,,3.33,KOHIRBRYDXPAMZ,,trt,3.33330,2016-03-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9390,-1.264650,0.242205,-0.216349,0.073602,0.977056,0.0,0.077212,-1.266096,-1.168330,-1.200381,...,BRD-K55966568,BRD-K55966568-001-05-4,,,1.11,NHFDRBXTEDBWCZ,,trt,1.11110,2016-03-22
9391,-0.257541,0.149125,0.269169,-0.335828,-0.238082,0.0,0.676743,1.988955,-0.370205,-0.341625,...,BRD-K82225283,BRD-K82225283-001-03-1,,,3.33,YWXYYJSYQOXTPL,,trt,3.33330,2016-03-22
9392,3.695259,1.150862,0.591856,0.127483,0.819892,0.0,0.469361,-5.235946,4.264099,4.360206,...,BRD-K41708980,BRD-K41708980-001-01-6,,,3.33,YNDXUCZADRHECN,,trt,3.33330,2016-03-22
9393,1.624630,1.208450,0.115900,-0.843845,-0.921275,0.0,-1.336675,-4.687450,2.547390,2.075770,...,BRD-K44353683,BRD-K44353683-001-08-3,,,3.33,OELFLUMRDSZNSF,,trt,3.33330,2016-03-22


### Compound structural data

First, find intersection of image + l1000 data and create a lookup file

In [22]:
l1000_comps = list(set(collapsed_replicates_l1000['pert_id'].tolist()))
len(l1000_comps) # 1403 L1000 compounds (plus nan)

1404

In [24]:
image_comps = list(set(collapsed_replicates_image['Metadata_pert_id'].tolist()))
len(image_comps) # l570 image compounds (plus nan)

1571

In [25]:
both_data = list(set(l1000_comps)&set(image_comps))
len(both_data) # 1402 in both (plus nan)

1403

In [35]:
compoundinfo_beta = pd.read_csv("compoundinfo_beta.txt",sep="\t")
compoundinfo_beta
compoundinfo_filtered = compoundinfo_beta[compoundinfo_beta['pert_id'].isin(both_data)]
compoundinfo_filtered_perts = list(set(compoundinfo_filtered['pert_id'].tolist()))
len(compoundinfo_filtered_perts) # we have all of the compound data here!

1402

In [43]:
compound_metadata = compoundinfo_filtered.iloc[:,[0,1,3,4,5]]
compound_metadata

Unnamed: 0,pert_id,cmap_name,moa,canonical_smiles,inchi_key
1,BRD-A12237696,L-citrulline,,NC(CCCNC(N)=O)C(O)=O,RHGKLRLOHDJJDR-UHFFFAOYSA-N
8,BRD-K05674516,PSI-7976,,CC(C)OC(=O)[C@H](C)N[P@@](=O)(OC[C@H]1O[C@@H](...,TTZHDVOVKQGIBA-YBSJRAAASA-N
15,BRD-K26341917,L-ergothioneine,,C[N+](C)(C)[C@@H](Cc1c[nH]c(=S)[nH]1)C(O)=O,SSISHJJTAXXQAX-ZETCQYMHSA-O
18,BRD-K29133151,BRD-K29133151,,OC(=O)CCCC=C/C[C@@H]1CO[C@@H](O[C@@H]1c1ccccc1...,ZWAVGFSZMACJHA-PMNBYGLBSA-N
19,BRD-K29968218,KX-2391,,O=C(Cc1ccc(cn1)-c1ccc(OCCN2CCOCC2)cc1)NCc1ccccc1,HUNGUWOZPQBXGX-UHFFFAOYSA-N
...,...,...,...,...,...
39305,BRD-A66155091,trilostane,3beta-hydroxy-delta5-steroid dehydrogenase inh...,C[C@]12CC[C@@H]3[C@@H](CC[C@@]45O[C@@H]4C(=O)C...,RLQVKDVIBJCQGE-VWWPZJNLSA-N
39306,BRD-A66155091,trilostane,3beta-hydroxy-delta5-steroid dehydrogenase inh...,C[C@]12CC[C@@H]3[C@@H](CC[C@@]45O[C@@H]4C(=O)C...,RLQVKDVIBJCQGE-VWWPZJNLSA-N
39307,BRD-A66155091,trilostane,3beta-hydroxy-delta5-steroid dehydrogenase inh...,C[C@]12CC[C@@H]3[C@@H](CC[C@@]45O[C@@H]4C(=O)C...,RLQVKDVIBJCQGE-VWWPZJNLSA-N
39315,BRD-K99504665,goserelin-acetate,Gonadotropin releasing factor hormone receptor...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,BLCLNMBMMGCOAS-URPVMXJPSA-N


Fill in gaps for MoA from data files

In [59]:
image_moa = collapsed_replicates_image[collapsed_replicates_image['Metadata_pert_id'].isin(both_data)].loc[:,["Metadata_pert_id","moa"]]
image_moa.drop_duplicates(inplace=True)
image_moa.drop(image_moa.tail(1).index,inplace=True)
image_moa

Unnamed: 0,Metadata_pert_id,moa
0,BRD-K93123848,RAF inhibitor|VEGFR inhibitor
1,BRD-K72215350,5 alpha reductase inhibitor
2,BRD-A62025033,
3,BRD-K31627533,glucocorticoid receptor agonist
4,BRD-K04956647,adrenergic receptor antagonist
...,...,...
5895,BRD-K88429204,dihydrofolate reductase inhibitor
6205,BRD-K84683831,bacterial 30S ribosomal subunit inhibitor|bact...
6348,BRD-K95773607,acetylcholine receptor antagonist
6421,BRD-K93460210,serotonin receptor antagonist|sodium channel b...


In [61]:
l1000_moa = collapsed_replicates_l1000[collapsed_replicates_l1000['pert_id'].isin(both_data)].loc[:,["pert_id","moa"]]
l1000_moa.drop_duplicates(inplace=True)
l1000_moa

Unnamed: 0,pert_id,moa
0,BRD-K31731454,Mediator release inhibitor
1,BRD-K92303087,Phosphodiesterase inhibitor
2,BRD-K82135108,Oxidative stress inducer
3,BRD-A07395371,Adrenergic receptor antagonist
4,BRD-K51662849,Matrix metalloprotease inhibitor
...,...,...
5273,BRD-A48237631,DNA alkylating drug|DNA inhibitor|DNA alkylati...
5370,BRD-K43887077,Dopamine receptor agonist
5392,BRD-K13819402,Acetylcholinesterase inhibitor|Monoamine oxida...
5807,BRD-K91544578,Immunostimulant


In [68]:
moa_both = pd.merge(l1000_moa,image_moa,left_on="pert_id",right_on="Metadata_pert_id",suffixes=("_l1000","_image")).drop(['Metadata_pert_id'],axis=1)
moa_both

Unnamed: 0,pert_id,moa_l1000,moa_image
0,BRD-K31731454,Mediator release inhibitor,mediator release inhibitor
1,BRD-K92303087,Phosphodiesterase inhibitor,phosphodiesterase inhibitor
2,BRD-K82135108,Oxidative stress inducer,oxidative stress inducer
3,BRD-A07395371,Adrenergic receptor antagonist,adrenergic receptor antagonist
4,BRD-K51662849,Matrix metalloprotease inhibitor,matrix metalloprotease inhibitor
...,...,...,...
1397,BRD-A48237631,DNA alkylating drug|DNA inhibitor|DNA alkylati...,DNA alkylating agent|DNA synthesis inhibitor
1398,BRD-K43887077,Dopamine receptor agonist,dopamine receptor agonist
1399,BRD-K13819402,Acetylcholinesterase inhibitor|Monoamine oxida...,acetylcholinesterase inhibitor|monoamine oxida...
1400,BRD-K91544578,Immunostimulant,immunostimulant


In [76]:
all_metadata = pd.merge(compound_metadata,moa_both,on="pert_id")
all_metadata = all_metadata.drop_duplicates(subset=["pert_id"])
all_metadata

Unnamed: 0,pert_id,cmap_name,moa,canonical_smiles,inchi_key,moa_l1000,moa_image
0,BRD-A12237696,L-citrulline,,NC(CCCNC(N)=O)C(O)=O,RHGKLRLOHDJJDR-UHFFFAOYSA-N,,nitric oxide stimulant
1,BRD-K05674516,PSI-7976,,CC(C)OC(=O)[C@H](C)N[P@@](=O)(OC[C@H]1O[C@@H](...,TTZHDVOVKQGIBA-YBSJRAAASA-N,RNA polymerase inhibitor,HCV inhibitor
2,BRD-K26341917,L-ergothioneine,,C[N+](C)(C)[C@@H](Cc1c[nH]c(=S)[nH]1)C(O)=O,SSISHJJTAXXQAX-ZETCQYMHSA-O,,free radical scavenger
3,BRD-K29133151,BRD-K29133151,,OC(=O)CCCC=C/C[C@@H]1CO[C@@H](O[C@@H]1c1ccccc1...,ZWAVGFSZMACJHA-PMNBYGLBSA-N,,thromboxane receptor antagonist
4,BRD-K29968218,KX-2391,,O=C(Cc1ccc(cn1)-c1ccc(OCCN2CCOCC2)cc1)NCc1ccccc1,HUNGUWOZPQBXGX-UHFFFAOYSA-N,,SRC inhibitor|tubulin polymerization inhibitor
...,...,...,...,...,...,...,...
3471,BRD-K21295289,senicapoc,Intermediate conductance potassium channel blo...,NC(=O)C(c1ccccc1)(c1ccc(F)cc1)c1ccc(F)cc1,SCTZUZTYRMOMKT-UHFFFAOYSA-N,Intermediate conductance potassium channel blo...,intermediate conductance potassium channel blo...
3472,BRD-K92213669,lomitapide,Microsomal trigylceride transfer protein inhib...,FC(F)(F)CNC(=O)C1(CCCCN2CCC(CC2)NC(=O)c2ccccc2...,MBBCVAKAJPKAKM-UHFFFAOYSA-N,Microsomal trigylceride transfer protein inhib...,microsomal trigylceride transfer protein inhib...
3473,BRD-A66155091,trilostane,3beta-hydroxy-delta5-steroid dehydrogenase inh...,C[C@]12CC[C@@H]3[C@@H](CC[C@@]45O[C@@H]4C(=O)C...,RLQVKDVIBJCQGE-VWWPZJNLSA-N,3beta-hydroxy-delta5-steroid dehydrogenase inh...,3beta-hydroxy-delta5-steroid dehydrogenase inh...
3476,BRD-K99504665,goserelin-acetate,Gonadotropin releasing factor hormone receptor...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,BLCLNMBMMGCOAS-URPVMXJPSA-N,Gonadotropin releasing factor hormone receptor...,gonadotropin releasing factor hormone receptor...


In [75]:
all_metadata = all_metadata.drop_duplicates(subset=["pert_id"])

Unnamed: 0,pert_id,cmap_name,moa,canonical_smiles,inchi_key,moa_l1000,moa_image
0,BRD-A12237696,L-citrulline,,NC(CCCNC(N)=O)C(O)=O,RHGKLRLOHDJJDR-UHFFFAOYSA-N,,nitric oxide stimulant
1,BRD-K05674516,PSI-7976,,CC(C)OC(=O)[C@H](C)N[P@@](=O)(OC[C@H]1O[C@@H](...,TTZHDVOVKQGIBA-YBSJRAAASA-N,RNA polymerase inhibitor,HCV inhibitor
2,BRD-K26341917,L-ergothioneine,,C[N+](C)(C)[C@@H](Cc1c[nH]c(=S)[nH]1)C(O)=O,SSISHJJTAXXQAX-ZETCQYMHSA-O,,free radical scavenger
3,BRD-K29133151,BRD-K29133151,,OC(=O)CCCC=C/C[C@@H]1CO[C@@H](O[C@@H]1c1ccccc1...,ZWAVGFSZMACJHA-PMNBYGLBSA-N,,thromboxane receptor antagonist
4,BRD-K29968218,KX-2391,,O=C(Cc1ccc(cn1)-c1ccc(OCCN2CCOCC2)cc1)NCc1ccccc1,HUNGUWOZPQBXGX-UHFFFAOYSA-N,,SRC inhibitor|tubulin polymerization inhibitor
...,...,...,...,...,...,...,...
3471,BRD-K21295289,senicapoc,Intermediate conductance potassium channel blo...,NC(=O)C(c1ccccc1)(c1ccc(F)cc1)c1ccc(F)cc1,SCTZUZTYRMOMKT-UHFFFAOYSA-N,Intermediate conductance potassium channel blo...,intermediate conductance potassium channel blo...
3472,BRD-K92213669,lomitapide,Microsomal trigylceride transfer protein inhib...,FC(F)(F)CNC(=O)C1(CCCCN2CCC(CC2)NC(=O)c2ccccc2...,MBBCVAKAJPKAKM-UHFFFAOYSA-N,Microsomal trigylceride transfer protein inhib...,microsomal trigylceride transfer protein inhib...
3473,BRD-A66155091,trilostane,3beta-hydroxy-delta5-steroid dehydrogenase inh...,C[C@]12CC[C@@H]3[C@@H](CC[C@@]45O[C@@H]4C(=O)C...,RLQVKDVIBJCQGE-VWWPZJNLSA-N,3beta-hydroxy-delta5-steroid dehydrogenase inh...,3beta-hydroxy-delta5-steroid dehydrogenase inh...
3476,BRD-K99504665,goserelin-acetate,Gonadotropin releasing factor hormone receptor...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,BLCLNMBMMGCOAS-URPVMXJPSA-N,Gonadotropin releasing factor hormone receptor...,gonadotropin releasing factor hormone receptor...


In [77]:
all_metadata.to_csv("compound_metadata.txt",sep="\t",index=None)

Now for every row, we want a final set of MoA labels

In [153]:
# function
def clean_moa(row,meta_moa,lincs_moa,image_moa):
    # split by "|"
    split_moas = []
    for moa in [meta_moa,lincs_moa,image_moa]:
        try:
            moa_list = moa.split("|")
        except:
            continue
        split_moas.append(moa_list)
    
    # flatten list
    moas_flat = [item for sublist in split_moas for item in sublist]
    
    # get unique moas
    if len(moas_flat)>0: # don't perform if no moas
        moas_lower = [moa.lower() for moa in moas_flat]
        moas_unique = list(set(moas_lower))
        moas_col = '|'.join(str(e) for e in moas_unique)
        
    # add back to row
    cleaned_row = row
    cleaned_row['moa_cleaned'] = moas_col

    # return row with added, cleaned moa col
    return(cleaned_row)
    

In [155]:
# apply the function
cleaned_metadata = all_metadata.apply(lambda row: clean_moa(row,row['moa'], row['moa_l1000'],row['moa_image']), axis=1)
cleaned_metadata

Unnamed: 0,pert_id,cmap_name,moa,canonical_smiles,inchi_key,moa_l1000,moa_image,moa_cleaned
0,BRD-A12237696,L-citrulline,,NC(CCCNC(N)=O)C(O)=O,RHGKLRLOHDJJDR-UHFFFAOYSA-N,,nitric oxide stimulant,nitric oxide stimulant
1,BRD-K05674516,PSI-7976,,CC(C)OC(=O)[C@H](C)N[P@@](=O)(OC[C@H]1O[C@@H](...,TTZHDVOVKQGIBA-YBSJRAAASA-N,RNA polymerase inhibitor,HCV inhibitor,hcv inhibitor|rna polymerase inhibitor
2,BRD-K26341917,L-ergothioneine,,C[N+](C)(C)[C@@H](Cc1c[nH]c(=S)[nH]1)C(O)=O,SSISHJJTAXXQAX-ZETCQYMHSA-O,,free radical scavenger,free radical scavenger
3,BRD-K29133151,BRD-K29133151,,OC(=O)CCCC=C/C[C@@H]1CO[C@@H](O[C@@H]1c1ccccc1...,ZWAVGFSZMACJHA-PMNBYGLBSA-N,,thromboxane receptor antagonist,thromboxane receptor antagonist
4,BRD-K29968218,KX-2391,,O=C(Cc1ccc(cn1)-c1ccc(OCCN2CCOCC2)cc1)NCc1ccccc1,HUNGUWOZPQBXGX-UHFFFAOYSA-N,,SRC inhibitor|tubulin polymerization inhibitor,src inhibitor|tubulin polymerization inhibitor
...,...,...,...,...,...,...,...,...
3471,BRD-K21295289,senicapoc,Intermediate conductance potassium channel blo...,NC(=O)C(c1ccccc1)(c1ccc(F)cc1)c1ccc(F)cc1,SCTZUZTYRMOMKT-UHFFFAOYSA-N,Intermediate conductance potassium channel blo...,intermediate conductance potassium channel blo...,intermediate conductance potassium channel blo...
3472,BRD-K92213669,lomitapide,Microsomal trigylceride transfer protein inhib...,FC(F)(F)CNC(=O)C1(CCCCN2CCC(CC2)NC(=O)c2ccccc2...,MBBCVAKAJPKAKM-UHFFFAOYSA-N,Microsomal trigylceride transfer protein inhib...,microsomal trigylceride transfer protein inhib...,microsomal trigylceride transfer protein inhib...
3473,BRD-A66155091,trilostane,3beta-hydroxy-delta5-steroid dehydrogenase inh...,C[C@]12CC[C@@H]3[C@@H](CC[C@@]45O[C@@H]4C(=O)C...,RLQVKDVIBJCQGE-VWWPZJNLSA-N,3beta-hydroxy-delta5-steroid dehydrogenase inh...,3beta-hydroxy-delta5-steroid dehydrogenase inh...,3beta-hydroxy-delta5-steroid dehydrogenase inh...
3476,BRD-K99504665,goserelin-acetate,Gonadotropin releasing factor hormone receptor...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,BLCLNMBMMGCOAS-URPVMXJPSA-N,Gonadotropin releasing factor hormone receptor...,gonadotropin releasing factor hormone receptor...,gonadotropin releasing factor hormone receptor...


In [156]:
# save metadata file
cleaned_metadata.to_csv("cleaned_metadata.txt",sep="\t",index=None)