In [1]:
#import packages and define functions

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
from statsmodels import robust
from collections import OrderedDict, defaultdict
import warnings
from functools import reduce
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

#define index of dispersion function as standard deviation squared over the mean intensity
def index_of_dispersion(df, stddev_column,mean_intensity_column):
    stddev_squared = df[stddev_column].apply(lambda x: x*x)
    d = stddev_squared/df[mean_intensity_column]
    return d


#parameters for marker perturbation 
def marker_perturbation_score(df):
    if (-2.5 < df['mad_z_score'] < 2.5):
        return 0
    elif (-5 < df['mad_z_score'] < -2.5):
        return 0.5
    elif (2.5 < df['mad_z_score'] < 5):
        return 0.5
    elif (df['mad_z_score'] < -5):
        return 1
    elif (df['mad_z_score'] > 5):
        return 1

def score(df,ids, groups,raw_value, ad_value):
    df1 = df.melt(id_vars = ids, value_vars = [raw_value, ad_value],var_name = ['inputs'])
    print('df1' + '_' + str(raw_value) + ':' + str(len(df1)))
    
    dmso_only= df1[df1.control_or_sample == 'N1'] #selects for control values
    print('dmso_only:' + str(len(dmso_only)))
    
    control_median = dmso_only.loc[dmso_only['inputs'] == raw_value].groupby(groups).agg(
        'median').reset_index().rename(columns = {'value': 'control_medians'}) 
    print('control_median:' + str(len(control_median)))

    #calculate control mads by taking the median of the absolute deviation 
    control_mad = dmso_only[dmso_only.inputs == ad_value].groupby(groups).agg(
        'median').reset_index().rename(columns = {'value': 'control_mad'})
    print('control_median:' + str(len(control_median)))
    
    #merge median and mad columns
    medians_and_mads = control_median.merge(control_mad)
    print('medians_and_mads:' + str(len(medians_and_mads)))

    #merge control medians and mads back into original data frame
    melted_harmony_file_by_control = df1.merge(medians_and_mads, how = 'left' ,on = [channel_column1,channel_column2,channel_column3,'cell_line','user_timepoint'])    
    print('melted_harmony_file_by_control:' + str(len(melted_harmony_file_by_control)))
    
    #remove absolute deviation values as they are not needed
    
    harmony_file_by_control = melted_harmony_file_by_control[melted_harmony_file_by_control.inputs != ad_value]

    harmony_file_by_control.insert(harmony_file_by_control.shape[1], 'mad_z_score', (
        (harmony_file_by_control.loc[:,'value']) - (harmony_file_by_control.loc[:,'control_medians']))/(harmony_file_by_control.loc[:,'control_mad']))
    #clean
    mad_z_score = harmony_file_by_control.copy().drop(columns = ['compound_conc_y','cell_count_y','inputs'])   
    
    #apply marker perturbation function to each value of mad_z_score,
    mad_z_score['marker_perturbation_score'] = mad_z_score.apply(marker_perturbation_score, axis = 1)
    print('mad_z_score:' + str(len(mad_z_score)))
    
    return mad_z_score 







FOR JOHN AND DREW

In [13]:
## copy path file here 
harmony_file_1 = pd.read_excel('/Users/juliannalamm/Library/CloudStorage/OneDrive-SharedLibraries-DewpointTherapeutics/dpaint - dpaint_data/core_experiments/dpaint-001/raw_files/foundry_files/evaluation2_updatedpipeline_foundryfile/dpaint-001_220607.xlsx') #import files

#define channels here 
channel1 = '488'
channel2 = '405'
channel3 = '647'

In [14]:


# drops the notes header in original file, not needed if there is no 
# harmony_file_1 = harmony_file_1.rename(columns = harmony_file_1.iloc[0]).drop(harmony_file_1.index[0])

#convert column and row numbers to string for processing later 
harmony_file_1['column'] = harmony_file_1['column'].astype(str)
harmony_file_1['row'] = harmony_file_1['row'].astype(str)
harmony_file_1['plate_id'] = harmony_file_1['plate_id'].astype(int)






Unnamed: 0,compound_id,plate_id,well_position,row,column,source_zipfile_path,created_at,file_path_in_zip_file,database_name,database_location,...,smiles,molecular_weight,num_rule_of_5_violations,pathway,target,biological_info,cdd_compound_names,cdd_number,synonyms,collection_names
0,-999,10696883,E2,5,2,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696883/Evaluation7/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,,,,,,,,,,
1,D-0015445,10696889,N4,14,4,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696889/Evaluation9/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,CN1CCN(CC1)C1=CC=C(C(=O)NC2=NNC3=C2C=C(CC2=CC(...,560.65,2.0,Protein Tyrosine Kinase,"Trk receptor,ALK",Entrectinib (RXDX-101) is an orally bioavailab...,['Entrectinib (RXDX-101)'],CDD-2369597,"""'NMS-E628','Entrectinib (RXDX-101)'""","""'2020-0612 FUS SG full deck preliminary IC50s..."
2,-999,10696883,C47,3,47,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696883/Evaluation7/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,,,,,,,,,,
3,-999,10696883,F2,6,2,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696883/Evaluation7/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,,,,,,,,,,
4,-999,10696888,N2,14,2,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696888/Evaluation8/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15451,D-0362011,10696887,Z9,26,9,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696887/Evaluation10/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,C[C@@H](OC1=CC(=CN=C1N)C1=CN(N=C1)C1CCNCC1)C1=...,450.34,0.0,Autophagy; Protein Tyrosine Kinase/RTK,ALK; Autophagy; c-Met/HGFR; ROS,Crizotinib (PF-02341066) is an orally bioavail...,,CDD-1406937,"""'Crizotinib','PF-02341066'""","""Analogs: Alk Inhibitors"""
15452,D-0362011,10696886,Z9,26,9,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696886/Evaluation7/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,C[C@@H](OC1=CC(=CN=C1N)C1=CN(N=C1)C1CCNCC1)C1=...,450.34,0.0,Autophagy; Protein Tyrosine Kinase/RTK,ALK; Autophagy; c-Met/HGFR; ROS,Crizotinib (PF-02341066) is an orally bioavail...,,CDD-1406937,"""'Crizotinib','PF-02341066'""","""Analogs: Alk Inhibitors"""
15453,D-0362011,10696902,Z9,26,9,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696902/Evaluation23/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,C[C@@H](OC1=CC(=CN=C1N)C1=CN(N=C1)C1CCNCC1)C1=...,450.34,0.0,Autophagy; Protein Tyrosine Kinase/RTK,ALK; Autophagy; c-Met/HGFR; ROS,Crizotinib (PF-02341066) is an orally bioavail...,,CDD-1406937,"""'Crizotinib','PF-02341066'""","""Analogs: Alk Inhibitors"""
15454,D-0362011,10696885,Z9,26,9,Eval_2022-05-23_dpaint-001_Texture3.zip,2022-05-24 12:40:37.733,10696885/Evaluation9/PlateResults.txt,General Lab Phenix Plus,http://10.22.1.36/ODA/OdaService.asmx,...,C[C@@H](OC1=CC(=CN=C1N)C1=CN(N=C1)C1CCNCC1)C1=...,450.34,0.0,Autophagy; Protein Tyrosine Kinase/RTK,ALK; Autophagy; c-Met/HGFR; ROS,Crizotinib (PF-02341066) is an orally bioavail...,,CDD-1406937,"""'Crizotinib','PF-02341066'""","""Analogs: Alk Inhibitors"""


Calculate Index of Dispersion (D) 

In [5]:
# separate columns by channel, can delete channels that are not needed 


channel_column1 = str('channel' + channel1)
channel_column2 = str('channel' + channel2)
channel_column3 = str('channel' + channel3)

channels = (channel1, channel2, channel3)
channel_columns = (channel_column1, channel_column2, channel_column3)

stddev_channel1 = [str('nucleus_stddev_mean_perwell_' + channel1), str('cytoplasm_stddev_mean_perwell_' + channel1)]
stddev_channel2 = [str('nucleus_stddev_mean_perwell_' + channel2), str('cytoplasm_stddev_mean_perwell_' + channel2)]
stddev_channel3 = [str('nucleus_stddev_mean_perwell_' + channel3), str('cytoplasm_stddev_mean_perwell_' + channel3)]
mean_intensity_channel1 = [str('nucleus_intensity_mean_perwell_' + channel1),str('cytoplasm_intensity_mean_perwell_'+ channel1)]
mean_intensity_channel2 = [str('nucleus_intensity_mean_perwell_' + channel2),str('cytoplasm_intensity_mean_perwell_'+ channel2)]
mean_intensity_channel3 = [str('nucleus_intensity_mean_perwell_' + channel3),str('cytoplasm_intensity_mean_perwell_'+ channel3)]



In [7]:
#columns from original dataframe to be used in d calculations
stddev_columns = stddev_channel1 + stddev_channel2 + stddev_channel3
mean_intensity_columns = mean_intensity_channel1 + mean_intensity_channel2 + mean_intensity_channel3

# iterate through column values and create new table with d values 
d_values = []
for (i,j) in zip(stddev_columns, mean_intensity_columns):
    d = index_of_dispersion(harmony_file_1,i,j)
    d_values.append(d)
d_column_names = ['cyto_d_' + channel1,'nuc_d_'+ channel1,'nuc_d_' + channel2 , 'cyto_d_' + channel2,'nuc_d_' + channel3,'cyto_d_' + channel3]
d_values_table = pd.DataFrame(d_values).transpose()
d_values_table.columns = d_column_names

#add new Index of dispersion Columns to original table
harmony_file_with_d = pd.concat([d_values_table, harmony_file_1],axis=1)
harmony_file_with_d['column'] = harmony_file_with_d['column'].astype(str)
harmony_file_with_d['row'] = harmony_file_with_d['row'].astype(str)



In [8]:
# generate medians for CV and d
cv_channels = channels
d_channels = channels

for cv_channels in cv_channels: 
    harmony_file_with_d['median_cyto_cv_' + str(cv_channels)] = (harmony_file_with_d.groupby(['compound_name','compound_conc','channel'+ str(cv_channels),'cell_line','user_timepoint'])['cyto_cv_'+ str(cv_channels)].transform(np.median))
    harmony_file_with_d['median_cyto_d_' + str(cv_channels)] = (harmony_file_with_d.groupby(['compound_name','compound_conc','channel'+ str(cv_channels),'cell_line','user_timepoint'])['cyto_d_'+ str(cv_channels)].transform(np.median))
    harmony_file_with_d['median_nuc_cv_' + str(cv_channels)] = (harmony_file_with_d.groupby(['compound_name','compound_conc','channel'+ str(cv_channels),'cell_line','user_timepoint'])['nuc_cv_'+ str(cv_channels)].transform(np.median))
    harmony_file_with_d['median_nuc_d_' + str(cv_channels)] = (harmony_file_with_d.groupby(['compound_name','compound_conc','channel'+ str(cv_channels),'cell_line','user_timepoint'])['nuc_d_'+ str(cv_channels)].transform(np.median))
harmony_file_with_medians = harmony_file_with_d.set_index(['compound_name',channel_column1,channel_column2,channel_column3,'cell_line','compound_conc','user_timepoint','well_id'])

# # get the absolute deviations 
for d_channels in d_channels: 
    harmony_file_with_medians['ad_cyto_cv_' + str(d_channels)] =np.abs((harmony_file_with_medians['cyto_cv_' + str(d_channels)]).sub(harmony_file_with_medians['median_cyto_cv_' + str(d_channels)]))
    harmony_file_with_medians['ad_nuc_cv_' + str(d_channels)] = np.abs((harmony_file_with_medians['nuc_cv_' + str(d_channels)]).sub(harmony_file_with_medians['median_nuc_cv_' + str(d_channels)]))
    harmony_file_with_medians['ad_cyto_d_' + str(d_channels)] = np.abs((harmony_file_with_medians['cyto_d_' + str(d_channels)]).sub(harmony_file_with_medians['median_cyto_d_' + str(d_channels)]))
    harmony_file_with_medians['ad_nuc_d_' + str(d_channels)] = np.abs((harmony_file_with_medians['nuc_d_' + str(d_channels)]).sub(harmony_file_with_medians['median_cyto_d_' + str(d_channels)]))


In [9]:

ad_values =  ['ad_cyto_cv_' + channel1,'ad_cyto_cv_' + channel2,'ad_cyto_cv_' + channel3 ,'ad_nuc_cv_' + channel1 ,'ad_nuc_cv_' + channel2 ,'ad_nuc_cv_' + channel3, 'ad_cyto_d_' + channel1 ,'ad_cyto_d_' + channel2,'ad_cyto_d_' + channel3,'ad_nuc_d_' + channel1,'ad_nuc_d_' + channel2,'ad_nuc_d_' + channel3]

raw_values = ['cyto_cv_' + channel1,'cyto_cv_' + channel2,'cyto_cv_' + channel3,'nuc_cv_' + channel1, 'nuc_cv_' + channel2, 'nuc_cv_' + channel3, 'cyto_d_' + channel1, 'cyto_d_' + channel2,'cyto_d_' + channel3, 'nuc_d_' + channel1,'nuc_d_' + channel2,'nuc_d_' + channel3]

groups = [channel_column1, channel_column2, channel_column3,'cell_line','user_timepoint','compound_conc']

ids = [channel_column1, channel_column2, channel_column3, 'compound_name','cell_line','user_timepoint','compound_conc','condensate_name','cell_compartment','cell_count','well_id','control_or_sample']
df = harmony_file_with_medians.reset_index()



d = {}
for (raw_value, ad_value) in zip(raw_values, ad_values):
    d[raw_value] = score(df,ids, groups,raw_value, ad_value)
    if 'nuc' in raw_value:
        d[raw_value] = d[raw_value].loc[d[raw_value]['cell_compartment'] == 'nucleus']
    if 'cyto' in raw_value:
        d[raw_value] = d[raw_value].loc[d[raw_value]['cell_compartment'] == 'cytoplasm']
    if channel1 in raw_value:
        d[raw_value] = d[raw_value].drop(columns = [channel_column2,channel_column3]).rename(columns = {channel_column1:'marker'})
    if channel3 in raw_value:
        d[raw_value] = d[raw_value].drop(columns = [channel_column1, channel_column2]).rename(columns = {channel_column3:'marker'})
    if channel2 in raw_value: 
        d[raw_value] = d[raw_value].drop(columns = [channel_column1,channel_column3]).rename(columns = {channel_column2:'marker'})
    if 'd' in raw_value:
        d[raw_value] = d[raw_value].rename(columns = {'value': 'iod','mad_z_score': 'iod_mad_z_score'})
    if 'cv' in raw_value:
        d[raw_value] = d[raw_value].rename(columns = {'value': 'cv','mad_z_score': 'cv_mad_z_score'})


df1_cyto_cv_488:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_control:30912
mad_z_score:15456
df1_cyto_cv_405:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_control:30912
mad_z_score:15456
df1_cyto_cv_647:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_control:30912
mad_z_score:15456
df1_nuc_cv_488:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_control:30912
mad_z_score:15456
df1_nuc_cv_405:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_control:30912
mad_z_score:15456
df1_nuc_cv_647:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_control:30912
mad_z_score:15456
df1_cyto_d_488:30912
dmso_only:3504
control_median:12
control_median:12
medians_and_mads:12
melted_harmony_file_by_

In [10]:
data = {}
for key, dataframe in d.items():
    dataframe['median_score'] = dataframe.groupby(['compound_name','cell_line','compound_conc_x','user_timepoint','marker','well_id'])['marker_perturbation_score'].transform('median') 
#     dataframe['Summed_median_Score'] = dataframe.groupby(['compound_id','cell_line','compound_conc_x','user_timepoint'])['median_Score'].transform('median')
#     dataframe['Global_perturbation_score'] =dataframe['Summed_Score']/(len(np.unique(dataframe['marker'])))
    data[key] = dataframe.drop(columns = ['control_medians','control_mad'])

d_dataframes = [] 
cv_dataframes = []
for key in data:
    if 'd' in key:
        d_dataframes.append(data[key])
        print(str(key) + ':' + str(len(data[key])))
    if 'cv' in key:
        cv_dataframes.append(data[key]) 
        print(str(key) + ':' + str(len(data[key])))



cyto_cv_488:5152
cyto_cv_405:5152
cyto_cv_647:5152
nuc_cv_488:10304
nuc_cv_405:10304
nuc_cv_647:10304
cyto_d_488:5152
cyto_d_405:5152
cyto_d_647:5152
nuc_d_488:10304
nuc_d_405:10304
nuc_d_647:10304


In [11]:
concat_d = pd.concat(d_dataframes).rename(columns = {'marker_perturbation_score':'d_marker_perturbation_score', 'median_score':'d_median_marker_perturbation_score'}).set_index(
    ['compound_name','marker','cell_line','user_timepoint','compound_conc_x','condensate_name','cell_compartment','well_id','cell_count_x'])
concat_cv = pd.concat(cv_dataframes).rename(columns = {'marker_perturbation_score':'cv_marker_perturbation_score', 'median_score':'cv_median_marker_perturbation_score'}).set_index(
    ['compound_name','marker','cell_line','user_timepoint','compound_conc_x','condensate_name','cell_compartment','well_id','cell_count_x'])

cv_no_n2 = concat_cv[(concat_cv.control_or_sample != 'N2')]
d_no_n2 = concat_d[(concat_cv.control_or_sample != 'N2')]


In [12]:
marker_perturbation= pd.concat([cv_no_n2,d_no_n2], axis = 1).reset_index()
marker_perturbation
marker_perturbation['cv_summed_scores'] = marker_perturbation.groupby(['compound_name','cell_line','compound_conc_x','user_timepoint'])['cv_median_marker_perturbation_score'].transform('sum') 
marker_perturbation['d_summed_scores'] = marker_perturbation.groupby(['compound_name','cell_line','compound_conc_x','user_timepoint'])['d_median_marker_perturbation_score'].transform('sum') 
marker_perturbation['number_of_replicates'] = marker_perturbation.groupby(['compound_name','cell_line','compound_conc_x','user_timepoint','marker'])['d_summed_scores'].transform('count')
marker_perturbation['d_global_perturbation_score'] = marker_perturbation['d_summed_scores']/(len(np.unique(marker_perturbation['marker'])))/marker_perturbation['number_of_replicates'] #divide by the number of replicates 
marker_perturbation['cv_global_perturbation_score'] = marker_perturbation['cv_summed_scores']/(len(np.unique(marker_perturbation['marker'])))/marker_perturbation['number_of_replicates']
marker_perturbation

# marker_perturbation.to_csv('/Users/juliannalamm/Library/CloudStorage/OneDrive-SharedLibraries-DewpointTherapeutics/dpaint - dpaint_data/core_experiments/dpaint-001/data_analysis/20220608_d.paint-001_MAD_Z.csv')

Unnamed: 0,compound_name,marker,cell_line,user_timepoint,compound_conc_x,condensate_name,cell_compartment,well_id,cell_count_x,control_or_sample,...,control_or_sample.1,iod,iod_mad_z_score,d_marker_perturbation_score,d_median_marker_perturbation_score,cv_summed_scores,d_summed_scores,number_of_replicates,d_global_perturbation_score,cv_global_perturbation_score
0,Prosetin 12K,PCNT,HeLa,6hr,4.1700,Centrosome,cytoplasm,10696885_Z21,170,S,...,S,1010.695241,-0.147783,0.0,0.0,17.0,7.0,3,0.093333,0.226667
1,Prosetin 12K,PCNT,HeLa,6hr,0.2900,Centrosome,cytoplasm,10696885_AA36,165,S,...,S,701.173312,-2.997803,0.5,0.5,10.5,12.0,3,0.160000,0.140000
2,YM-155,PCNT,HeLa,6hr,0.0420,Centrosome,cytoplasm,10696885_R35,188,S,...,S,1472.312990,4.102707,0.5,0.5,19.5,8.5,3,0.113333,0.260000
3,K03861,PCNT,HeLa,6hr,0.2900,Centrosome,cytoplasm,10696885_AB43,187,S,...,S,431.516097,-5.480756,1.0,1.0,17.0,11.0,3,0.146667,0.226667
4,Rapamycyin,PCNT,HeLa,6hr,0.0042,Centrosome,cytoplasm,10696885_M33,186,S,...,S,1100.389647,0.678106,0.0,0.0,13.0,6.0,3,0.080000,0.173333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44347,Staurosporine,FIB1,HeLa,6hr,29.1700,nucleolus,nucleus,10696902_Z7,0,S,...,S,,,,,84.5,44.5,3,0.593333,1.126667
44348,Crizotinib,EWSR1,HeLa,6hr,29.1700,paraspeckle,nucleus,10696887_Z9,0,S,...,S,,,,,49.5,32.0,3,0.426667,0.660000
44349,Crizotinib,CBX8,HeLa,6hr,29.1700,heterochromatin,nucleus,10696886_Z9,0,S,...,S,,,,,49.5,32.0,3,0.426667,0.660000
44350,Crizotinib,FIB1,HeLa,6hr,29.1700,nucleolus,nucleus,10696902_Z9,0,S,...,S,,,,,49.5,32.0,3,0.426667,0.660000
