In [1]:
import pandas as pd
import utils
import numpy as np
import plotly.express as px

# Seed NumPy's global RNG once, up front, so any NumPy-based randomness later in the
# notebook is repeatable across Restart & Run All.
SEED = 9000
np.random.seed(SEED)

TODO

In this notebook, we calculate the `Percent Matching` score between chemical and genetic perturbations. The following steps are taken:
1. Normalized, feature-selected ORF, CRISPR and Compound profiles are read and the replicate plates are merged into a single dataframe, for each time point and cell line.
2. All the target annotations of the compounds are used in this analysis.
3. Negative control and empty wells are removed from the dataframe.
4. The signal distribution, which is the median pairwise correlation between each Compound-CRISPR or Compound-ORF pair (that target the same gene), is computed.
5. The null distribution, which is the median pairwise correlation of Compound-CRISPR or Compound-ORF pairs that target different genes, is computed for 1000 combinations of Compound-CRISPR or Compound-ORF pairs.
6. Percent Matching is computed as the percentage of the signal distribution that is greater than the 95th percentile of the null distribution.
7. The signal and null distributions and the Percent Matching values are plotted and the table of Percent Matching is printed.

In [2]:
# Empty results table; the scoring loop further down fills it in.
mean_average_precision_df = pd.DataFrame()
batch = "2020_11_04_CPJUMP1"

# Keep only the plates of this batch at density "100" with antibiotics absent, and
# leave out compound plates run on the Cas9 cell line.
experiment_df = (
    pd.read_csv('output/experiment-metadata.tsv', sep='\t')
    .query('Batch==@batch and Density=="100" and Antibiotics=="absent"')
    .loc[lambda df: ~((df.Perturbation == "compound") & (df.Cell_line == "Cas9"))]
)

# Compound -> target annotations, renamed to the Metadata_* column convention used
# by the profile frames.
target1_column_names = {
    'broad_sample': 'Metadata_broad_sample',
    'target_list': 'Metadata_target_list',
}
target1_metadata = pd.read_csv(
    'input/JUMP-Target-1_compound_metadata_additional_annotations.tsv',
    sep='\t',
    usecols=list(target1_column_names),
).rename(columns=target1_column_names)

In [None]:
# Score how well compound profiles retrieve the genetic perturbations (every other
# Perturbation value in experiment_df) that share a target gene, for every
# cell type / time point combination, alongside a shuffled-profile baseline.
#
# Results are collected as plain records and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas >= 2.0, and rebuilding the table from
# scratch keeps this cell idempotent when it is re-run.
profile_filename = "normalized_feature_select_negcon_batch.csv.gz"
mean_average_precision_records = []

for cell in experiment_df.Cell_type.unique():
    cell_df = experiment_df.query('Cell_type==@cell')
    modality_1_perturbation = "compound"
    modality_1_experiments_df = cell_df.query('Perturbation==@modality_1_perturbation')
    for modality_1_timepoint in modality_1_experiments_df.Time.unique():
        modality_1_timepoint_df = modality_1_experiments_df.query('Time==@modality_1_timepoint')

        # Merge all replicate compound plates of this time point into one frame.
        modality_1_df = pd.DataFrame()
        for plate in modality_1_timepoint_df.Assay_Plate_Barcode.unique():
            data_df = (
                utils.load_data(batch, plate, profile_filename)
                .assign(Metadata_modality=modality_1_perturbation)
            )
            modality_1_df = utils.concat_profiles(modality_1_df, data_df)

        modality_1_df = utils.remove_negcon_empty_wells(modality_1_df)
        # NOTE(review): presumably one consensus profile per Metadata_broad_sample — confirm in utils.
        modality_1_consensus_df = utils.consensus(modality_1_df, "Metadata_broad_sample")

        # Attach the '|'-separated target annotations and explode them so that each
        # compound contributes one row per annotated target gene; Metadata_gene_list
        # keeps the full list of targets on every exploded row.
        modality_1_consensus_df = (
            modality_1_consensus_df.merge(target1_metadata, on='Metadata_broad_sample', how='left')
            .assign(Metadata_gene=lambda x: x.Metadata_target_list.str.split('|'))
            .assign(Metadata_gene_list=lambda x: x.Metadata_target_list.str.split('|'))
            .explode('Metadata_gene')
            .reset_index(drop=True)
        )

        # Shuffled copy of the compound profiles, used for the 'Shuffled' baseline score.
        modality_1_consensus_shuffled_df = utils.shuffle_profiles(modality_1_consensus_df)

        both_modality_2_experiments_df = cell_df.query('Perturbation!=@modality_1_perturbation')
        for modality_2_perturbation in both_modality_2_experiments_df.Perturbation.unique():
            modality_2_experiments_df = both_modality_2_experiments_df.query('Perturbation==@modality_2_perturbation')
            for modality_2_timepoint in modality_2_experiments_df.Time.unique():
                modality_2_timepoint_df = modality_2_experiments_df.query('Time==@modality_2_timepoint')

                # Merge all replicate plates of the second modality for this time point.
                modality_2_df = pd.DataFrame()
                for plate in modality_2_timepoint_df.Assay_Plate_Barcode.unique():
                    data_df = (
                        utils.load_data(batch, plate, profile_filename)
                        .assign(Metadata_modality=modality_2_perturbation)
                        .assign(Metadata_gene_list=lambda x: x.Metadata_gene)
                    )
                    # Accumulate into modality_2_df. Passing modality_1_df here (as the
                    # code previously did) mixed compound profiles into this frame and
                    # dropped every plate except the last one.
                    modality_2_df = utils.concat_profiles(modality_2_df, data_df)

                modality_2_df = utils.remove_negcon_empty_wells(modality_2_df)

                modality_2_consensus_df = utils.consensus(modality_2_df, "Metadata_broad_sample")
                modality_2_consensus_shuffled_df = utils.shuffle_profiles(modality_2_consensus_df)

                # Restrict the compounds to target genes that are present in the second modality.
                modality_2_genes = list(np.unique(list(modality_2_consensus_df.Metadata_gene)))
                modality_1_consensus_filtered_df = modality_1_consensus_df.query('Metadata_gene==@modality_2_genes').reset_index(drop=True)
                modality_1_consensus_shuffled_filtered_df = modality_1_consensus_shuffled_df.query('Metadata_gene==@modality_2_genes').reset_index(drop=True)

                description = f'{modality_1_perturbation}_{cell}_{modality_1_timepoint} vs. {modality_2_perturbation}_{cell}_{modality_2_timepoint}'

                score = utils.MeanAveragePrecision(modality_1_consensus_filtered_df, modality_2_consensus_df, "Metadata_gene")
                mean_average_precision_records.append({'Description': description,
                                                       'mAP': f'{score.map:.3f}',
                                                       'Matches': 'True'})

                score_shuffled = utils.MeanAveragePrecision(modality_1_consensus_shuffled_filtered_df, modality_2_consensus_shuffled_df, "Metadata_gene")
                mean_average_precision_records.append({'Description': description,
                                                       'mAP': f'{score_shuffled.map:.3f}',
                                                       'Matches': 'Shuffled'})

# Explicit columns keep the downstream cells working even when no comparison was scored.
mean_average_precision_df = pd.DataFrame(mean_average_precision_records,
                                         columns=['Description', 'mAP', 'Matches'])


In [None]:
# Markdown table of the mAP values for the true (unshuffled) pairings.
print(
    mean_average_precision_df
    .query('Matches=="True"')
    .loc[:, ['Description', 'Matches', 'mAP']]
    .to_markdown(index=False)
)

In [None]:
# Markdown table of the mAP values for the shuffled baseline.
print(
    mean_average_precision_df
    .query('Matches=="Shuffled"')
    .loc[:, ['Description', 'Matches', 'mAP']]
    .to_markdown(index=False)
)

In [None]:
# Split each "<compound> vs. <genetic>" description into its two sides so the scores can
# be arranged as a compound x genetic-perturbation grid.
mean_average_precision_df[['Chemical_Perturbation','Genetic_Perturbation']] = mean_average_precision_df.Description.str.split(' vs. ', expand=True,)
# mAP was stored as a formatted string; convert it to float for plotting.
mean_average_precision_df['mAP'] = mean_average_precision_df['mAP'].astype(float)
# Keyword arguments are required here: positional arguments to DataFrame.pivot were
# deprecated and are keyword-only in pandas >= 2.0.
pivot_table = (
    mean_average_precision_df
        .query('Matches=="True"')
        .pivot(index='Chemical_Perturbation', columns='Genetic_Perturbation', values='mAP')
)
fig = px.imshow(pivot_table, color_continuous_scale='blues')
fig.update_layout(title='mAP vs. genes',
                  xaxis=dict(title='Genetic Perturbations'),
                  yaxis=dict(title='Compounds'),
                  coloraxis_colorbar_x=+1.15)
fig.show("png")
fig.write_image('figures/11.mAP_heatmap.png', width=640, height=480, scale=2)

In [None]:
# Markdown table of every comparison (true and shuffled) with the split perturbation columns.
summary_columns = ['Chemical_Perturbation', 'Genetic_Perturbation', 'Matches', 'mAP']
print(mean_average_precision_df.loc[:, summary_columns].to_markdown(index=False))

In [None]:
# Overlaid bars of the true and shuffled mAP for every compound-vs-genetic comparison.
fig = px.bar(data_frame=mean_average_precision_df,
             x='Description',
             y='mAP',
             barmode='overlay',
             color='Matches')
# The y-axis carries the mAP values and the x-axis the comparisons, so label them that
# way (the y-axis was previously titled 'Perturbation matching').
fig.update_layout(title='mAP vs. Perturbation matching',
                  xaxis=dict(title='Perturbation matching'),
                  yaxis=dict(title='mAP'))
fig.update_xaxes(tickangle=45)
fig.show("png")
fig.write_image('figures/11.mAP_matching.png', width=640, height=480, scale=2)