In [1]:
import pandas as pd

Replicability

In [2]:
# Load the replicability t-test table and split it on `Feature_set`:
# rows labelled "DP-CP" go into their own frame, all other rows into another.
replicability_pvalue = pd.read_csv("output/replicability_pvalue.csv")
replicability_cp_dp_pvalue = replicability_pvalue.loc[
    lambda table: table["Feature_set"] != "DP-CP"
].reset_index(drop=True)
replicability_dp_vs_cp_pvalue = replicability_pvalue.loc[
    lambda table: table["Feature_set"] == "DP-CP"
].reset_index(drop=True)

# Stack the per-feature-set mmAP files, tagging each with its `Feature_set`
# label; join="inner" keeps only the columns the two files have in common.
replicability_mmap = pd.concat(
    [
        pd.read_csv(csv_path).assign(Feature_set=feature_set)
        for csv_path, feature_set in (
            ("output/cellprofiler_replicability_mmap.csv", "CellProfiler"),
            ("output/deepprofiler_replicability_mmap.csv", "DeepProfiler"),
        )
    ],
    join="inner",
)

In [3]:
# Pair every non-"DP-CP" t-test row with the mmAP row sharing the same
# (Feature_set, Modality, Cell, time) key, then print it as a markdown table.
replicability_summary = pd.merge(
    replicability_cp_dp_pvalue,
    replicability_mmap,
    on=["Feature_set", "Modality", "Cell", "time"],
).sort_values(by=["Feature_set", "Modality", "Cell", "time"])
print(replicability_summary.to_markdown(index=False))

| t-test        | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |   mmAP |
|:--------------|:--------------|:-----------|:-------|:-------|------------:|:---------|-------:|
| Replicability | CellProfiler  | compound   | A549   | long   |     38.2235 | <0.05    |  0.703 |
| Replicability | CellProfiler  | compound   | A549   | short  |     25.0989 | <0.05    |  0.531 |
| Replicability | CellProfiler  | compound   | U2OS   | long   |     22.1072 | <0.05    |  0.495 |
| Replicability | CellProfiler  | compound   | U2OS   | short  |     25.6631 | <0.05    |  0.55  |
| Replicability | CellProfiler  | crispr     | A549   | long   |     23.3573 | <0.05    |  0.393 |
| Replicability | CellProfiler  | crispr     | A549   | short  |     23.0591 | <0.05    |  0.372 |
| Replicability | CellProfiler  | crispr     | U2OS   | long   |     20.6414 | <0.05    |  0.254 |
| Replicability | CellProfiler  | crispr     | U2OS   | short  |     21.2194 | <0.05    |  0.333 |
| Replicab

In [4]:
# Print the "DP-CP" replicability t-test rows as a markdown table,
# ordered by feature set, modality, cell line and time point.
replicability_dp_vs_cp_sorted = replicability_dp_vs_cp_pvalue.sort_values(
    by=["Feature_set", "Modality", "Cell", "time"]
)
print(replicability_dp_vs_cp_sorted.to_markdown(index=False))

| t-test        | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Replicability | DP-CP         | compound   | U2OS   | long   |     1.25053 | 0.21     |
| Replicability | DP-CP         | crispr     | U2OS   | long   |     4.49756 | <0.05    |
| Replicability | DP-CP         | orf        | U2OS   | long   |     2.78749 | <0.05    |


Same perturbation matching

In [5]:
# Load the matching t-test table and split it on `Feature_set`:
# rows labelled "DP-CP" go into their own frame, all other rows into another.
matching_pvalue = pd.read_csv("output/matching_pvalue.csv")
matching_cp_dp_pvalue = matching_pvalue.loc[
    lambda table: table["Feature_set"] != "DP-CP"
].reset_index(drop=True)
matching_dp_vs_cp_pvalue = matching_pvalue.loc[
    lambda table: table["Feature_set"] == "DP-CP"
].reset_index(drop=True)

# Stack the per-feature-set mmAP files, tagging each with its `Feature_set`
# label; join="inner" keeps only the columns the two files have in common.
matching_mmap = pd.concat(
    [
        pd.read_csv(csv_path).assign(Feature_set=feature_set)
        for csv_path, feature_set in (
            ("output/cellprofiler_matching_mmap.csv", "CellProfiler"),
            ("output/deepprofiler_matching_mmap.csv", "DeepProfiler"),
        )
    ],
    join="inner",
)

In [6]:
# Pair every non-"DP-CP" matching t-test row with the mmAP row sharing the same
# (Feature_set, Modality, Cell, time) key, then print it as a markdown table.
matching_summary = pd.merge(
    matching_cp_dp_pvalue,
    matching_mmap,
    on=["Feature_set", "Modality", "Cell", "time"],
).sort_values(by=["Feature_set", "Modality", "Cell", "time"])
print(matching_summary.to_markdown(index=False))

| t-test                          | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |   mmAP |
|:--------------------------------|:--------------|:-----------|:-------|:-------|------------:|:---------|-------:|
| Same perturbation type matching | CellProfiler  | compound   | A549   | long   |    10.9722  | <0.05    |  0.12  |
| Same perturbation type matching | CellProfiler  | compound   | A549   | short  |    10.9711  | <0.05    |  0.108 |
| Same perturbation type matching | CellProfiler  | compound   | U2OS   | long   |    11.4203  | <0.05    |  0.11  |
| Same perturbation type matching | CellProfiler  | compound   | U2OS   | short  |    11.6574  | <0.05    |  0.081 |
| Same perturbation type matching | CellProfiler  | crispr     | A549   | long   |     4.92739 | <0.05    |  0.088 |
| Same perturbation type matching | CellProfiler  | crispr     | A549   | short  |     4.29482 | <0.05    |  0.062 |
| Same perturbation type matching | CellProfiler  | crispr     |

In [7]:
# Print the "DP-CP" matching t-test rows as a markdown table,
# ordered by feature set, modality, cell line and time point.
matching_dp_vs_cp_sorted = matching_dp_vs_cp_pvalue.sort_values(
    by=["Feature_set", "Modality", "Cell", "time"]
)
print(matching_dp_vs_cp_sorted.to_markdown(index=False))

| t-test                          | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------------------------|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Same perturbation type matching | DP-CP         | compound   | U2OS   | long   |   -4.2832   | <0.05    |
| Same perturbation type matching | DP-CP         | crispr     | U2OS   | long   |    0.210606 | 0.83     |


Different perturbation matching

In [8]:
# Load the gene-compound matching t-test table and split it on `Feature_set`:
# rows labelled "DP-CP" go into their own frame, all other rows into another.
gene_compound_matching_pvalue = pd.read_csv("output/gene_compound_matching_pvalue.csv")
gene_compound_matching_cp_dp_pvalue = gene_compound_matching_pvalue.loc[
    lambda table: table["Feature_set"] != "DP-CP"
].reset_index(drop=True)
gene_compound_matching_dp_vs_cp_pvalue = gene_compound_matching_pvalue.loc[
    lambda table: table["Feature_set"] == "DP-CP"
].reset_index(drop=True)

# Stack the per-feature-set mmAP files, tagging each with its `Feature_set`
# label; join="inner" keeps only the columns the two files have in common.
gene_compound_matching_mmap = pd.concat(
    [
        pd.read_csv(csv_path).assign(Feature_set=feature_set)
        for csv_path, feature_set in (
            ("output/cellprofiler_gene_compound_matching_mmap.csv", "CellProfiler"),
            ("output/deepprofiler_gene_compound_matching_mmap.csv", "DeepProfiler"),
        )
    ],
    join="inner",
)

In [9]:
# Pair every non-"DP-CP" gene-compound t-test row with the mmAP row sharing the
# same (Feature_set, Modality1, Modality2, Cell) key, then print it as markdown.
gene_compound_matching_summary = pd.merge(
    gene_compound_matching_cp_dp_pvalue,
    gene_compound_matching_mmap,
    on=["Feature_set", "Modality1", "Modality2", "Cell"],
).sort_values(by=["Feature_set", "Modality1", "Modality2", "Cell"])
print(gene_compound_matching_summary.to_markdown(index=False))

| t-test                 | Feature_set   | Modality1      | Modality2    | Cell   |   statistic | pvalue   |   mmAP |
|:-----------------------|:--------------|:---------------|:-------------|:-------|------------:|:---------|-------:|
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_long  | A549   |     5.53934 | <0.05    |  0.032 |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_long  | U2OS   |     5.74079 | <0.05    |  0.033 |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_short | A549   |     5.29455 | <0.05    |  0.032 |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_short | U2OS   |     3.67899 | <0.05    |  0.025 |
| Compoung-gene matching | CellProfiler  | compound_long  | orf_long     | A549   |     5.00785 | <0.05    |  0.03  |
| Compoung-gene matching | CellProfiler  | compound_long  | orf_long     | U2OS   |     6.06296 | <0.05    |  0.043 |
| Compoung-gene matching | CellProfiler  | compound_long

In [10]:
# Print the "DP-CP" gene-compound matching t-test rows as a markdown table,
# ordered by feature set, the two modalities and cell line.
gene_compound_matching_dp_vs_cp_sorted = (
    gene_compound_matching_dp_vs_cp_pvalue.sort_values(
        by=["Feature_set", "Modality1", "Modality2", "Cell"]
    )
)
print(gene_compound_matching_dp_vs_cp_sorted.to_markdown(index=False))

| t-test                 | Feature_set   | Modality1     | Modality2   | Cell   |   statistic |   pvalue |
|:-----------------------|:--------------|:--------------|:------------|:-------|------------:|---------:|
| Compoung-gene matching | DP-CP         | compound_long | crispr_long | U2OS   |   -0.696575 |     0.49 |
| Compoung-gene matching | DP-CP         | compound_long | orf_long    | U2OS   |   -0.640213 |     0.52 |


Different perturbation matching CRISPR vs. ORF

In [11]:
# Load the CRISPR vs. ORF t-test table from output/crispr_orf_pvalue.csv.
gene_compound_matching_crispr_orf_pvalue = pd.read_csv(
    "output/crispr_orf_pvalue.csv"
)

In [12]:
# Sort the CRISPR vs. ORF t-test rows, keep only the columns listed below
# (in this display order), and print the result as a markdown table.
crispr_orf_display_columns = [
    "t-test",
    "Feature_set",
    "compound-crispr",
    "compound-orf",
    "Cell",
    "statistic",
    "pvalue",
]
crispr_orf_sorted = gene_compound_matching_crispr_orf_pvalue.sort_values(
    by=["Feature_set", "compound-crispr", "compound-orf", "Cell"]
)
print(crispr_orf_sorted[crispr_orf_display_columns].to_markdown(index=False))

| t-test                                  | Feature_set   | compound-crispr             | compound-orf             | Cell   |   statistic | pvalue   |
|:----------------------------------------|:--------------|:----------------------------|:-------------------------|:-------|------------:|:---------|
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_long   | A549   |  0.283919   | 0.78     |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_long   | U2OS   | -1.03495    | 0.30     |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_short  | A549   |  0.00655953 | 0.99     |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_short  | U2OS   | -1.71964    | 0.09     |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_short  | comp

Compound-genetic perturbation cosine similarity lists

In [13]:
# Columns shown in each printed table, in display order.
cols = [
    "Cell",
    "Genetic_Perturbation",
    "pert_iname",
    "Metadata_matching_target",
    "moa_list",
    "cosine_sim",
]

cosine_sim_raw = pd.read_csv(
    "output/compound_genetic_perturbation_cosine_similarity.csv"
)

# Compound annotations: `pert_iname` comes from the external metadata file and
# `moa_list` from the additional-annotations file. `broad_sample` is the only
# column the two reads share, so it is the join key.
compound_names = pd.read_csv(
    "../metadata/external_metadata/JUMP-Target-1_compound_metadata.tsv",
    sep="\t",
    usecols=["broad_sample", "pert_iname"],
)
compound_moas = pd.read_csv(
    "input/JUMP-Target-1_compound_metadata_additional_annotations.tsv",
    sep="\t",
    usecols=["broad_sample", "moa_list"],
)
metadata = pd.merge(compound_names, compound_moas, on="broad_sample").rename(
    columns={"broad_sample": "Metadata_broad_sample"}
)

cosine_sim = cosine_sim_raw.merge(metadata, on="Metadata_broad_sample")

# For every distinct `Cell` value, print the 10 highest and the 10 lowest
# `cosine_sim` rows as one markdown table, in descending order within each half.
for cell_line in cosine_sim["Cell"].unique():
    per_cell = cosine_sim[cosine_sim["Cell"] == cell_line]
    highest = per_cell.nlargest(10, "cosine_sim")
    lowest = per_cell.nsmallest(10, "cosine_sim").sort_values(
        by="cosine_sim", ascending=False
    )
    ranked = pd.concat([highest, lowest], join="inner")
    print(ranked[cols].to_markdown(index=False))

| Cell   | Genetic_Perturbation   | pert_iname    | Metadata_matching_target   | moa_list                                  |   cosine_sim |
|:-------|:-----------------------|:--------------|:---------------------------|:------------------------------------------|-------------:|
| A549   | CRISPR                 | AMG900        | AURKB                      | Aurora kinase inhibitor                   |     0.775409 |
| A549   | CRISPR                 | BI-2536       | PLK1                       | PLK inhibitor                             |     0.77319  |
| A549   | CRISPR                 | NSC-663284    | CDC25A                     | CDC inhibitor                             |     0.770165 |
| A549   | CRISPR                 | BI-2536       | PLK1                       | PLK inhibitor                             |     0.762776 |
| A549   | CRISPR                 | AMG900        | AURKB                      | Aurora kinase inhibitor                   |     0.756971 |
| A549   | CRISPR   