In [1]:
import pandas as pd

Replicability

In [2]:
# Replicability t-test results. Rows are split on the Feature_set column:
# everything except "DP-CP" goes to one frame, the "DP-CP" rows to another.
replicability_pvalue = pd.read_csv("output/replicability_pvalue.csv")
replicability_cp_dp_pvalue = replicability_pvalue.loc[
    lambda frame: frame["Feature_set"] != "DP-CP"
].reset_index(drop=True)
replicability_dp_vs_cp_pvalue = replicability_pvalue.loc[
    lambda frame: frame["Feature_set"] == "DP-CP"
].reset_index(drop=True)

# Stack the per-feature-set "fp" tables, tagging each row with the feature set
# it was read from; join="inner" keeps only columns common to both files.
replicability_fp = pd.concat(
    [
        pd.read_csv(csv_path).assign(Feature_set=feature_set)
        for feature_set, csv_path in {
            "CellProfiler": "output/cellprofiler_replicability_fp.csv",
            "DeepProfiler": "output/deepprofiler_replicability_fp.csv",
        }.items()
    ],
    join="inner",
)

In [3]:
# Attach the fp values to the per-feature-set t-test rows (inner merge on the
# shared key columns), order by those same keys and emit a markdown table.
replicability_keys = ["Feature_set", "Modality", "Cell", "time"]
replicability_table = pd.merge(
    replicability_cp_dp_pvalue, replicability_fp, on=replicability_keys
).sort_values(by=replicability_keys)
print(replicability_table.to_markdown(index=False))

| t-test        | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |    fp |
|:--------------|:--------------|:-----------|:-------|:-------|------------:|:---------|------:|
| Replicability | CellProfiler  | compound   | A549   | long   |    31.517   | <0.05    | 0.922 |
| Replicability | CellProfiler  | compound   | A549   | short  |    15.401   | <0.05    | 0.729 |
| Replicability | CellProfiler  | compound   | U2OS   | long   |    11.9764  | <0.05    | 0.641 |
| Replicability | CellProfiler  | compound   | U2OS   | short  |    17.4787  | <0.05    | 0.771 |
| Replicability | CellProfiler  | crispr     | A549   | long   |    10.5586  | <0.05    | 0.662 |
| Replicability | CellProfiler  | crispr     | A549   | short  |    10.1071  | <0.05    | 0.659 |
| Replicability | CellProfiler  | crispr     | U2OS   | long   |     3.61722 | <0.05    | 0.518 |
| Replicability | CellProfiler  | crispr     | U2OS   | short  |     9.71768 | <0.05    | 0.662 |
| Replicability | Ce

In [4]:
# Markdown table of the "DP-CP" replicability rows, ordered by the key columns.
replicability_dp_vs_cp_sorted = replicability_dp_vs_cp_pvalue.sort_values(
    by=["Feature_set", "Modality", "Cell", "time"]
)
print(replicability_dp_vs_cp_sorted.to_markdown(index=False))

| t-test        | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Replicability | DP-CP         | compound   | U2OS   | long   |     6.19497 | <0.05    |
| Replicability | DP-CP         | crispr     | U2OS   | long   |    14.2519  | <0.05    |
| Replicability | DP-CP         | orf        | U2OS   | long   |     8.70731 | <0.05    |


Same perturbation matching

In [5]:
# Matching t-test results. Rows are split on the Feature_set column:
# everything except "DP-CP" goes to one frame, the "DP-CP" rows to another.
matching_pvalue = pd.read_csv("output/matching_pvalue.csv")
matching_cp_dp_pvalue = matching_pvalue.loc[
    lambda frame: frame["Feature_set"] != "DP-CP"
].reset_index(drop=True)
matching_dp_vs_cp_pvalue = matching_pvalue.loc[
    lambda frame: frame["Feature_set"] == "DP-CP"
].reset_index(drop=True)

# Stack the per-feature-set "fp" tables, tagging each row with the feature set
# it was read from; join="inner" keeps only columns common to both files.
matching_fp = pd.concat(
    [
        pd.read_csv(csv_path).assign(Feature_set=feature_set)
        for feature_set, csv_path in {
            "CellProfiler": "output/cellprofiler_matching_fp.csv",
            "DeepProfiler": "output/deepprofiler_matching_fp.csv",
        }.items()
    ],
    join="inner",
)

In [6]:
# Attach the fp values to the per-feature-set t-test rows (inner merge on the
# shared key columns), order by those same keys and emit a markdown table.
matching_keys = ["Feature_set", "Modality", "Cell", "time"]
matching_table = pd.merge(
    matching_cp_dp_pvalue, matching_fp, on=matching_keys
).sort_values(by=matching_keys)
print(matching_table.to_markdown(index=False))

| t-test                          | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |    fp |
|:--------------------------------|:--------------|:-----------|:-------|:-------|------------:|:---------|------:|
| Same perturbation type matching | CellProfiler  | compound   | A549   | long   |    1.26224  | 0.21     | 0.278 |
| Same perturbation type matching | CellProfiler  | compound   | A549   | short  |    0.723932 | 0.47     | 0.284 |
| Same perturbation type matching | CellProfiler  | compound   | U2OS   | long   |    4.8252   | <0.05    | 0.42  |
| Same perturbation type matching | CellProfiler  | compound   | U2OS   | short  |    2.13019  | <0.05    | 0.388 |
| Same perturbation type matching | CellProfiler  | crispr     | A549   | long   |   -1.29781  | 0.20     | 0.203 |
| Same perturbation type matching | CellProfiler  | crispr     | A549   | short  |   -1.62145  | 0.11     | 0.162 |
| Same perturbation type matching | CellProfiler  | crispr     | U2OS   

In [7]:
# Markdown table of the "DP-CP" matching rows, ordered by the key columns.
matching_dp_vs_cp_sorted = matching_dp_vs_cp_pvalue.sort_values(
    by=["Feature_set", "Modality", "Cell", "time"]
)
print(matching_dp_vs_cp_sorted.to_markdown(index=False))

| t-test                          | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------------------------|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Same perturbation type matching | DP-CP         | compound   | U2OS   | long   |    -7.06974 | <0.05    |
| Same perturbation type matching | DP-CP         | crispr     | U2OS   | long   |    -1.70548 | 0.09     |


Different perturbation matching

In [8]:
# Gene-compound matching t-test results. Rows are split on the Feature_set
# column: everything except "DP-CP" goes to one frame, "DP-CP" rows to another.
gene_compound_matching_pvalue = pd.read_csv("output/gene_compound_matching_pvalue.csv")
gene_compound_matching_cp_dp_pvalue = gene_compound_matching_pvalue.loc[
    lambda frame: frame["Feature_set"] != "DP-CP"
].reset_index(drop=True)
gene_compound_matching_dp_vs_cp_pvalue = gene_compound_matching_pvalue.loc[
    lambda frame: frame["Feature_set"] == "DP-CP"
].reset_index(drop=True)

# Stack the per-feature-set "fp" tables, tagging each row with the feature set
# it was read from; join="inner" keeps only columns common to both files.
gene_compound_matching_fp = pd.concat(
    [
        pd.read_csv(csv_path).assign(Feature_set=feature_set)
        for feature_set, csv_path in {
            "CellProfiler": "output/cellprofiler_gene_compound_matching_fp.csv",
            "DeepProfiler": "output/deepprofiler_gene_compound_matching_fp.csv",
        }.items()
    ],
    join="inner",
)

In [9]:
# Attach the fp values to the per-feature-set t-test rows (inner merge on the
# shared key columns), order by those same keys and emit a markdown table.
gene_compound_keys = ["Feature_set", "Modality1", "Modality2", "Cell"]
gene_compound_table = pd.merge(
    gene_compound_matching_cp_dp_pvalue,
    gene_compound_matching_fp,
    on=gene_compound_keys,
).sort_values(by=gene_compound_keys)
print(gene_compound_table.to_markdown(index=False))

| t-test                 | Feature_set   | Modality1      | Modality2    | Cell   |   statistic | pvalue   |    fp |
|:-----------------------|:--------------|:---------------|:-------------|:-------|------------:|:---------|------:|
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_long  | A549   |    -9.44963 | <0.05    | 0.137 |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_long  | U2OS   |    -6.9846  | <0.05    | 0.121 |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_short | A549   |   -12.3498  | <0.05    | 0.092 |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_short | U2OS   |    -8.54345 | <0.05    | 0.087 |
| Compoung-gene matching | CellProfiler  | compound_long  | orf_long     | A549   |   -11.9376  | <0.05    | 0.04  |
| Compoung-gene matching | CellProfiler  | compound_long  | orf_long     | U2OS   |   -10.5517  | <0.05    | 0.047 |
| Compoung-gene matching | CellProfiler  | compound_long  | orf_

In [10]:
# Markdown table of the "DP-CP" gene-compound matching rows, ordered by the
# key columns.
gene_compound_dp_vs_cp_sorted = gene_compound_matching_dp_vs_cp_pvalue.sort_values(
    by=["Feature_set", "Modality1", "Modality2", "Cell"]
)
print(gene_compound_dp_vs_cp_sorted.to_markdown(index=False))

| t-test                 | Feature_set   | Modality1     | Modality2   | Cell   |   statistic | pvalue   |
|:-----------------------|:--------------|:--------------|:------------|:-------|------------:|:---------|
| Compoung-gene matching | DP-CP         | compound_long | crispr_long | U2OS   |    -1.04339 | 0.30     |
| Compoung-gene matching | DP-CP         | compound_long | orf_long    | U2OS   |     3.21301 | <0.05    |


Different perturbation matching CRISPR vs. ORF

In [11]:
# t-test results read from the CRISPR vs. ORF p-value file.
gene_compound_matching_crispr_orf_pvalue = pd.read_csv(
    filepath_or_buffer="output/crispr_orf_pvalue.csv"
)

In [12]:
# Order the CRISPR vs. ORF rows by their key columns, keep only the columns
# to be reported (in this order), and emit a markdown table.
crispr_orf_report_columns = [
    "t-test",
    "Feature_set",
    "compound-crispr",
    "compound-orf",
    "Cell",
    "statistic",
    "pvalue",
]
crispr_orf_sorted = gene_compound_matching_crispr_orf_pvalue.sort_values(
    by=["Feature_set", "compound-crispr", "compound-orf", "Cell"]
)
print(crispr_orf_sorted[crispr_orf_report_columns].to_markdown(index=False))

| t-test                                  | Feature_set   | compound-crispr             | compound-orf             | Cell   |   statistic | pvalue   |
|:----------------------------------------|:--------------|:----------------------------|:-------------------------|:-------|------------:|:---------|
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_long   | A549   |    8.07241  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_long   | U2OS   |    3.39236  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_short  | A549   |    7.87323  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_long   | compound_long-orf_short  | U2OS   |    1.17604  | 0.24     |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | compound_long-crispr_short  | comp

Compound-genetic perturbation cosine similarity lists

In [13]:
# Columns shown in each printed table, in display order.
cols = [
    "Cell",
    "Genetic_Perturbation",
    "pert_iname",
    "Metadata_matching_target",
    "moa_list",
    "cosine_sim",
]

cosine_sim = pd.read_csv("output/compound_genetic_perturbation_cosine_similarity.csv")

# Compound annotations: pert_iname from one TSV, moa_list from the other.
compound_names = pd.read_csv(
    "../metadata/external_metadata/JUMP-Target-1_compound_metadata.tsv",
    sep="\t",
    usecols=["broad_sample", "pert_iname"],
)
compound_moas = pd.read_csv(
    "input/JUMP-Target-1_compound_metadata_additional_annotations.tsv",
    sep="\t",
    usecols=["broad_sample", "moa_list"],
)
# No explicit key: the merge uses the columns the two frames share, which
# given the usecols above is "broad_sample".
metadata = compound_names.merge(compound_moas)
metadata = metadata.rename(columns={"broad_sample": "Metadata_broad_sample"})

# Annotate every similarity row with its compound's name and MoA list.
cosine_sim = cosine_sim.merge(metadata, on="Metadata_broad_sample")

# For each cell type, print the 10 highest and 10 lowest cosine similarities
# as a single markdown table, listed from highest to lowest.
for cell_type in cosine_sim.Cell.unique():
    filtered_df = cosine_sim[cosine_sim["Cell"] == cell_type]
    top_df = filtered_df.nlargest(10, "cosine_sim")
    bottom_df = filtered_df.nsmallest(10, "cosine_sim")
    # nsmallest returns ascending order; flip it so the table keeps descending.
    bottom_df = bottom_df.sort_values(by="cosine_sim", ascending=False)
    df = pd.concat([top_df, bottom_df], join="inner")
    print(df.loc[:, cols].to_markdown(index=False))

| Cell   | Genetic_Perturbation   | pert_iname    | Metadata_matching_target   | moa_list                                  |   cosine_sim |
|:-------|:-----------------------|:--------------|:---------------------------|:------------------------------------------|-------------:|
| A549   | CRISPR                 | AMG900        | AURKB                      | Aurora kinase inhibitor                   |     0.775409 |
| A549   | CRISPR                 | BI-2536       | PLK1                       | PLK inhibitor                             |     0.77319  |
| A549   | CRISPR                 | NSC-663284    | CDC25A                     | CDC inhibitor                             |     0.770165 |
| A549   | CRISPR                 | BI-2536       | PLK1                       | PLK inhibitor                             |     0.762776 |
| A549   | CRISPR                 | AMG900        | AURKB                      | Aurora kinase inhibitor                   |     0.756971 |
| A549   | CRISPR   