In [1]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd
import scipy.stats as sps
import utils

Perform t-tests to check whether the mAP values differ significantly in the following scenarios:
- Perturbation identification and matching vs. baseline, separately for CellProfiler and DeepProfiler features.
- CellProfiler features vs. DeepProfiler features.
- ORF vs. CRISPR matching to compounds, separately for CellProfiler and DeepProfiler features.

In [2]:
# mAP tables for each task, loaded pairwise as (CellProfiler, DeepProfiler).
cp_replicability_df, dp_replicability_df = map(
    pd.read_csv,
    (
        "output/cellprofiler_replicability_map.csv",
        "output/deepprofiler_replicability_map.csv",
    ),
)
cp_matching_df, dp_matching_df = map(
    pd.read_csv,
    (
        "output/cellprofiler_matching_map.csv",
        "output/deepprofiler_matching_map.csv",
    ),
)
cp_gene_compound_matching_df, dp_gene_compound_matching_df = map(
    pd.read_csv,
    (
        "output/cellprofiler_gene_compound_matching_map.csv",
        "output/deepprofiler_gene_compound_matching_map.csv",
    ),
)

# Result tables start empty; the analysis cells below populate them.
replicability_df, matching_df, gene_compound_matching_df, crispr_orf_df = (
    pd.DataFrame() for _ in range(4)
)

# Columns used to split each mAP table into individual experiments.
replicability_experimental_variables = ("Modality", "Cell", "time")
matching_experimental_variables = ("Modality", "Cell", "time")
gene_compound_matching_experimental_variables = ("Modality1", "Modality2", "Cell")
crispr_orf_experimental_variables = ["Modality1", "Cell"]

### Perturbation detection mAP
- vs. baseline
- CellProfiler vs. DeepProfiler

In [3]:
# Perturbation detection (replicability) t-tests, one group per combination of
# `replicability_experimental_variables` present in the CellProfiler table:
#   * CellProfiler mAP vs. 0 (one-sample t-test)
#   * DeepProfiler mAP vs. 0 (one-sample t-test), if DeepProfiler rows exist
#   * DeepProfiler vs. CellProfiler mAP (two-sample t-test), same condition
#
# Rows are collected in a list and the table is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# rebuilding the table from scratch keeps this cell idempotent on re-run.
replicability_records = []

for experiment, experiment_df in cp_replicability_df.groupby(
    list(replicability_experimental_variables)
):
    # Group key rendered as strings, keyed by the grouping column name.
    experiment_labels = {
        variable: f"{value}"
        for variable, value in zip(replicability_experimental_variables, experiment)
    }
    # Same experiment, selected from the DeepProfiler table.
    query_string = " and ".join(
        f'{variable}=="{value}"' for variable, value in experiment_labels.items()
    )

    cp_map = experiment_df.mAP.values
    dp_map = dp_replicability_df.query(query_string).mAP.values  # queried once

    ttests = [("CellProfiler", sps.ttest_1samp(cp_map, 0))]
    if len(dp_map) > 0:
        ttests.append(("DeepProfiler", sps.ttest_1samp(dp_map, 0)))
        ttests.append(("DP-CP", sps.ttest_ind(dp_map, cp_map)))

    for feature_set, ttest in ttests:
        replicability_records.append(
            {
                "t-test": "Replicability",
                "Feature_set": feature_set,
                **experiment_labels,
                "statistic": f"{ttest.statistic}",
                "pvalue": f"{utils.convert_pvalue(ttest.pvalue)}",
            }
        )

# Explicit columns keep the table well-formed even when no rows were produced.
replicability_df = pd.DataFrame(
    replicability_records,
    columns=[
        "t-test",
        "Feature_set",
        *replicability_experimental_variables,
        "statistic",
        "pvalue",
    ],
)

In [4]:
# Render the replicability t-test results as a markdown table.
print(
    replicability_df.loc[
        :,
        [
            "t-test",
            "Feature_set",
            *replicability_experimental_variables,
            "statistic",
            "pvalue",
        ],
    ].to_markdown(index=False)
)

| t-test        | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Replicability | CellProfiler  | compound   | A549   | long   |    31.517   | <0.05    |
| Replicability | CellProfiler  | compound   | A549   | short  |    15.401   | <0.05    |
| Replicability | CellProfiler  | compound   | U2OS   | long   |    11.9764  | <0.05    |
| Replicability | DeepProfiler  | compound   | U2OS   | long   |    26.7891  | <0.05    |
| Replicability | DP-CP         | compound   | U2OS   | long   |     6.19497 | <0.05    |
| Replicability | CellProfiler  | compound   | U2OS   | short  |    17.4787  | <0.05    |
| Replicability | CellProfiler  | crispr     | A549   | long   |    10.5586  | <0.05    |
| Replicability | CellProfiler  | crispr     | A549   | short  |    10.1071  | <0.05    |
| Replicability | CellProfiler  | crispr     | U2OS   | long   |     3.61722 | <0.05    |
| Replicab

### Within perturbation matching mAP 
- vs. baseline
- CellProfiler vs. DeepProfiler

In [5]:
# Same-perturbation-type matching t-tests, one group per combination of
# `matching_experimental_variables` present in the CellProfiler table:
#   * CellProfiler mAP vs. 0 (one-sample t-test)
#   * DeepProfiler mAP vs. 0 (one-sample t-test), if DeepProfiler rows exist
#   * DeepProfiler vs. CellProfiler mAP (two-sample t-test), same condition
#
# Rows are collected in a list and the table is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# rebuilding the table from scratch keeps this cell idempotent on re-run.
matching_records = []

for experiment, experiment_df in cp_matching_df.groupby(
    list(matching_experimental_variables)
):
    # Group key rendered as strings, keyed by the grouping column name.
    experiment_labels = {
        variable: f"{value}"
        for variable, value in zip(matching_experimental_variables, experiment)
    }
    # Same experiment, selected from the DeepProfiler table.
    query_string = " and ".join(
        f'{variable}=="{value}"' for variable, value in experiment_labels.items()
    )

    cp_map = experiment_df.mAP.values
    dp_map = dp_matching_df.query(query_string).mAP.values  # queried once

    ttests = [("CellProfiler", sps.ttest_1samp(cp_map, 0))]
    if len(dp_map) > 0:
        ttests.append(("DeepProfiler", sps.ttest_1samp(dp_map, 0)))
        ttests.append(("DP-CP", sps.ttest_ind(dp_map, cp_map)))

    for feature_set, ttest in ttests:
        matching_records.append(
            {
                "t-test": "Same perturbation type matching",
                "Feature_set": feature_set,
                **experiment_labels,
                "statistic": f"{ttest.statistic}",
                "pvalue": f"{utils.convert_pvalue(ttest.pvalue)}",
            }
        )

# Explicit columns keep the table well-formed even when no rows were produced.
matching_df = pd.DataFrame(
    matching_records,
    columns=[
        "t-test",
        "Feature_set",
        *matching_experimental_variables,
        "statistic",
        "pvalue",
    ],
)

In [6]:
# Render the same-perturbation-type matching t-test results as a markdown table.
print(
    matching_df.loc[
        :,
        [
            "t-test",
            "Feature_set",
            *matching_experimental_variables,
            "statistic",
            "pvalue",
        ],
    ].to_markdown(index=False)
)

| t-test                          | Feature_set   | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------------------------|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Same perturbation type matching | CellProfiler  | compound   | A549   | long   |    1.26224  | 0.21     |
| Same perturbation type matching | CellProfiler  | compound   | A549   | short  |    0.723932 | 0.47     |
| Same perturbation type matching | CellProfiler  | compound   | U2OS   | long   |    4.8252   | <0.05    |
| Same perturbation type matching | DeepProfiler  | compound   | U2OS   | long   |   -4.73762  | <0.05    |
| Same perturbation type matching | DP-CP         | compound   | U2OS   | long   |   -7.06974  | <0.05    |
| Same perturbation type matching | CellProfiler  | compound   | U2OS   | short  |    2.13019  | <0.05    |
| Same perturbation type matching | CellProfiler  | crispr     | A549   | long   |   -1.29781  | 0.20     |
| Same perturbation type mat

### Compound to genetic perturbation matching mAP
- vs. baseline
- CellProfiler vs. DeepProfiler

In [7]:
# Compound-to-gene matching t-tests, one group per combination of
# `gene_compound_matching_experimental_variables` present in the CellProfiler
# table:
#   * CellProfiler mAP vs. 0 (one-sample t-test)
#   * DeepProfiler mAP vs. 0 (one-sample t-test), if DeepProfiler rows exist
#   * DeepProfiler vs. CellProfiler mAP (two-sample t-test), same condition
#
# Rows are collected in a list and the table is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# rebuilding the table from scratch keeps this cell idempotent on re-run.
#
# NOTE(review): "Compoung" in the label looks like a typo for "Compound"; it is
# kept unchanged because the label is written to the output CSV.
gene_compound_matching_records = []

for experiment, experiment_df in cp_gene_compound_matching_df.groupby(
    list(gene_compound_matching_experimental_variables)
):
    # Group key rendered as strings, keyed by the grouping column name.
    experiment_labels = {
        variable: f"{value}"
        for variable, value in zip(
            gene_compound_matching_experimental_variables, experiment
        )
    }
    # Same experiment, selected from the DeepProfiler table.
    query_string = " and ".join(
        f'{variable}=="{value}"' for variable, value in experiment_labels.items()
    )

    cp_map = experiment_df.mAP.values
    # Queried once and reused by both DeepProfiler tests.
    dp_map = dp_gene_compound_matching_df.query(query_string).mAP.values

    ttests = [("CellProfiler", sps.ttest_1samp(cp_map, 0))]
    if len(dp_map) > 0:
        ttests.append(("DeepProfiler", sps.ttest_1samp(dp_map, 0)))
        ttests.append(("DP-CP", sps.ttest_ind(dp_map, cp_map)))

    for feature_set, ttest in ttests:
        gene_compound_matching_records.append(
            {
                "t-test": "Compoung-gene matching",
                "Feature_set": feature_set,
                **experiment_labels,
                "statistic": f"{ttest.statistic}",
                "pvalue": f"{utils.convert_pvalue(ttest.pvalue)}",
            }
        )

# Explicit columns keep the table well-formed even when no rows were produced.
gene_compound_matching_df = pd.DataFrame(
    gene_compound_matching_records,
    columns=[
        "t-test",
        "Feature_set",
        *gene_compound_matching_experimental_variables,
        "statistic",
        "pvalue",
    ],
)

In [8]:
# Render the compound-to-gene matching t-test results as a markdown table.
print(
    gene_compound_matching_df.loc[
        :,
        [
            "t-test",
            "Feature_set",
            *gene_compound_matching_experimental_variables,
            "statistic",
            "pvalue",
        ],
    ].to_markdown(index=False)
)

| t-test                 | Feature_set   | Modality1      | Modality2    | Cell   |   statistic | pvalue   |
|:-----------------------|:--------------|:---------------|:-------------|:-------|------------:|:---------|
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_long  | A549   |    -9.44963 | <0.05    |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_long  | U2OS   |    -6.9846  | <0.05    |
| Compoung-gene matching | DeepProfiler  | compound_long  | crispr_long  | U2OS   |   -13.9409  | <0.05    |
| Compoung-gene matching | DP-CP         | compound_long  | crispr_long  | U2OS   |    -1.04339 | 0.30     |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_short | A549   |   -12.3498  | <0.05    |
| Compoung-gene matching | CellProfiler  | compound_long  | crispr_short | U2OS   |    -8.54345 | <0.05    |
| Compoung-gene matching | CellProfiler  | compound_long  | orf_long     | A549   |   -11.9376  | <0.05    |
| Compoung-gene mat

### Compound to genetic perturbation matching mAP
- ORF vs. CRISPR

In [9]:
# Compound-to-gene matching: CRISPR vs. ORF two-sample t-tests, one group per
# combination of `crispr_orf_experimental_variables` (Modality1, Cell) present
# in the CellProfiler table.
#   * CellProfiler: every CRISPR experiment against every ORF experiment.
#   * DeepProfiler: crispr_long against orf_long only, when the DeepProfiler
#     table has rows for the group.
#
# Rows are collected in a list and the table is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# rebuilding the table from scratch keeps this cell idempotent on re-run.
crispr_experiments = ["crispr_long", "crispr_short"]
orf_experiments = ["orf_long", "orf_short"]

crispr_orf_records = []

for experiment, experiment_df in cp_gene_compound_matching_df.groupby(
    list(crispr_orf_experimental_variables)
):
    compound, cell = (f"{value}" for value in experiment)
    # Same (Modality1, Cell) group, selected from the DeepProfiler table.
    query_string = " and ".join(
        f'{variable}=="{value}"'
        for variable, value in zip(crispr_orf_experimental_variables, experiment)
    )

    # CellProfiler: all CRISPR x ORF pairings within this group.
    for crispr in crispr_experiments:
        crispr_df = experiment_df.query("Modality2==@crispr")
        for orf in orf_experiments:
            orf_df = experiment_df.query("Modality2==@orf")

            ttest_2sample = sps.ttest_ind(crispr_df.mAP.values, orf_df.mAP.values)

            crispr_orf_records.append(
                {
                    "t-test": "Compoung-gene matching - CRISPR vs. ORF",
                    "Feature_set": "CellProfiler",
                    "compound-crispr": f"{compound}-{crispr}",
                    "Cell": cell,
                    "compound-orf": f"{compound}-{orf}",
                    "statistic": f"{ttest_2sample.statistic}",
                    "pvalue": f"{utils.convert_pvalue(ttest_2sample.pvalue)}",
                }
            )

    dp_experiment_df = dp_gene_compound_matching_df.query(query_string)
    if len(dp_experiment_df) > 0:
        # Bug fix: the DeepProfiler comparison must use the DeepProfiler rows.
        # The previous code sliced `experiment_df` (the CellProfiler group), so
        # the rows labelled "DeepProfiler" repeated CellProfiler statistics.
        crispr_df = dp_experiment_df.query('Modality2=="crispr_long"')
        orf_df = dp_experiment_df.query('Modality2=="orf_long"')

        ttest_2sample = sps.ttest_ind(crispr_df.mAP.values, orf_df.mAP.values)

        crispr_orf_records.append(
            {
                "t-test": "Compoung-gene matching - CRISPR vs. ORF",
                "Feature_set": "DeepProfiler",
                "compound-crispr": f"{compound}-crispr_long",
                "Cell": cell,
                # Derived from the group key, like "compound-crispr", instead
                # of the previously hard-coded "compound_long" prefix.
                "compound-orf": f"{compound}-orf_long",
                "statistic": f"{ttest_2sample.statistic}",
                "pvalue": f"{utils.convert_pvalue(ttest_2sample.pvalue)}",
            }
        )

# Explicit columns keep the table well-formed even when no rows were produced.
crispr_orf_df = pd.DataFrame(
    crispr_orf_records,
    columns=[
        "t-test",
        "Feature_set",
        "compound-crispr",
        "Cell",
        "compound-orf",
        "statistic",
        "pvalue",
    ],
)

In [10]:
# Render the CRISPR vs. ORF t-test results as a markdown table.
print(
    crispr_orf_df.loc[
        :,
        [
            "t-test",
            "Feature_set",
            "Cell",
            "compound-crispr",
            "compound-orf",
            "statistic",
            "pvalue",
        ],
    ].to_markdown(index=False)
)

| t-test                                  | Feature_set   | Cell   | compound-crispr             | compound-orf             |   statistic | pvalue   |
|:----------------------------------------|:--------------|:-------|:----------------------------|:-------------------------|------------:|:---------|
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | A549   | compound_long-crispr_long   | compound_long-orf_long   |    8.07241  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | A549   | compound_long-crispr_long   | compound_long-orf_short  |    7.87323  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | A549   | compound_long-crispr_short  | compound_long-orf_long   |    8.75641  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | A549   | compound_long-crispr_short  | compound_long-orf_short  |    8.47962  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | CellProfiler  | U2OS   | compound_long-crispr_long

Write the t-test result tables to CSV files in `output/`.

In [11]:
# Persist each result table, restricted to its reporting columns (in this
# order), without the row index.
replicability_df.to_csv(
    "output/replicability_pvalue.csv",
    columns=[
        "t-test",
        "Feature_set",
        *replicability_experimental_variables,
        "statistic",
        "pvalue",
    ],
    index=False,
)
matching_df.to_csv(
    "output/matching_pvalue.csv",
    columns=[
        "t-test",
        "Feature_set",
        *matching_experimental_variables,
        "statistic",
        "pvalue",
    ],
    index=False,
)
gene_compound_matching_df.to_csv(
    "output/gene_compound_matching_pvalue.csv",
    columns=[
        "t-test",
        "Feature_set",
        *gene_compound_matching_experimental_variables,
        "statistic",
        "pvalue",
    ],
    index=False,
)
crispr_orf_df.to_csv(
    "output/crispr_orf_pvalue.csv",
    columns=[
        "t-test",
        "Feature_set",
        "Cell",
        "compound-crispr",
        "compound-orf",
        "statistic",
        "pvalue",
    ],
    index=False,
)