In [1]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd
import scipy.stats as sps
import utils

Perform t-tests for the following scenarios to check whether the mAP values differ significantly:
- perturbation identification and matching vs. baseline.
- ORF vs. CRISPR matching to compounds.

In [2]:
# Load the three mAP tables (CellProfiler-derived, going by the file names),
# one per task: replicability, within-modality matching, and
# gene-compound matching.
(
    cp_replicability_df,
    cp_matching_df,
    cp_gene_compound_matching_df,
) = map(
    pd.read_csv,
    (
        "output/cellprofiler_replicability_map.csv",
        "output/cellprofiler_matching_map.csv",
        "output/cellprofiler_gene_compound_matching_map.csv",
    ),
)

# Empty placeholders for the t-test result tables filled in by later cells.
replicability_df, matching_df, gene_compound_matching_df, crispr_orf_df = (
    pd.DataFrame() for _ in range(4)
)

# Columns used to group each mAP table into individual experiments.
replicability_experimental_variables = ("Modality", "Cell", "time")
matching_experimental_variables = ("Modality", "Cell", "time")
gene_compound_matching_experimental_variables = ("Modality1", "Modality2", "Cell")
crispr_orf_experimental_variables = ["Modality1", "Cell"]

### Perturbation detection mAP vs. baseline

In [3]:
# One-sample t-test of the replicability mAP values against 0, run separately
# for every combination of the grouping variables present in the mAP table.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and rebuilding the frame
# from scratch keeps this cell safe to re-run (no duplicated rows).
replicability_records = []

for experiment, experiment_df in cp_replicability_df.groupby(
    list(replicability_experimental_variables)
):
    ttest_1sample = sps.ttest_1samp(experiment_df.mean_average_precision.values, 0)

    record = {"t-test": "Replicability"}
    # `experiment` is the tuple of group-key values, ordered like the
    # grouping variables, so the two can be paired positionally.
    for variable, value in zip(replicability_experimental_variables, experiment):
        record[f"{variable}"] = f"{value}"
    # Values are stored as strings, exactly as before, so the printed table
    # and the CSV written later are unchanged.
    record["statistic"] = f"{ttest_1sample.statistic}"
    record["pvalue"] = f"{utils.convert_pvalue(ttest_1sample.pvalue)}"
    replicability_records.append(record)

# Explicit columns guarantee the expected schema even if no group was found.
replicability_df = pd.DataFrame(
    replicability_records,
    columns=["t-test", *replicability_experimental_variables, "statistic", "pvalue"],
)

In [4]:
# Render the replicability t-test results as a markdown table, with the
# columns in a fixed, readable order.
replicability_columns = [
    "t-test",
    *replicability_experimental_variables,
    "statistic",
    "pvalue",
]
replicability_markdown = replicability_df[replicability_columns].to_markdown(
    index=False
)
print(replicability_markdown)

| t-test        | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------|:-----------|:-------|:-------|------------:|:---------|
| Replicability | compound   | A549   | long   |     54.8318 | <0.05    |
| Replicability | compound   | A549   | short  |     33.5299 | <0.05    |
| Replicability | compound   | U2OS   | long   |     28.4879 | <0.05    |
| Replicability | compound   | U2OS   | short  |     35.3446 | <0.05    |
| Replicability | crispr     | A549   | long   |     32.6584 | <0.05    |
| Replicability | crispr     | A549   | short  |     33.3071 | <0.05    |
| Replicability | crispr     | U2OS   | long   |     32.5284 | <0.05    |
| Replicability | crispr     | U2OS   | short  |     33.2953 | <0.05    |
| Replicability | orf        | A549   | long   |     17.6019 | <0.05    |
| Replicability | orf        | A549   | short  |     17.6278 | <0.05    |
| Replicability | orf        | U2OS   | long   |     16.2907 | <0.05    |
| Replicability | orf        | U2OS   

### Within perturbation matching mAP vs. baseline

In [5]:
# One-sample t-test of the within-modality matching mAP values against 0, run
# separately for every combination of the grouping variables present in the
# mAP table.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and rebuilding the frame
# from scratch keeps this cell safe to re-run (no duplicated rows).
matching_records = []

for experiment, experiment_df in cp_matching_df.groupby(
    list(matching_experimental_variables)
):
    ttest_1sample = sps.ttest_1samp(experiment_df.mean_average_precision.values, 0)

    record = {"t-test": "Same perturbation type matching"}
    # `experiment` is the tuple of group-key values, ordered like the
    # grouping variables, so the two can be paired positionally.
    for variable, value in zip(matching_experimental_variables, experiment):
        record[f"{variable}"] = f"{value}"
    # Values are stored as strings, exactly as before, so the printed table
    # and the CSV written later are unchanged.
    record["statistic"] = f"{ttest_1sample.statistic}"
    record["pvalue"] = f"{utils.convert_pvalue(ttest_1sample.pvalue)}"
    matching_records.append(record)

# Explicit columns guarantee the expected schema even if no group was found.
matching_df = pd.DataFrame(
    matching_records,
    columns=["t-test", *matching_experimental_variables, "statistic", "pvalue"],
)


In [6]:
# Render the within-modality matching t-test results as a markdown table,
# with the columns in a fixed, readable order.
matching_columns = [
    "t-test",
    *matching_experimental_variables,
    "statistic",
    "pvalue",
]
matching_markdown = matching_df[matching_columns].to_markdown(index=False)
print(matching_markdown)

| t-test                          | Modality   | Cell   | time   |   statistic | pvalue   |
|:--------------------------------|:-----------|:-------|:-------|------------:|:---------|
| Same perturbation type matching | compound   | A549   | long   |    11.5802  | <0.05    |
| Same perturbation type matching | compound   | A549   | short  |    10.6303  | <0.05    |
| Same perturbation type matching | compound   | U2OS   | long   |    12.8986  | <0.05    |
| Same perturbation type matching | compound   | U2OS   | short  |    12.047   | <0.05    |
| Same perturbation type matching | crispr     | A549   | long   |     4.59245 | <0.05    |
| Same perturbation type matching | crispr     | A549   | short  |     4.22521 | <0.05    |
| Same perturbation type matching | crispr     | U2OS   | long   |     4.64332 | <0.05    |
| Same perturbation type matching | crispr     | U2OS   | short  |     4.1434  | <0.05    |


### Compound to genetic perturbation matching mAP vs. baseline

In [7]:
# One-sample t-test of the compound-to-gene matching mAP values against 0,
# run separately for every combination of the grouping variables present in
# the mAP table.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and rebuilding the frame
# from scratch keeps this cell safe to re-run (no duplicated rows).
gene_compound_matching_records = []

for experiment, experiment_df in cp_gene_compound_matching_df.groupby(
    list(gene_compound_matching_experimental_variables)
):
    ttest_1sample = sps.ttest_1samp(experiment_df.mean_average_precision.values, 0)

    # NOTE(review): "Compoung" looks like a typo for "Compound". The label is
    # kept byte-for-byte because it is written to the output CSV and other
    # code may match on it -- confirm before renaming.
    record = {
        "t-test": "Compoung-gene matching",
        "Feature_set": "CellProfiler",
    }
    # `experiment` is the tuple of group-key values, ordered like the
    # grouping variables, so the two can be paired positionally.
    for variable, value in zip(
        gene_compound_matching_experimental_variables, experiment
    ):
        record[f"{variable}"] = f"{value}"
    # Values are stored as strings, exactly as before, so the printed table
    # and the CSV written later are unchanged.
    record["statistic"] = f"{ttest_1sample.statistic}"
    record["pvalue"] = f"{utils.convert_pvalue(ttest_1sample.pvalue)}"
    gene_compound_matching_records.append(record)

# Explicit columns guarantee the expected schema even if no group was found.
gene_compound_matching_df = pd.DataFrame(
    gene_compound_matching_records,
    columns=[
        "t-test",
        "Feature_set",
        *gene_compound_matching_experimental_variables,
        "statistic",
        "pvalue",
    ],
)

In [8]:
# Render the compound-to-gene matching t-test results as a markdown table,
# with the columns in a fixed, readable order.
gene_compound_matching_columns = [
    "t-test",
    *gene_compound_matching_experimental_variables,
    "statistic",
    "pvalue",
]
gene_compound_matching_markdown = gene_compound_matching_df[
    gene_compound_matching_columns
].to_markdown(index=False)
print(gene_compound_matching_markdown)

| t-test                 | Modality1      | Modality2    | Cell   |   statistic | pvalue   |
|:-----------------------|:---------------|:-------------|:-------|------------:|:---------|
| Compoung-gene matching | compound_long  | crispr_long  | A549   |     7.06491 | <0.05    |
| Compoung-gene matching | compound_long  | crispr_long  | U2OS   |     5.97384 | <0.05    |
| Compoung-gene matching | compound_long  | crispr_short | A549   |     6.83458 | <0.05    |
| Compoung-gene matching | compound_long  | crispr_short | U2OS   |     5.19219 | <0.05    |
| Compoung-gene matching | compound_long  | orf_long     | A549   |    13.6708  | <0.05    |
| Compoung-gene matching | compound_long  | orf_long     | U2OS   |     6.1091  | <0.05    |
| Compoung-gene matching | compound_long  | orf_short    | A549   |    14.0282  | <0.05    |
| Compoung-gene matching | compound_long  | orf_short    | U2OS   |     5.19261 | <0.05    |
| Compoung-gene matching | compound_short | crispr_long  | A549   |   

### Compound to genetic perturbation matching mAP - ORF vs. CRISPR

In [9]:
# Two-sample t-test comparing compound-to-CRISPR against compound-to-ORF
# matching mAP values. For every (Modality1, Cell) group, each CRISPR
# experiment is compared with each ORF experiment.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and rebuilding the frame
# from scratch keeps this cell safe to re-run (no duplicated rows).
crispr_experiments = ["crispr_long", "crispr_short"]
orf_experiments = ["orf_long", "orf_short"]

crispr_orf_records = []

for experiment, experiment_df in cp_gene_compound_matching_df.groupby(
    list(crispr_orf_experimental_variables)
):
    # Slice the mAP values of each genetic modality once per group instead of
    # re-filtering the frame inside the nested loop.
    map_values_by_modality = {
        modality: experiment_df.loc[
            experiment_df["Modality2"] == modality, "mean_average_precision"
        ].values
        for modality in crispr_experiments + orf_experiments
    }

    for crispr in crispr_experiments:
        for orf in orf_experiments:
            ttest_2sample = sps.ttest_ind(
                map_values_by_modality[crispr],
                map_values_by_modality[orf],
            )

            # NOTE(review): "Compoung" looks like a typo for "Compound". The
            # label is kept byte-for-byte because it is written to the output
            # CSV and other code may match on it -- confirm before renaming.
            crispr_orf_records.append(
                {
                    "t-test": "Compoung-gene matching - CRISPR vs. ORF",
                    "Feature_set": "CellProfiler",
                    "compound-crispr": f"{experiment[0]}-{crispr}",
                    "Cell": f"{experiment[1]}",
                    "compound-orf": f"{experiment[0]}-{orf}",
                    "statistic": f"{ttest_2sample.statistic}",
                    "pvalue": f"{utils.convert_pvalue(ttest_2sample.pvalue)}",
                }
            )

# Explicit columns guarantee the expected schema even if no group was found.
crispr_orf_df = pd.DataFrame(
    crispr_orf_records,
    columns=[
        "t-test",
        "Feature_set",
        "compound-crispr",
        "Cell",
        "compound-orf",
        "statistic",
        "pvalue",
    ],
)

In [10]:
# Render the CRISPR-vs-ORF t-test results as a markdown table, with the
# columns in a fixed, readable order.
crispr_orf_columns = [
    "t-test",
    "Cell",
    "compound-crispr",
    "compound-orf",
    "statistic",
    "pvalue",
]
crispr_orf_markdown = crispr_orf_df[crispr_orf_columns].to_markdown(index=False)
print(crispr_orf_markdown)

| t-test                                  | Cell   | compound-crispr             | compound-orf             |   statistic | pvalue   |
|:----------------------------------------|:-------|:----------------------------|:-------------------------|------------:|:---------|
| Compoung-gene matching - CRISPR vs. ORF | A549   | compound_long-crispr_long   | compound_long-orf_long   |    -4.78669 | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | A549   | compound_long-crispr_long   | compound_long-orf_short  |    -4.07287 | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | A549   | compound_long-crispr_short  | compound_long-orf_long   |    -5.07941 | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | A549   | compound_long-crispr_short  | compound_long-orf_short  |    -4.3409  | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | U2OS   | compound_long-crispr_long   | compound_long-orf_long   |    -3.18953 | <0.05    |
| Compoung-gene matching - CRISPR vs. ORF | U2OS   | co

Write to file

In [11]:
# Persist each t-test result table as CSV. Every entry pairs a result frame
# with the ordered columns to keep and the destination file; the tables are
# written in the order listed.
pvalue_outputs = (
    (
        replicability_df,
        ["t-test", *replicability_experimental_variables, "statistic", "pvalue"],
        "output/replicability_pvalue.csv",
    ),
    (
        matching_df,
        ["t-test", *matching_experimental_variables, "statistic", "pvalue"],
        "output/matching_pvalue.csv",
    ),
    (
        gene_compound_matching_df,
        [
            "t-test",
            *gene_compound_matching_experimental_variables,
            "statistic",
            "pvalue",
        ],
        "output/gene_compound_matching_pvalue.csv",
    ),
    (
        crispr_orf_df,
        ["t-test", "Cell", "compound-crispr", "compound-orf", "statistic", "pvalue"],
        "output/crispr_orf_pvalue.csv",
    ),
)

for result_df, output_columns, output_path in pvalue_outputs:
    result_df[output_columns].to_csv(output_path, index=False)