In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

In [2]:
# Profile-processing pipeline tag per perturbation modality; each tag is
# interpolated into the phenotypic-activity file name read further down.
operations = dict(
    orf="wellpos_cc_var_mad_outlier_featselect_sphering_harmony",
    crispr="wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected",
)

# Annotation under test: `col_name_in_metadata` is the column as stored in the
# metadata files, `col_name` is the name it is given for this analysis.
variable = "homologue_existence"
col_name_in_metadata = "Metadata_homologue"
col_name = f"Metadata_{variable}"

# Accumulator for the per-modality test summaries written out at the end.
output_df = pd.DataFrame()

In [3]:
# Load the phenotypic-activity tables for both modalities, keeping only the
# perturbation identifier, its mean average precision and the boolean-like
# significance flag that the contingency tables are built from.
activity_columns = ["Metadata_JCP2022", "mean_average_precision", "below_corrected_p"]

orf_activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations['orf']}.csv.gz"
)
orf_phenotypic_activity_df = pd.read_csv(orf_activity_path)[activity_columns]

crispr_activity_path = (
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations['crispr']}.csv.gz"
)
crispr_phenotypic_activity_df = pd.read_csv(crispr_activity_path)[activity_columns]

In [4]:
# Load the per-perturbation metadata for both modalities. Only the identifier
# and the source annotation column are kept; the annotation is then duplicated
# under `col_name` so downstream cells can refer to it by the analysis name
# (the original `col_name_in_metadata` column is retained alongside it).
metadata_columns = ["Metadata_JCP2022", col_name_in_metadata]
annotation_copy = {col_name: lambda frame: frame[col_name_in_metadata]}

orf_metadata_df = pd.read_csv(
    "../00.download-and-process-annotations/output/orf_metadata.tsv.gz", sep="\t"
)[metadata_columns].assign(**annotation_copy)

crispr_metadata_df = pd.read_csv(
    "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz", sep="\t"
)[metadata_columns].assign(**annotation_copy)

In [5]:
# Attach the annotation to each perturbation's phenotypic activity. The inner
# join keeps only perturbations present in both tables, and rows whose
# annotation is missing are dropped so they do not enter the contingency table.
join_options = dict(on="Metadata_JCP2022", how="inner")

orf_df = pd.merge(
    orf_phenotypic_activity_df, orf_metadata_df, **join_options
).dropna(subset=[col_name])

crispr_df = pd.merge(
    crispr_phenotypic_activity_df, crispr_metadata_df, **join_options
).dropna(subset=[col_name])

Fisher's exact test ORF

In [6]:
# Contingency table of phenotypic activity against the annotation for ORFs.
# Table.from_data uses the first column for rows and the second for columns,
# so the array is indexed as [phenotype, annotation] -- matching the row and
# column labels used when printing it below.
table = sm.stats.Table.from_data(orf_df[["below_corrected_p", col_name]])
print(
    pd.DataFrame(
        table.table,
        columns=[f"{variable}_False", f"{variable}_True"],
        index=["phenotype_False", "phenotype_True"],
    ).to_markdown()
)

# Integer counts: stats.contingency.odds_ratio expects an integer table.
counts = table.table.astype(int)

odds_ratio, pvalue = stats.fisher_exact(table.table)
# NOTE(review): fisher_exact reports the sample (unconditional) odds ratio,
# whereas contingency.odds_ratio defaults to the conditional estimate, so the
# interval below belongs to a slightly different estimator -- confirm that
# reporting the two together is intended.
res = stats.contingency.odds_ratio(counts)
ci_low, ci_high = res.confidence_interval(confidence_level=0.95)
print(f"odds ratio: {odds_ratio}, p-value: {pvalue}, confidence interval: {ci_low} - {ci_high}")

# Append the ORF summary row. The counts are indexed [phenotype, annotation],
# so "<annotation>_False_phenotype_True" is counts[1, 0] and
# "<annotation>_True_phenotype_False" is counts[0, 1]; the previous version had
# these two off-diagonal cells swapped relative to their column names.
output_df = pd.concat(
    [
        output_df,
        pd.DataFrame(
            {
                "modality": ["ORF"],
                f"{variable}_False_phenotype_False": int(counts[0, 0]),
                f"{variable}_False_phenotype_True": int(counts[1, 0]),
                f"{variable}_True_phenotype_False": int(counts[0, 1]),
                f"{variable}_True_phenotype_True": int(counts[1, 1]),
                "odds_ratio": odds_ratio,
                "pvalue": pvalue,
                "confidence_interval_low": ci_low,
                "confidence_interval_high": ci_high,
            },
            index=[0],
        ),
    ],
    ignore_index=True,
)

|                 |   homologue_existence_False |   homologue_existence_True |
|:----------------|----------------------------:|---------------------------:|
| phenotype_False |                         150 |                       5036 |
| phenotype_True  |                         200 |                       6896 |
odds ratio: 1.0270055599682288, p-value: 0.8262674430677552, confidence interval: 0.8227959795708486 - 1.2797068302581898


There is no statistically significant association between homologue existence and phenotypic activity for ORFs (odds ratio ≈ 1.03, 95% CI 0.82–1.28, p ≈ 0.83).

Fisher's exact test CRISPR

In [7]:
# Contingency table of phenotypic activity against the annotation for CRISPR.
# Table.from_data uses the first column for rows and the second for columns,
# so the array is indexed as [phenotype, annotation] -- matching the row and
# column labels used when printing it below.
table = sm.stats.Table.from_data(crispr_df[["below_corrected_p", col_name]])
print(
    pd.DataFrame(
        table.table,
        columns=[f"{variable}_False", f"{variable}_True"],
        index=["phenotype_False", "phenotype_True"],
    ).to_markdown()
)

# Integer counts: stats.contingency.odds_ratio expects an integer table.
counts = table.table.astype(int)

odds_ratio, pvalue = stats.fisher_exact(table.table)
# NOTE(review): fisher_exact reports the sample (unconditional) odds ratio,
# whereas contingency.odds_ratio defaults to the conditional estimate, so the
# interval below belongs to a slightly different estimator -- confirm that
# reporting the two together is intended.
res = stats.contingency.odds_ratio(counts)
ci_low, ci_high = res.confidence_interval(confidence_level=0.95)
print(f"odds ratio: {odds_ratio}, p-value: {pvalue}, confidence interval: {ci_low} - {ci_high}")

# Append the CRISPR summary row. The counts are indexed [phenotype, annotation],
# so "<annotation>_False_phenotype_True" is counts[1, 0] and
# "<annotation>_True_phenotype_False" is counts[0, 1]; the previous version had
# these two off-diagonal cells swapped relative to their column names.
output_df = pd.concat(
    [
        output_df,
        pd.DataFrame(
            {
                "modality": ["CRISPR"],
                f"{variable}_False_phenotype_False": int(counts[0, 0]),
                f"{variable}_False_phenotype_True": int(counts[1, 0]),
                f"{variable}_True_phenotype_False": int(counts[0, 1]),
                f"{variable}_True_phenotype_True": int(counts[1, 1]),
                "odds_ratio": odds_ratio,
                "pvalue": pvalue,
                "confidence_interval_low": ci_low,
                "confidence_interval_high": ci_high,
            },
            index=[0],
        ),
    ],
    ignore_index=True,
)


|                 |   homologue_existence_False |   homologue_existence_True |
|:----------------|----------------------------:|---------------------------:|
| phenotype_False |                          22 |                       2339 |
| phenotype_True  |                          47 |                       5292 |
odds ratio: 1.0590450547151447, p-value: 0.7946975411976342, confidence interval: 0.6062844709055321 - 1.7973494134994417


There is no statistically significant association between homologue existence and phenotypic activity for CRISPR either (odds ratio ≈ 1.06, 95% CI 0.61–1.80, p ≈ 0.79).

In [8]:
# Write the accumulated Fisher's exact test summaries to a CSV named after the
# annotation under test; the row index is omitted from the file.
output_path = f"output/{variable}.csv"
output_df.to_csv(output_path, index=False)