In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

In [2]:
operations = {
    "orf": "wellpos_cc_var_mad_outlier_featselect_sphering_harmony",
    "crispr": "wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected",
}

variable = "involvement_in_disease"
col_name = f"Metadata_{variable}"
col_name_in_metadata = "Metadata_disease_involvement"
output_df = pd.DataFrame()
cancer_output_df = pd.DataFrame()

In [3]:
# Read phenotypic activity

orf_phenotypic_activity_df = pd.read_csv(
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations['orf']}.csv.gz"
)[["Metadata_JCP2022", "mean_average_precision", "below_corrected_p"]]

crispr_phenotypic_activity_df = pd.read_csv(
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations['crispr']}.csv.gz"
)[["Metadata_JCP2022", "mean_average_precision", "below_corrected_p"]]

In [4]:
# Read metadata

orf_metadata_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/orf_metadata.tsv.gz", sep="\t"
    )[["Metadata_JCP2022", col_name_in_metadata]]
    .assign(col=lambda x: np.where(x[col_name_in_metadata].notna(), True, False))
    .assign(cancer=lambda x: np.where(x[col_name_in_metadata].str.contains("Cancer-related"), True, False))
    .rename(columns={"col": col_name})
)

crispr_metadata_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz", sep="\t"
    )[["Metadata_JCP2022", col_name_in_metadata]]
    .assign(col=lambda x: np.where(x[col_name_in_metadata].notna(), True, False))
    .assign(cancer=lambda x: np.where(x[col_name_in_metadata].str.contains("Cancer-related"), True, False))
    .rename(columns={"col": col_name})
)

In [5]:
orf_df = orf_phenotypic_activity_df.merge(
    orf_metadata_df, on="Metadata_JCP2022", how="inner"
).dropna(subset=[col_name])
crispr_df = crispr_phenotypic_activity_df.merge(
    crispr_metadata_df, on="Metadata_JCP2022", how="inner"
).dropna(subset=[col_name])

Fisher's exact test ORF

In [6]:
table = sm.stats.Table.from_data(orf_df[["below_corrected_p", col_name]])
print(
    pd.DataFrame(
        table.table,
        columns=[f"{variable}_False", f"{variable}_True"],
        index=["phenotype_False", "phenotype_True"],
    ).to_markdown()
)

odds_ratio, pvalue = stats.fisher_exact(table.table)
print(f"odds ratio: {odds_ratio}, p-value: {pvalue}")

output_df = pd.concat(
    [
        output_df,
        pd.DataFrame(
            {
                "modality": ["ORF"],
                f"{variable}_False_phenotype_False": int(table.table[0, 0]),
                f"{variable}_False_phenotype_True": int(table.table[0, 1]),
                f"{variable}_True_phenotype_False": int(table.table[1, 0]),
                f"{variable}_True_phenotype_True": int(table.table[1, 1]),
                "odds_ratio": odds_ratio,
                "pvalue": pvalue,
            },
            index=[0],
        ),
    ], 
    ignore_index=True
)

|                 |   involvement_in_disease_False |   involvement_in_disease_True |
|:----------------|-------------------------------:|------------------------------:|
| phenotype_False |                           4218 |                          1704 |
| phenotype_True  |                           5274 |                          2543 |
odds ratio: 1.1935571525474424, p-value: 2.3976418901797013e-06


Odd's ratio being greater than 1 indicates that involvement in disease and phenotypic activity are positively correlated. This means that genes that are more likely to be involved in disease are more likely to have a phenotype.

Fisher's exact test CRISPR

In [7]:
table = sm.stats.Table.from_data(crispr_df[["below_corrected_p", col_name]])
print(
    pd.DataFrame(
        table.table,
        columns=[f"{variable}_False", f"{variable}_True"],
        index=["phenotype_False", "phenotype_True"],
    ).to_markdown()
)

odds_ratio, pvalue = stats.fisher_exact(table.table)
print(f"odds ratio: {odds_ratio}, p-value: {pvalue}")

output_df = pd.concat(
    [
        output_df,
        pd.DataFrame(
            {
                "modality": ["CRISPR"],
                f"{variable}_False_phenotype_False": int(table.table[0, 0]),
                f"{variable}_False_phenotype_True": int(table.table[0, 1]),
                f"{variable}_True_phenotype_False": int(table.table[1, 0]),
                f"{variable}_True_phenotype_True": int(table.table[1, 1]),
                "odds_ratio": odds_ratio,
                "pvalue": pvalue,
            },
            index=[0],
        ),
    ], 
    ignore_index=True
)


|                 |   involvement_in_disease_False |   involvement_in_disease_True |
|:----------------|-------------------------------:|------------------------------:|
| phenotype_False |                           1414 |                          1011 |
| phenotype_True  |                           3089 |                          2457 |
odds ratio: 1.1124628119497442, p-value: 0.03074647586672849


Odd's ratio being greater than 1 indicates that involvement in disease and phenotypic activity are positively correlated. This means that genes that are more likely to be involved in disease are more likely to have a phenotype.

In [8]:
output_df.to_csv(f"output/{variable}.csv", index=False)

### Cancer-related genes

In [9]:
table = sm.stats.Table.from_data(orf_df[["below_corrected_p", "cancer"]])
print(
    pd.DataFrame(
        table.table,
        columns=[f"cancer_False", f"cancer_True"],
        index=["phenotype_False", "phenotype_True"],
    ).to_markdown()
)

odds_ratio, pvalue = stats.fisher_exact(table.table)
print(f"odds ratio: {odds_ratio}, p-value: {pvalue}")

cancer_output_df = pd.concat(
    [
        cancer_output_df,
        pd.DataFrame(
            {
                "modality": ["ORF"],
                f"cancer_False_phenotype_False": int(table.table[0, 0]),
                f"cancer_False_phenotype_True": int(table.table[0, 1]),
                f"cancer_True_phenotype_False": int(table.table[1, 0]),
                f"cancer_True_phenotype_True": int(table.table[1, 1]),
                "odds_ratio": odds_ratio,
                "pvalue": pvalue,
            },
            index=[0],
        ),
    ], 
    ignore_index=True
)

|                 |   cancer_False |   cancer_True |
|:----------------|---------------:|--------------:|
| phenotype_False |           1225 |          4697 |
| phenotype_True  |           1724 |          6093 |
odds ratio: 0.9217421447367056, p-value: 0.0536058209763923


In [10]:
table = sm.stats.Table.from_data(crispr_df[["below_corrected_p", "cancer"]])
print(
    pd.DataFrame(
        table.table,
        columns=[f"cancer_False", f"cancer_True"],
        index=["phenotype_False", "phenotype_True"],
    ).to_markdown()
)

odds_ratio, pvalue = stats.fisher_exact(table.table)
print(f"odds ratio: {odds_ratio}, p-value: {pvalue}")

cancer_output_df = pd.concat(
    [
        cancer_output_df,
        pd.DataFrame(
            {
                "modality": ["CRISPR"],
                f"cancer_False_phenotype_False": int(table.table[0, 0]),
                f"cancer_False_phenotype_True": int(table.table[0, 1]),
                f"cancer_True_phenotype_False": int(table.table[1, 0]),
                f"cancer_True_phenotype_True": int(table.table[1, 1]),
                "odds_ratio": odds_ratio,
                "pvalue": pvalue,
            },
            index=[0],
        ),
    ], 
    ignore_index=True
)


|                 |   cancer_False |   cancer_True |
|:----------------|---------------:|--------------:|
| phenotype_False |            693 |          1732 |
| phenotype_True  |           1650 |          3896 |
odds ratio: 0.944757505773672, p-value: 0.297418536829328


In [11]:
cancer_output_df.to_csv(f"output/cancer.csv", index=False)