In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm

In [2]:
operations = {
    "orf": "wellpos_cc_var_mad_outlier_featselect_sphering_harmony",
    "crispr": "wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected",
}

protein_class_list = [
    "Enzymes",
    "Transporters",
    "G-protein coupled receptors",
    "Transcription factors",
    "Immunoglobulin genes",
    "T-cell receptor genes",
    "Predicted secreted proteins",
    "Predicted membrane proteins",
]

variable = "protein_class"
col_name = f"Metadata_{variable}"
col_name_in_metadata = "Metadata_protein_class"
output_df = pd.DataFrame()

In [3]:
# Read phenotypic activity

orf_phenotypic_activity_df = pd.read_csv(
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations['orf']}.csv.gz"
)[["Metadata_JCP2022", "mean_average_precision", "below_corrected_p"]]

crispr_phenotypic_activity_df = pd.read_csv(
    f"../03.retrieve-annotations/output/phenotypic-activity-{operations['crispr']}.csv.gz"
)[["Metadata_JCP2022", "mean_average_precision", "below_corrected_p"]]

In [4]:
# Read metadata

orf_metadata_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/orf_metadata.tsv.gz", sep="\t"
    )[["Metadata_JCP2022", col_name_in_metadata]]
    .assign(col=lambda x: x[col_name_in_metadata].str.split("|"))
    .explode("col")
    .query("col in @protein_class_list")
    .drop(columns=[col_name_in_metadata])
    .rename(columns={"col": col_name})
    .drop_duplicates(subset="Metadata_JCP2022", keep="first")
)

crispr_metadata_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz", sep="\t"
    )[["Metadata_JCP2022", col_name_in_metadata]]
    .assign(col=lambda x: x[col_name_in_metadata].str.split("|"))
    .explode("col")
    .query("col in @protein_class_list")
    .drop(columns=[col_name_in_metadata])
    .rename(columns={"col": col_name})
    .drop_duplicates(subset="Metadata_JCP2022", keep="first")
)

In [5]:
orf_df = orf_phenotypic_activity_df.merge(
    orf_metadata_df, on="Metadata_JCP2022", how="inner"
).dropna(subset=[col_name])
crispr_df = crispr_phenotypic_activity_df.merge(
    crispr_metadata_df, on="Metadata_JCP2022", how="inner"
).dropna(subset=[col_name])

In [6]:
# Create binary column for each protein class
for protein_class in protein_class_list:
    orf_df = orf_df.assign(
        **{
            protein_class: lambda x: x.apply(
                lambda y: True if y[col_name] == protein_class else False, axis=1
            )
        }
    )

    crispr_df = crispr_df.assign(
        **{
            protein_class: lambda x: x.apply(
                lambda y: True if y[col_name] == protein_class else False, axis=1
            )
        }
    )

Fisher's exact test ORF

In [7]:
for protein_class in protein_class_list:
    table = sm.stats.Table.from_data(orf_df[["below_corrected_p", protein_class]])

    odds_ratio, pvalue = stats.fisher_exact(table.table)
    res = stats.contingency.odds_ratio(table.table.astype(int))
    ci_low, ci_high = res.confidence_interval(confidence_level=0.95)
    print(f"odds ratio: {odds_ratio}, p-value: {pvalue}, confidence interval: {ci_low} - {ci_high}")

    output_df = pd.concat(
        [
            output_df,
            pd.DataFrame(
                {
                    "protein_class": protein_class,
                    "modality": ["ORF"],
                    f"{variable}_False_phenotype_False": int(table.table[0, 0]),
                    f"{variable}_False_phenotype_True": int(table.table[0, 1]),
                    f"{variable}_True_phenotype_False": int(table.table[1, 0]),
                    f"{variable}_True_phenotype_True": int(table.table[1, 1]),
                    "odds_ratio": odds_ratio,
                    "pvalue": pvalue,
                    "confidence_interval_low": ci_low,
                    "confidence_interval_high": ci_high,
                },
                index=[0],
            ),
        ], 
        ignore_index=True
    )

output_df.query("modality == 'ORF' and pvalue < 0.05")

odds ratio: 0.9949103187563719, p-value: 0.9250511857618275, confidence interval: 0.9063399496652427 - 1.0922288015076935
odds ratio: 0.9144278406573488, p-value: 0.4617666266071766, confidence interval: 0.7154385763139027 - 1.1714352978981222
odds ratio: 1.1165589995448773, p-value: 0.34336725511241745, confidence interval: 0.8907592891241077 - 1.4037902828086615
odds ratio: 0.9638579250315878, p-value: 0.6264591818989922, confidence interval: 0.8305981840122924 - 1.1194085299606236
odds ratio: 0.7616330616330617, p-value: 0.4727562861187794, confidence interval: 0.35722564044814703 - 1.6405261707665624
odds ratio: 0.09592011238013803, p-value: 0.008797768065473482, confidence interval: 0.0021278497246290493 - 0.747396970909215
odds ratio: 0.842298943673855, p-value: 0.024421526174343556, confidence interval: 0.7246651601124018 - 0.9796100520353866
odds ratio: 1.111071927396797, p-value: 0.03775611075361108, confidence interval: 1.0056371775102504 - 1.227826606684178


Unnamed: 0,protein_class,modality,protein_class_False_phenotype_False,protein_class_False_phenotype_True,protein_class_True_phenotype_False,protein_class_True_phenotype_True,odds_ratio,pvalue,confidence_interval_low,confidence_interval_high
5,T-cell receptor genes,ORF,3141,7,4678,1,0.09592,0.008798,0.002128,0.747397
6,Predicted secreted proteins,ORF,2798,350,4233,446,0.842299,0.024422,0.724665,0.97961
7,Predicted membrane proteins,ORF,2229,919,3209,1470,1.111072,0.037756,1.005637,1.227827


Fisher's exact test CRISPR

In [8]:
for protein_class in protein_class_list:
    table = sm.stats.Table.from_data(crispr_df[["below_corrected_p", protein_class]])
    if table.table.shape != (2, 2):
        continue

    odds_ratio, pvalue = stats.fisher_exact(table.table)
    res = stats.contingency.odds_ratio(table.table.astype(int))
    ci_low, ci_high = res.confidence_interval(confidence_level=0.95)
    print(f"odds ratio: {odds_ratio}, p-value: {pvalue}, confidence interval: {ci_low} - {ci_high}")

    output_df = pd.concat(
        [
            output_df,
            pd.DataFrame(
                {
                    "protein_class": protein_class,
                    "modality": ["CRISPR"],
                    f"{variable}_False_phenotype_False": int(table.table[0, 0]),
                    f"{variable}_False_phenotype_True": int(table.table[0, 1]),
                    f"{variable}_True_phenotype_False": int(table.table[1, 0]),
                    f"{variable}_True_phenotype_True": int(table.table[1, 1]),
                    "odds_ratio": odds_ratio,
                    "pvalue": pvalue,
                    "confidence_interval_low": ci_low,
                    "confidence_interval_high": ci_high,
                },
                index=[0],
            ),
        ], 
        ignore_index=True
    )

output_df.query("modality == 'CRISPR' and pvalue < 0.05")

odds ratio: 1.163054380532903, p-value: 0.006196696490823536, confidence interval: 1.0432117111042352 - 1.296640576638658
odds ratio: 1.5042664670658683, p-value: 0.04989331541426369, confidence interval: 0.9998750067267231 - 2.318883874543217
odds ratio: 0.9000020950744799, p-value: 0.37535562618973695, confidence interval: 0.7132258846259627 - 1.1405077771276468
odds ratio: 1.1090938728936162, p-value: 0.17611840451904198, confidence interval: 0.9531365274313217 - 1.2927407360105423
odds ratio: 0.5606444134276455, p-value: 4.505137289271188e-06, confidence interval: 0.4368847730942377 - 0.7208516103473468
odds ratio: 0.8468356861615288, p-value: 0.014733709159547548, confidence interval: 0.74050497129095 - 0.9693048759389199


Unnamed: 0,protein_class,modality,protein_class_False_phenotype_False,protein_class_False_phenotype_True,protein_class_True_phenotype_False,protein_class_True_phenotype_True,odds_ratio,pvalue,confidence_interval_low,confidence_interval_high
8,Enzymes,CRISPR,980,966,1994,2286,1.163054,0.006197,1.043212,1.296641
9,Transporters,CRISPR,1914,32,4175,105,1.504266,0.049893,0.999875,2.318884
12,Predicted secreted proteins,CRISPR,1823,123,4124,156,0.560644,5e-06,0.436885,0.720852
13,Predicted membrane proteins,CRISPR,1526,420,3471,809,0.846836,0.014734,0.740505,0.969305


In [9]:
output_df.to_csv(f"output/{variable}.csv", index=False)