In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels

In [2]:
protein_class_list = [
    "Enzymes",
    "Transporters",
    "G-protein coupled receptors",
    "Transcription factors",
    "Immunoglobulin genes",
    "T-cell receptor genes",
    "Predicted secreted proteins",
    "Predicted membrane proteins",
]

variable = "protein_class"
col_name = f"Metadata_{variable}"
col_name_in_metadata = "Metadata_protein_class"
output_df = pd.DataFrame()

In [3]:
crispr_plate_metadata_df = pd.read_csv(
    "output/crispr_well_plate_platemap.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Platemap"],
)
crispr_plate_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap
0,JCP2022_085227,platemap_1
1,JCP2022_800002,platemap_1
2,JCP2022_800573,platemap_1
3,JCP2022_806794,platemap_1
4,JCP2022_802800,platemap_1


In [4]:
crispr_annotation_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz",
        sep="\t",
        usecols=["Metadata_JCP2022", col_name_in_metadata],
    )
    .assign(col=lambda x: x[col_name_in_metadata].str.split("|"))
    .explode("col")
    .query("col in @protein_class_list")
    .drop(columns=[col_name_in_metadata])
    .rename(columns={"col": col_name})
    .drop_duplicates(subset="Metadata_JCP2022", keep="first")
)

crispr_df = (
    crispr_plate_metadata_df.merge(crispr_annotation_df, on="Metadata_JCP2022", how="left")
    .dropna(subset=[col_name])
    .drop_duplicates(subset=["Metadata_JCP2022", "Metadata_Platemap"])
)

crispr_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap,Metadata_protein_class
4,JCP2022_802800,platemap_1,Enzymes
5,JCP2022_802216,platemap_1,Enzymes
7,JCP2022_800515,platemap_1,Enzymes
8,JCP2022_806435,platemap_1,Predicted membrane proteins
9,JCP2022_807181,platemap_1,Enzymes


Fisher's exact test

In [5]:
fisher_df = pd.DataFrame(
    columns=[
        "platemap",
        f"{variable}_False_in_other_platemaps" f"{variable}_False_in_platemap",
        f"{variable}_True_in_other_platemaps",
        f"{variable}_True_in_platemap",
        "odds_ratio",
        "pvalue",
        "adjusted_pvalue",
    ]
)
for protein_class in protein_class_list:
    protein_class_df = pd.DataFrame()
    for platemap in crispr_df.Metadata_Platemap.unique():
        df = crispr_df.assign(
            platemap_x=lambda x: np.where(x.Metadata_Platemap == platemap, True, False)
        ).assign(
            protein_class_x=lambda x: np.where(
                x.Metadata_protein_class == protein_class, True, False
            )
        )

        table = sm.stats.Table.from_data(df[["platemap_x", "protein_class_x"]])
        if table.table.shape != (2, 2):
            continue

        odds_ratio, pvalue = stats.fisher_exact(table.table)

        protein_class_df = pd.concat(
            [
                protein_class_df,
                pd.DataFrame(
                    {
                        "protein_class": protein_class,
                        "platemap": platemap,
                        f"{variable}_False_in_other_platemaps": int(table.table[0, 0]),
                        f"{variable}_False_in_platemap": int(table.table[0, 1]),
                        f"{variable}_True_in_other_platemaps": int(table.table[1, 0]),
                        f"{variable}_True_in_platemap": int(table.table[1, 1]),
                        "odds_ratio": odds_ratio,
                        "pvalue": pvalue,
                    },
                    index=[0],
                ),
            ],
            ignore_index=True,
        )
    
    if table.table.shape != (2, 2):
        continue
    
    protein_class_df["adjusted_pvalue"] = statsmodels.stats.multitest.fdrcorrection(
        protein_class_df.pvalue
    )[1]
    output_df = pd.concat([output_df, protein_class_df], ignore_index=True)

In [6]:
print(
    output_df.sort_values(by="adjusted_pvalue")
    .query("adjusted_pvalue < 0.05")
    .to_markdown(index=False)
)

output_df.to_csv(f"output/{variable}_fisher_exact_crispr.tsv", sep="\t", index=False)

| protein_class               | platemap    |   protein_class_False_in_other_platemaps |   protein_class_False_in_platemap |   protein_class_True_in_other_platemaps |   protein_class_True_in_platemap |   odds_ratio |       pvalue |   adjusted_pvalue |
|:----------------------------|:------------|-----------------------------------------:|----------------------------------:|----------------------------------------:|---------------------------------:|-------------:|-------------:|------------------:|
| G-protein coupled receptors | platemap_15 |                                     6114 |                               138 |                                      93 |                              221 |  105.282     | 7.3269e-236  |      2.05153e-234 |
| Predicted membrane proteins | platemap_17 |                                     5233 |                              1026 |                                      27 |                              280 |   52.8929    | 5.36512e-172 |      1.50223