In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels

In [2]:
protein_class_list = [
    "Enzymes",
    "Transporters",
    "G-protein coupled receptors",
    "Transcription factors",
    "Immunoglobulin genes",
    "T-cell receptor genes",
    "Predicted secreted proteins",
    "Predicted membrane proteins",
]

variable = "protein_class"
col_name = f"Metadata_{variable}"
col_name_in_metadata = "Metadata_protein_class"
output_df = pd.DataFrame()

In [3]:
orf_plate_metadata_df = pd.read_csv(
    "output/orf_well_plate_platemap.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Platemap"],
)
orf_plate_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap
0,JCP2022_905588,platemap_1
1,JCP2022_912241,platemap_1
2,JCP2022_900266,platemap_1
3,JCP2022_915129,platemap_1
4,JCP2022_907177,platemap_1


In [4]:
orf_annotation_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/orf_metadata.tsv.gz",
        sep="\t",
        usecols=["Metadata_JCP2022", col_name_in_metadata],
    )
    .assign(col=lambda x: x[col_name_in_metadata].str.split("|"))
    .explode("col")
    .query("col in @protein_class_list")
    .drop(columns=[col_name_in_metadata])
    .rename(columns={"col": col_name})
    .drop_duplicates(subset="Metadata_JCP2022", keep="first")
)

orf_df = (
    orf_plate_metadata_df.merge(orf_annotation_df, on="Metadata_JCP2022", how="left")
    .dropna(subset=[col_name])
    .drop_duplicates(subset=["Metadata_JCP2022", "Metadata_Platemap"])
)

orf_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap,Metadata_protein_class
0,JCP2022_905588,platemap_1,Enzymes
2,JCP2022_900266,platemap_1,Transcription factors
4,JCP2022_907177,platemap_1,Transcription factors
5,JCP2022_904426,platemap_1,Transcription factors
6,JCP2022_902262,platemap_1,Transcription factors


Fisher's exact test

In [5]:
fisher_df = pd.DataFrame(
    columns=[
        "platemap",
        f"{variable}_False_in_other_platemaps" f"{variable}_False_in_platemap",
        f"{variable}_True_in_other_platemaps",
        f"{variable}_True_in_platemap",
        "odds_ratio",
        "pvalue",
        "adjusted_pvalue",
    ]
)
for protein_class in protein_class_list:
    protein_class_df = pd.DataFrame()
    for platemap in orf_df.Metadata_Platemap.unique():
        df = orf_df.assign(
            platemap_x=lambda x: np.where(x.Metadata_Platemap == platemap, True, False)
        ).assign(
            protein_class_x=lambda x: np.where(
                x.Metadata_protein_class == protein_class, True, False
            )
        )

        table = sm.stats.Table.from_data(df[["platemap_x", "protein_class_x"]])

        odds_ratio, pvalue = stats.fisher_exact(table.table)

        protein_class_df = pd.concat(
            [
                protein_class_df,
                pd.DataFrame(
                    {
                        "protein_class": protein_class,
                        "platemap": platemap,
                        f"{variable}_False_in_other_platemaps": int(table.table[0, 0]),
                        f"{variable}_False_in_platemap": int(table.table[0, 1]),
                        f"{variable}_True_in_other_platemaps": int(table.table[1, 0]),
                        f"{variable}_True_in_platemap": int(table.table[1, 1]),
                        "odds_ratio": odds_ratio,
                        "pvalue": pvalue,
                    },
                    index=[0],
                ),
            ],
            ignore_index=True,
        )

    protein_class_df["adjusted_pvalue"] = statsmodels.stats.multitest.fdrcorrection(
        protein_class_df.pvalue
    )[1]
    output_df = pd.concat([output_df, protein_class_df], ignore_index=True)

In [6]:
print(
    output_df.sort_values(by="adjusted_pvalue")
    .query("adjusted_pvalue < 0.05")
    .to_markdown(index=False)
)

output_df.to_csv(f"output/{variable}_fisher_exact_orf.tsv", sep="\t", index=False)

| protein_class               | platemap    |   protein_class_False_in_other_platemaps |   protein_class_False_in_platemap |   protein_class_True_in_other_platemaps |   protein_class_True_in_platemap |   odds_ratio |       pvalue |   adjusted_pvalue |
|:----------------------------|:------------|-----------------------------------------:|----------------------------------:|----------------------------------------:|---------------------------------:|-------------:|-------------:|------------------:|
| Transcription factors       | platemap_3  |                                     7771 |                               705 |                                      41 |                              174 |   46.7792    | 2.71461e-138 |      1.22158e-136 |
| Transcription factors       | platemap_1  |                                     7747 |                               711 |                                      65 |                              168 |   28.1618    | 2.12676e-118 |      4.78521