In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels

In [2]:
variable = "expression"
col_name = f"Metadata_{variable}"
col_name_in_metadata = "Metadata_TPM"
output_df = pd.DataFrame()

In [3]:
orf_plate_metadata_df = pd.read_csv(
    "output/orf_well_plate_platemap.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Platemap"],
)
orf_plate_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap
0,JCP2022_905588,platemap_1
1,JCP2022_912241,platemap_1
2,JCP2022_900266,platemap_1
3,JCP2022_915129,platemap_1
4,JCP2022_907177,platemap_1


In [4]:
orf_annotation_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/orf_metadata.tsv.gz",
        sep="\t",
        usecols=["Metadata_JCP2022", col_name_in_metadata],
    )
    .assign(
        col=lambda x: np.where(
            x[col_name_in_metadata] > np.nanpercentile(x.Metadata_TPM, 75),
            "True",
            "False",
        )
    )
    .rename(columns={"col": col_name})
)

orf_df = (
    orf_plate_metadata_df.merge(orf_annotation_df, on="Metadata_JCP2022", how="left")
    .dropna(subset=[col_name])
    .drop_duplicates(subset=["Metadata_JCP2022", "Metadata_Platemap"])
)

orf_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap,Metadata_TPM,Metadata_expression
0,JCP2022_905588,platemap_1,5.727104,True
1,JCP2022_912241,platemap_1,0.0,False
2,JCP2022_900266,platemap_1,0.0,False
3,JCP2022_915129,platemap_1,,False
4,JCP2022_907177,platemap_1,4.035624,False


Fisher's exact test

In [5]:
fisher_df = pd.DataFrame(
    columns=[
        "platemap",
        f"{variable}_False_in_other_platemaps"
        f"{variable}_False_in_platemap",
        f"{variable}_True_in_other_platemaps",
        f"{variable}_True_in_platemap",
        "odds_ratio",
        "pvalue",
        "adjusted_pvalue",
    ]
)

for platemap in orf_df.Metadata_Platemap.unique():
    df = orf_df.assign(platemap_x = lambda x: np.where(x.Metadata_Platemap == platemap, True, False))
    table = sm.stats.Table.from_data(df[["platemap_x", col_name]])

    odds_ratio, pvalue = stats.fisher_exact(table.table)

    output_df = pd.concat(
        [
            output_df,
            pd.DataFrame(
                {
                    "platemap": platemap,
                    f"{variable}_False_in_other_platemaps": int(table.table[0, 0]),
                    f"{variable}_False_in_platemap": int(table.table[0, 1]),
                    f"{variable}_True_in_other_platemaps": int(table.table[1, 0]),
                    f"{variable}_True_in_platemap": int(table.table[1, 1]),
                    "odds_ratio": odds_ratio,
                    "pvalue": pvalue,
                },
                index=[0],
            ),
        ],
        ignore_index=True,
    )

output_df["adjusted_pvalue"] = statsmodels.stats.multitest.fdrcorrection(output_df.pvalue)[1]

In [6]:
print(
    output_df.sort_values(by="adjusted_pvalue")
    .query("adjusted_pvalue < 0.05")
    .to_markdown(index=False)
)

output_df.to_csv(f"output/{variable}_fisher_exact_orf.tsv", sep="\t", index=False)

| platemap    |   expression_False_in_other_platemaps |   expression_False_in_platemap |   expression_True_in_other_platemaps |   expression_True_in_platemap |   odds_ratio |      pvalue |   adjusted_pvalue |
|:------------|--------------------------------------:|-------------------------------:|-------------------------------------:|------------------------------:|-------------:|------------:|------------------:|
| platemap_43 |                                 11521 |                           3611 |                                  320 |                            25 |     0.24926  | 1.30654e-15 |       5.87942e-14 |
| platemap_30 |                                 11532 |                           3603 |                                  309 |                            33 |     0.341819 | 4.04272e-11 |       9.09612e-10 |
| platemap_39 |                                 11539 |                           3595 |                                  302 |                            41 |     