In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels

In [2]:
variable = "expression"
col_name = f"Metadata_{variable}"
col_name_in_metadata = "Metadata_TPM"
output_df = pd.DataFrame()

In [3]:
crispr_plate_metadata_df = pd.read_csv(
    "output/crispr_well_plate_platemap.csv.gz",
    usecols=["Metadata_JCP2022", "Metadata_Platemap"],
)
crispr_plate_metadata_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap
0,JCP2022_085227,platemap_1
1,JCP2022_800002,platemap_1
2,JCP2022_800573,platemap_1
3,JCP2022_806794,platemap_1
4,JCP2022_802800,platemap_1


In [4]:
crispr_annotation_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/output/crispr_metadata.tsv.gz",
        sep="\t",
        usecols=["Metadata_JCP2022", col_name_in_metadata],
    )
    .assign(
        col=lambda x: np.where(
            x[col_name_in_metadata] > np.nanpercentile(x.Metadata_TPM, 75),
            "True",
            "False",
        )
    )
    .rename(columns={"col": col_name})
)

crispr_df = (
    crispr_plate_metadata_df.merge(crispr_annotation_df, on="Metadata_JCP2022", how="left")
    .dropna(subset=[col_name])
    .drop_duplicates(subset=["Metadata_JCP2022", "Metadata_Platemap"])
)

crispr_df.head()

Unnamed: 0,Metadata_JCP2022,Metadata_Platemap,Metadata_TPM,Metadata_expression
1,JCP2022_800002,platemap_1,,False
2,JCP2022_800573,platemap_1,4.354734,False
3,JCP2022_806794,platemap_1,8.083001,True
4,JCP2022_802800,platemap_1,4.522307,False
5,JCP2022_802216,platemap_1,6.104337,True


Fisher's exact test

In [5]:
fisher_df = pd.DataFrame(
    columns=[
        "platemap",
        f"{variable}_False_in_other_platemaps"
        f"{variable}_False_in_platemap",
        f"{variable}_True_in_other_platemaps",
        f"{variable}_True_in_platemap",
        "odds_ratio",
        "pvalue",
        "adjusted_pvalue",
    ]
)

for platemap in crispr_df.Metadata_Platemap.unique():
    df = crispr_df.assign(platemap_x = lambda x: np.where(x.Metadata_Platemap == platemap, True, False))
    table = sm.stats.Table.from_data(df[["platemap_x", col_name]])

    odds_ratio, pvalue = stats.fisher_exact(table.table)

    output_df = pd.concat(
        [
            output_df,
            pd.DataFrame(
                {
                    "platemap": platemap,
                    f"{variable}_False_in_other_platemaps": int(table.table[0, 0]),
                    f"{variable}_False_in_platemap": int(table.table[0, 1]),
                    f"{variable}_True_in_other_platemaps": int(table.table[1, 0]),
                    f"{variable}_True_in_platemap": int(table.table[1, 1]),
                    "odds_ratio": odds_ratio,
                    "pvalue": pvalue,
                },
                index=[0],
            ),
        ],
        ignore_index=True,
    )

output_df["adjusted_pvalue"] = statsmodels.stats.multitest.fdrcorrection(output_df.pvalue)[1]

In [6]:
print(
    output_df.sort_values(by="adjusted_pvalue")
    .query("adjusted_pvalue < 0.05")
    .to_markdown(index=False)
)

output_df.to_csv(f"output/{variable}_fisher_exact_crispr.tsv", sep="\t", index=False)

| platemap    |   expression_False_in_other_platemaps |   expression_False_in_platemap |   expression_True_in_other_platemaps |   expression_True_in_platemap |   odds_ratio |      pvalue |   adjusted_pvalue |
|:------------|--------------------------------------:|-------------------------------:|-------------------------------------:|------------------------------:|-------------:|------------:|------------------:|
| platemap_17 |                                  6052 |                           2106 |                                  300 |                            23 |     0.220317 | 2.04133e-17 |       5.91984e-16 |
| platemap_15 |                                  6059 |                           2099 |                                  293 |                            30 |     0.295558 | 2.74292e-13 |       3.97724e-12 |
| platemap_3  |                                  6138 |                           2021 |                                  214 |                           108 |     