In [1]:
## use RNA-STAR conda environment
from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re
from scipy.stats import fisher_exact
import pprint as pp

### Functions

In [18]:
def create_mask(df, colnames):
    """
    Build a boolean row mask that is True for the rows worth keeping.

    NOTES:
    * Every name in colnames that contains "Deletions" is treated as a
      deletion-count column (duplicates are collapsed through a set)
    * A row is kept only when none of its deletion-count columns equals 0
      AND no column of df (deletion-related or not) is null in that row
    * The result is a Series aligned to df.index, meant for df.loc[mask]
    """
    deletion_cols = list({name for name in colnames if re.search(r"Deletions", name)})

    ## True where at least one deletion-count column is exactly 0
    has_zero_deletion = df[deletion_cols].eq(0).any(axis = 1)
    ## True where the row has no missing value in any column
    is_complete = df.notna().all(axis = 1)

    return ~has_zero_deletion & is_complete

### Loading in data

In [3]:
## Directory layout, relative to the working directory:
##   ./merged            -> input tables
##   ./pvals/<subfolder> -> folder for processed results
current_path = Path.cwd()
subfolder = "7KO-Cyto"
input_dir = current_path.joinpath("merged")
processed_folder = current_path.joinpath("pvals", subfolder)

In [4]:
## Collect every *.tsv directly inside input_dir (sorted by path) and
## read each one as a tab-separated table, keeping the same order
tsv_list = sorted(input_dir.glob("*.tsv"))
df_list = [pd.read_csv(tsv_path, sep = "\t") for tsv_path in tsv_list]

### Dataframe merging

In [5]:
## The first 17 columns of the first table are used as the merge keys
df1_colnames = list(df_list[0].columns)
selected_colnames = df1_colnames[:17]

## Only the first table is filtered with create_mask before merging
init_mask = create_mask(df_list[0], df1_colnames)
df = df_list[0].loc[init_mask]

## Outer merge keeps key combinations found in either table
df_merged = df.merge(
    df_list[1],
    on = selected_colnames,
    how = "outer",
).drop_duplicates()

df_merged

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,Rep3_A_NBS,Rep3_C_NBS,Rep3_G_NBS,Rep3_T_NBS,Rep3_Deletions_NBS,Rep3_DeletionRate_NBS,Rep3_RealRate_NBS,Rep3_TotalCoverage_NBS,7KO_AvgDeletionRate_NBS,7KO_StdDeletionRate_NBS
0,rna-NM_000037.4,UAUAG,3UTR,NC_000008.11,-,6439,6444,6441,41655075,8292,...,,,,,,,,,0.750000,
1,rna-NM_000067.3,UUUAG,3UTR,NC_000008.11,+,906,911,908,85480840,1562,...,0.0,0.0,0.0,83.0,2.0,0.023529,0.195605,85.0,0.023529,
2,rna-NM_000077.5,UUUAA,3UTR,NC_000009.12,-,647,652,649,21968080,978,...,,,,,,,,,0.001477,
3,rna-NM_000088.4,UAUAG,3UTR,NC_000017.11,-,5441,5446,5443,50184571,5914,...,,,,,,,,,1.000000,
4,rna-NM_000112.4,UUUAG,3UTR,NC_000005.10,+,3462,3467,3464,149982811,8054,...,,,,,,,,,0.142857,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3846,rna-XM_054330101.1,UUUAG,3UTR,NT_187663.1,-,3729,3734,3731,733481,5185,...,,,,,,,,,0.500000,
3847,rna-XM_054330102.1,UUUAG,3UTR,NT_187663.1,-,3773,3778,3775,733481,5229,...,,,,,,,,,0.500000,
3848,rna-XM_054330103.1,UUUAG,3UTR,NT_187663.1,-,3381,3386,3383,733481,4837,...,,,,,,,,,0.500000,
3849,rna-XM_054330104.1,UUUAG,3UTR,NT_187663.1,-,3721,3726,3723,733481,5177,...,,,,,,,,,0.500000,


In [6]:
merged_colnames = list(df_merged.columns)

## Unique "Rep<N>" labels found in the column names, ordered by N
## (numeric sort, so Rep10 would come after Rep2)
rep_list = sorted(
    {
        hit.group(1)
        for hit in (re.search(r"(Rep\d+)", col) for col in merged_colnames)
        if hit
    },
    key = lambda label: int(label[len("Rep"):]),
)

rep_list

['Rep1', 'Rep2', 'Rep3']

### Calculating p-values (DEBUG)

In [7]:
# ## Testing RegEx column search
# rep = "Rep3"

# wt_7ko = subfolder.split("-")[0]
# cyto_nuc = subfolder.split("-")[1]
# bs_del_col = [col for col in merged_colnames 
#               if re.search(f"{rep}_Deletions_BS", col)]
# nbs_del_col = [col for col in merged_colnames 
#                if re.search(f"{rep}_Deletions_NBS", col)]

# print(f"Expected: 7KO, Actual: {wt_7ko}")
# print(f"Expected: Cyto, Actual: {cyto_nuc}")
# print(f"Expected: {rep}_Deletions_BS, Actual: {bs_del_col[0]}")
# print(f"Expected: {rep}_Deletions_NBS, Actual: {nbs_del_col[0]}")

# ## Testing dictionary
# bs_base_pattern = re.compile(fr"{rep}_(A|C|G|T)_BS$")
# nbs_base_pattern = re.compile(fr"{rep}_(A|C|G|T)_NBS$")
# pattern_dict = {f"{rep}_Bases_BS": [col for col in merged_colnames 
#                                     if bs_base_pattern.match(col)],
#                 f"{rep}_Bases_NBS": [col for col in merged_colnames 
#                                      if nbs_base_pattern.match(col)]}

# print("\npattern_dict:")
# pp.pprint(pattern_dict)

# ## Testing base_cols and del_cols
# base_cols = [f"{rep}_TotalBases_BS", 
#              f"{rep}_TotalBases_NBS"]
# del_cols = [bs_del_col[0], nbs_del_col[0]]

# print("\nExpected: 2 items in list, Actual:", base_cols)
# print("Expected: 2 items in list, Actual:", del_cols)

# ## Testing fisher_cols
# fisher_cols = [base_cols[0], 
#                del_cols[0], 
#                base_cols[1], 
#                del_cols[1]]

# print("\nfisher_cols:")
# pp.pprint(fisher_cols)

In [8]:
# for col, key in zip(base_cols, pattern_dict):
#     if col not in df_merged.columns:
#         df_merged[col] = df_merged[pattern_dict[key]].sum(axis = 1)

# if set(fisher_cols).issubset(df_merged.columns):
#     df_merged = df_merged.dropna(subset = fisher_cols)
#     arr = df_merged[fisher_cols].values.reshape(-1, 2, 2) 
#     pvals = [fisher_exact(table)[1] for table in arr]
#     df_merged[f"{rep}_Pvalue"] = pvals

# df_merged.drop_duplicates()

In [None]:
## BOOKMARK: 11/18/25
## I fixed the pvalue function

## subfolder is split on "-": text before the first hyphen, then the
## text between the first and second hyphen
## (presumably "<genotype>-<fraction>", e.g. "7KO-Cyto" -- confirm)
wt_7ko = subfolder.partition("-")[0]
cyto_nuc = subfolder.split("-", 2)[1]

def calc_pval(df_merged, merged_colnames, rep_list):
    """
    Return a copy of df_merged with per-replicate Fisher's exact test
    p-values comparing the BS and NBS counts.

    NOTES:
    * For each label in rep_list (e.g. "Rep1") the first name in
      merged_colnames containing "<rep>_Deletions_BS" and the first
      containing "<rep>_Deletions_NBS" are used as the deletion counts
    * "<rep>_TotalBases_BS" / "<rep>_TotalBases_NBS" are added as the
      row-wise sum of the "<rep>_{A,C,G,T}_BS" / "..._NBS" columns, unless a
      column with that name already exists
    * A total is NaN when ANY of its base counts is NaN (skipna = False), so
      rows with missing base counts are dropped below instead of being
      tested with a made-up total of 0
    * Rows with a null in any of the four count columns are dropped, then
      fisher_exact runs on the 2x2 table
          [[TotalBases_BS,  Deletions_BS],
           [TotalBases_NBS, Deletions_NBS]]
      and its p-value is stored in "<rep>_Pvalue"
    * Dropping is cumulative: a row removed for one replicate is absent for
      all later replicates, so the result only holds rows that are complete
      for every processed replicate
    * A replicate whose count columns are not all present in the frame is
      skipped without adding a p-value column

    PARAMETERS:
    * df_merged: DataFrame holding the count columns; it is not modified
    * merged_colnames: list of column names to search
    * rep_list: list of replicate labels

    RETURNS:
    * New DataFrame with the TotalBases and Pvalue columns appended

    RAISES:
    * IndexError if a replicate has no matching Deletions column
    * ValueError if a total must be computed but no base columns match
    """
    ## One up-front copy: the caller's frame is never modified, and the
    ## column assignments below do not raise SettingWithCopyWarning
    result = df_merged.copy()

    for rep in rep_list:
        ## Locate the deletion-count column of each treatment
        del_cols = {}
        for treatment in ("BS", "NBS"):
            matches = [col for col in merged_colnames
                       if re.search(f"{rep}_Deletions_{treatment}", col)]
            if not matches:
                raise IndexError(
                    f"No column matching '{rep}_Deletions_{treatment}' "
                    "in merged_colnames"
                )
            del_cols[treatment] = matches[0]

        ## Add the per-treatment base totals that are not there yet
        total_cols = {}
        for treatment in ("BS", "NBS"):
            total_col = f"{rep}_TotalBases_{treatment}"
            total_cols[treatment] = total_col
            if total_col in result.columns:
                continue

            base_pattern = re.compile(fr"{rep}_(A|C|G|T)_{treatment}$")
            base_cols = [col for col in merged_colnames
                         if base_pattern.match(col)]
            if not base_cols:
                raise ValueError(
                    f"No '{rep}_<A|C|G|T>_{treatment}' columns in "
                    f"merged_colnames; cannot compute '{total_col}'"
                )
            ## skipna = False: a missing base count makes the total missing
            result[total_col] = result[base_cols].sum(axis = 1, skipna = False)

        ## Column order defines the 2x2 table after the reshape below
        fisher_cols = [total_cols["BS"],
                       del_cols["BS"],
                       total_cols["NBS"],
                       del_cols["NBS"]]

        ## merged_colnames may name columns the frame does not have
        if not set(fisher_cols).issubset(result.columns):
            continue

        ## Calculate p-values on the complete rows only
        result = result.dropna(subset = fisher_cols)
        tables = result[fisher_cols].to_numpy().reshape(-1, 2, 2)
        result[f"{rep}_Pvalue"] = [fisher_exact(table)[1] for table in tables]

    return result

In [20]:
## Display-only call: the returned frame is not assigned to a name, so
## df_merged itself keeps its original rows and columns
calc_pval(df_merged, merged_colnames, rep_list)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,7KO_StdDeletionRate_NBS,Rep1_TotalBases_BS,Rep1_TotalBases_NBS,Rep1_Pvalue,Rep2_TotalBases_BS,Rep2_TotalBases_NBS,Rep2_Pvalue,Rep3_TotalBases_BS,Rep3_TotalBases_NBS,Rep3_Pvalue
442,rna-NM_001257293.2-2,UUUAA,3UTR,NW_016107298.1,-,1648,1653,1650,381041,2235,...,0.010085,144.0,116.0,0.0277,9.0,130.0,4.790585e-09,118.0,34.0,0.459433
995,rna-NM_001350334.2,UGUAA,3UTR,NC_000021.9,-,1034,1039,1036,32601765,3448,...,0.164531,10.0,19.0,0.463306,0.0,7.0,0.1090909,5.0,2.0,1.0
997,rna-NM_001350335.2,UGUAA,3UTR,NC_000021.9,-,2193,2198,2195,32601765,2294,...,0.164531,10.0,19.0,0.463306,0.0,7.0,0.1090909,5.0,2.0,1.0
998,rna-NM_001350336.2,UGUAA,3UTR,NC_000021.9,-,1006,1011,1008,32601765,3420,...,0.164531,10.0,19.0,0.463306,0.0,7.0,0.1090909,5.0,2.0,1.0
1000,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.164531,10.0,19.0,0.463306,0.0,7.0,0.1090909,5.0,2.0,1.0
1059,rna-NM_001363572.2-2,UUUAA,3UTR,NW_016107298.1,-,1547,1552,1549,381041,2134,...,0.010085,144.0,116.0,0.0277,9.0,130.0,4.790585e-09,118.0,34.0,0.459433
1083,rna-NM_001364225.2-2,UUUAA,3UTR,NW_016107298.1,-,2084,2089,2086,381041,2671,...,0.010085,144.0,116.0,0.0277,9.0,130.0,4.790585e-09,118.0,34.0,0.459433
1086,rna-NM_001364226.2-2,UUUAA,3UTR,NW_016107298.1,-,2343,2348,2345,381041,2930,...,0.010085,144.0,116.0,0.0277,9.0,130.0,4.790585e-09,118.0,34.0,0.459433
1089,rna-NM_001364227.2-2,UUUAA,3UTR,NW_016107298.1,-,1805,1810,1807,381041,2392,...,0.010085,144.0,116.0,0.0277,9.0,130.0,4.790585e-09,118.0,34.0,0.459433
1092,rna-NM_001364228.2-2,UUUAA,3UTR,NW_016107298.1,-,1557,1562,1559,381041,2144,...,0.010085,144.0,116.0,0.0277,9.0,130.0,4.790585e-09,118.0,34.0,0.459433
