In [1]:
## use RNA-STAR conda environment
from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re
from scipy.stats import fisher_exact

### Functions

In [2]:
def create_mask(df, colnames):
    """
    Build a boolean row mask for `df`.

    NOTES:
    * Deletion columns are the unique names in `colnames` that contain
      "Deletions" (a set comprehension removes duplicates)
    * A row is flagged True only when none of its deletion columns equals 0
      AND the row has no nulls in any column of `df`
    * Use the returned mask with df.loc[mask] to keep the flagged rows
    """
    deletion_cols = list({name for name in colnames if re.search(r"Deletions", name)})

    ## Rows where at least one deletion column is exactly 0
    has_zero_deletion = df[deletion_cols].eq(0).any(axis = 1)
    ## Rows where every column of df is populated
    is_complete = df.notna().all(axis = 1)

    return is_complete & ~has_zero_deletion

def calc_pval(df_merged, merged_colnames, rep_list):
    """
    Add a per-replicate Fisher's exact test p-value (BS vs. NBS) to a dataframe.

    NOTES:
    * For every replicate label in rep_list (e.g. "Rep1") one 2x2 table is
      built per row:
          [[{rep}_TotalBases_BS,  {rep}_Deletions_BS ],
           [{rep}_TotalBases_NBS, {rep}_Deletions_NBS]]
      and the p-value returned by fisher_exact is stored in "{rep}_Pvalue"
    * {rep}_TotalBases_BS/NBS is the row-wise sum of the {rep}_A/C/G/T_BS/NBS
      columns; it is only derived when the column does not exist yet
    * Rows with a null in any of the four table columns are dropped, so the
      returned dataframe only holds rows that are complete for every
      replicate processed
    * The input dataframe is NOT modified; a new dataframe is returned

    PARAMETERS:
    * df_merged: dataframe holding the count columns
    * merged_colnames: list of column names searched for the deletion and
      per-base columns
    * rep_list: list of replicate labels

    RAISES:
    * IndexError if no BS or NBS deletion column matches a replicate
    * ValueError if a TotalBases column has to be derived but no per-base
      column matches the replicate
    """
    ## Work on a copy so the caller's dataframe is never mutated in place
    result = df_merged.copy()

    for rep in rep_list:
        ## Escape the label so it is matched literally inside the regexes
        rep_re = re.escape(rep)

        bs_del_col = [col for col in merged_colnames
                      if re.search(f"{rep_re}_Deletions_BS", col)]
        nbs_del_col = [col for col in merged_colnames
                       if re.search(f"{rep_re}_Deletions_NBS", col)]
        if not bs_del_col or not nbs_del_col:
            raise IndexError(
                f"No {rep}_Deletions_BS and/or {rep}_Deletions_NBS column "
                "found in merged_colnames"
            )

        ## Map each total column to the per-base columns it is summed from
        ## (.match anchors at the start, $ at the end: exact-name matches only)
        bs_base_pattern = re.compile(fr"{rep_re}_(A|C|G|T)_BS$")
        nbs_base_pattern = re.compile(fr"{rep_re}_(A|C|G|T)_NBS$")
        base_sources = {
            f"{rep}_TotalBases_BS": [col for col in merged_colnames
                                     if bs_base_pattern.match(col)],
            f"{rep}_TotalBases_NBS": [col for col in merged_colnames
                                      if nbs_base_pattern.match(col)],
        }
        base_cols = list(base_sources)
        del_cols = [bs_del_col[0], nbs_del_col[0]]

        ## Column order defines the 2x2 contingency table after reshape
        fisher_cols = [base_cols[0],
                       del_cols[0],
                       base_cols[1],
                       del_cols[1]]

        for total_col, source_cols in base_sources.items():
            if total_col in result.columns:
                continue
            if not source_cols:
                raise ValueError(
                    f"Cannot derive {total_col}: no per-base column for "
                    f"{rep} found in merged_colnames"
                )
            ## min_count = 1: a row with no base counts at all stays null
            ## (and is dropped below) instead of silently becoming 0
            result[total_col] = result[source_cols].sum(axis = 1, min_count = 1)

        ## Calculate p-values
        if set(fisher_cols).issubset(result.columns):
            complete = result.dropna(subset = fisher_cols)
            tables = complete[fisher_cols].to_numpy().reshape(-1, 2, 2)
            pvals = [fisher_exact(table)[1] for table in tables]
            ## assign() returns a new frame, so nothing is written to a slice
            result = complete.assign(**{f"{rep}_Pvalue": pvals})

    return result

### Loading in data

In [3]:
## All locations are resolved relative to the directory the notebook runs from
current_path = Path.cwd()
subfolder = "7KO-Cyto"
input_dir = current_path.joinpath("merged")
processed_folder = current_path.joinpath("pvals", subfolder)

In [4]:
## Sort the TSV paths so the position of each dataframe in df_list is stable
tsv_list = sorted(input_dir.glob("*.tsv"))
df_list = []
for tsv_path in tsv_list:
    df_list.append(pd.read_csv(str(tsv_path), sep = "\t"))

### Dataframe merging

In [5]:
## Merge keys: the first 17 columns of the first table
df1_colnames = df_list[0].columns.tolist()
selected_colnames = df1_colnames[:17]

## Keep only the rows of the first table that pass create_mask
init_mask = create_mask(df_list[0], df1_colnames)
df = df_list[0][init_mask]

## Outer merge: rows present in only one of the two tables are retained
df_merged = df.merge(df_list[1], how = "outer", on = selected_colnames)

In [6]:
merged_colnames = df_merged.columns.tolist()

## Collect the distinct "Rep<N>" labels found in the column names and order
## them by their number (so e.g. Rep10 would sort after Rep2)
rep_pattern = re.compile(r"(Rep\d+)")
rep_hits = (rep_pattern.search(col) for col in merged_colnames)
rep_list = sorted(
    {hit.group(1) for hit in rep_hits if hit},
    key = lambda label: int(label[len("Rep"):])
)

rep_list

['Rep1', 'Rep2', 'Rep3']

### Calculating p-values (DEBUG)

In [8]:
rep = "Rep1"

## Split the subfolder label on "-" into its two parts
subfolder_parts = subfolder.split("-")
wt_7ko = subfolder_parts[0]
cyto_nuc = subfolder_parts[1]

## Column names containing the replicate's BS / NBS deletion label
bs_del_regex = re.compile(f"{rep}_Deletions_BS")
nbs_del_regex = re.compile(f"{rep}_Deletions_NBS")
bs_del_col = list(filter(bs_del_regex.search, merged_colnames))
nbs_del_col = list(filter(nbs_del_regex.search, merged_colnames))

print(f"Expected: 7KO, Actual: {wt_7ko}")
print(f"Expected: Cyto, Actual: {cyto_nuc}")
print(f"Expected: Rep1_Deletions_BS, Actual: {bs_del_col[0]}")
print(f"Expected: Rep1_Deletions_NBS, Actual: {nbs_del_col[0]}")

Expected: 7KO, Actual: 7KO
Expected: Cyto, Actual: Cyto
Expected: Rep1_Deletions_BS, Actual: Rep1_Deletions_BS
Expected: Rep1_Deletions_NBS, Actual: Rep1_Deletions_NBS


In [9]:
## Exact-name patterns for the four per-base count columns of the replicate
bs_base_pattern = re.compile(fr"{rep}_(A|C|G|T)_BS$")
nbs_base_pattern = re.compile(fr"{rep}_(A|C|G|T)_NBS$")

## .match anchors at the start of the name, so prefixed columns are excluded
pattern_dict = {
    f"{rep}_Bases_BS": list(filter(bs_base_pattern.match, merged_colnames)),
    f"{rep}_Bases_NBS": list(filter(nbs_base_pattern.match, merged_colnames)),
}

pattern_dict

{'Rep1_Bases_BS': ['Rep1_A_BS', 'Rep1_C_BS', 'Rep1_G_BS', 'Rep1_T_BS'],
 'Rep1_Bases_NBS': ['Rep1_A_NBS', 'Rep1_C_NBS', 'Rep1_G_NBS', 'Rep1_T_NBS']}

In [10]:
## Names of the total-bases columns and the first matching deletion columns
base_cols = [f"{rep}_TotalBases_BS", f"{rep}_TotalBases_NBS"]
del_cols = [matches[0] for matches in (bs_del_col, nbs_del_col)]

print("Expected: 2 items in list, Actual:", base_cols)
print("Expected: 2 items in list, Actual:", del_cols)

Expected: 2 items in list, Actual: ['Rep1_TotalBases_BS', 'Rep1_TotalBases_NBS']
Expected: 2 items in list, Actual: ['Rep1_Deletions_BS', 'Rep1_Deletions_NBS']


In [11]:
## Interleave totals and deletions: the BS pair first, then the NBS pair
fisher_cols = [name for pair in zip(base_cols, del_cols) for name in pair]

fisher_cols

['Rep1_TotalBases_BS',
 'Rep1_Deletions_BS',
 'Rep1_TotalBases_NBS',
 'Rep1_Deletions_NBS']

In [18]:
## Derive each missing TotalBases column as the row-wise sum of its base columns
## (base_cols and pattern_dict are paired by position)
for col, key in zip(base_cols, pattern_dict):
    if col in df_merged.columns:
        continue
    df_merged[col] = df_merged[pattern_dict[key]].sum(axis = 1)

In [None]:
## Uncomment the one you want to see
# df_merged[pattern_dict["Rep1_Bases_BS"]].dropna()
# df_merged[pattern_dict["Rep1_Bases_NBS"]].dropna()

Unnamed: 0,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS
29,332.0,0.0,0.0,0.0
243,4233.0,0.0,0.0,0.0
277,106.0,1.0,0.0,0.0
499,332.0,0.0,0.0,0.0
500,332.0,0.0,0.0,0.0
...,...,...,...,...
1082,0.0,211.0,0.0,1880.0
1123,332.0,0.0,0.0,0.0
1174,106.0,1.0,0.0,0.0
1456,332.0,0.0,0.0,0.0


In [None]:
### Calculate p-values
## Derive each missing TotalBases column as the row-wise sum of its base columns
for col, key in zip(base_cols, pattern_dict):
    if col not in df_merged.columns:
        df_merged[col] = df_merged[pattern_dict[key]].sum(axis = 1)

if set(fisher_cols).issubset(df_merged.columns):
    ## .copy() detaches the filtered rows from the frame they were taken from,
    ## so the p-value column below is written to an independent dataframe
    ## instead of to a slice (avoids pandas' chained-assignment warning)
    df_merged = df_merged.dropna(subset = fisher_cols).copy()
    ## One 2x2 table per row, in fisher_cols order
    arr = df_merged[fisher_cols].values.reshape(-1, 2, 2)
    pvals = [fisher_exact(table)[1] for table in arr]
    df_merged[f"{rep}_Pvalue"] = pvals

In [48]:
## calc_pval accepts exactly three arguments: (df_merged, merged_colnames, rep_list)
calc_pval(df_merged, merged_colnames, rep_list)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,7KO_StdDeletionRate_NBS,7KO_Cyto_Rep1_TotalBases_BS,7KO_Cyto_Rep1_TotalBases_NBS,Rep1_Pvalue,7KO_Cyto_Rep2_TotalBases_BS,7KO_Cyto_Rep2_TotalBases_NBS,Rep2_Pvalue,7KO_Cyto_Rep3_TotalBases_BS,7KO_Cyto_Rep3_TotalBases_NBS,Rep3_Pvalue


In [None]:
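## FIXME: the output above contains "7KO_Cyto_"-prefixed TotalBases columns,
## but the calc_pval defined at the top of this notebook names them
## "{rep}_TotalBases_BS/NBS" -- so this output presumably came from a stale
## definition still held by the kernel. Restart the kernel and run all cells.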