In [2]:
from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re
from scipy.stats import fisher_exact

## Class initialization

In [3]:
class FilterTSV:
   """
   PURPOSE:
   * Groups the helpers used to mask, merge and filter per-replicate tables
   * The methods do not read any instance state, so a single instance can be
     reused for every dataframe
   """

   def create_mask(self, df, colnames):
      """
      PURPOSE:
      * Build a boolean row mask that is True only for rows where every
        "Deletions" column is non-zero AND no column in the row is null

      ARGS:
      * df: dataframe to be masked
      * colnames: iterable of column names to search for "Deletions"

      RETURNS:
      * Boolean Series aligned to df.index (use as df.loc[mask])

      NOTES:
      * Select columns that contain "Deletions" and put them in a list
      * dict.fromkeys() removes duplicates while keeping the original order
      * The names must be passed as a list, because pandas does not accept
        a set as a column indexer
      """
      del_list = list(dict.fromkeys(col for col in colnames
                                    if re.search(r"Deletions", col)))
      nonzero_deletions = (df[del_list] != 0).all(axis=1)
      no_nulls = df.notna().all(axis=1)
      return nonzero_deletions & no_nulls

   def merged_output(self, df_merged, rep_list, pattern_dict):
      """
      PURPOSE:
      1. Takes all columns from merged df and organizes them by BS/NBS type
      2. Sums up corresponding bases & creates 2 new columns per replicate
         ({rep}_TotalBases_BS and {rep}_TotalBases_NBS)
      3. Selects the 4 columns of the contingency table
         * Reshapes each row into 2x2 matrix
         * Runs Fisher's Exact Test
         * Appends p-val column ({rep}_Pvalue)

      ARGS:
      * df_merged: merged dataframe; the summed columns are added to it
      * rep_list: replicate prefixes, e.g. ["Rep1", "Rep2"]
      * pattern_dict: maps a label to the list of base columns to sum
        -> Keys named "{rep}_Bases_BS" / "{rep}_Bases_NBS" are used for the
           matching replicate
        -> If such a key is missing, the 1st entry is used for BS and the
           2nd entry for NBS (original positional behaviour)

      RETURNS:
      * Dataframe with the new columns. Rows with nulls in the contingency
        columns are dropped, so callers must use the returned dataframe
        rather than the one they passed in

      RAISES:
      * Re-raises any exception after printing the replicate and traceback
      """
      ## Defined up front so the error message works even if the failure
      ## happens before the first replicate is reached
      rep = None
      try:
         positional_keys = list(pattern_dict)

         for rep in rep_list:
            ## Define names of summed BS/NBS columns
            new_cols = [f"{rep}_TotalBases_BS",
                        f"{rep}_TotalBases_NBS"]

            ## Define generic entries for 2x2 contingency table
            fisher_cols = [f"{rep}_TotalBases_BS",
                           f"{rep}_Deletions_BS",
                           f"{rep}_TotalBases_NBS",
                           f"{rep}_Deletions_NBS"]

            """
            PART I: For each replicate, find total bases for BS/NBS
            * Prefer the pattern_dict entry that belongs to this replicate
            * Otherwise fall back to the entry at the same position
            * Existing columns are never overwritten
            """
            for position, (col, sample) in enumerate(zip(new_cols, ("BS", "NBS"))):
               if col in df_merged.columns:
                  continue
               key = f"{rep}_Bases_{sample}"
               if key not in pattern_dict:
                  if position >= len(positional_keys):
                     continue
                  key = positional_keys[position]
               df_merged[col] = df_merged[pattern_dict[key]].sum(axis=1)

            """
            PART II: Run Fisher's Exact Test using 2x2 table
            * For each row in df_merged:
              -> Select the specified columns from fisher_cols
              -> Reshape the 4 columns into a 2x2 table
            * Run Fisher's Exact Test (scipy) on each table, then select second
              result of test AKA the p-val using 'fisher_exact(table)[1]'
            """
            if set(fisher_cols).issubset(df_merged.columns):
               ## copy() so the new column is written to an independent frame
               df_merged = df_merged.dropna(subset = fisher_cols).copy()
               arr = df_merged[fisher_cols].to_numpy().reshape(-1, 2, 2)
               df_merged[f"{rep}_Pvalue"] = [fisher_exact(table)[1]
                                             for table in arr]
         return df_merged
      except Exception as e:
         print(f"Failed to calculate p-value for {rep}: {e}")
         traceback.print_exc()
         raise

   def filter_means(self, df_filtered, colname, cols):
      """
      PURPOSE:
      * Use to filter by average (Cutoffs #4-6)

      ARGS:
      * df_filtered: dataframe to summarise (left unmodified)
      * colname: name of the new average column; "Avg" is replaced by "Std"
        to name the standard-deviation column
      * cols: replicate columns that are averaged row-wise

      RETURNS:
      * Copy of df_filtered with the average and standard-deviation columns
        added, sorted by the average in descending order
      * If colname contains "_BS", rows below the cutoff are dropped:
        -> "DeletionCt": average >= 5
        -> "DeletionRate": average >= 0.02
      * The average column keeps its numeric values so later cutoffs can
        compare BS against NBS
      """
      ## Work on a copy so a slice of another dataframe is never written to
      df_filtered = df_filtered.copy()

      ## Calculate average and standard deviation
      df_filtered[colname] = df_filtered[cols].mean(axis = 1)
      std_colname = colname.replace("Avg", "Std")
      df_filtered[std_colname] = df_filtered[cols].std(axis = 1)

      ## Sort by descending average
      df_filtered = df_filtered.sort_values(by = colname,
                                            ascending = False)

      ## If BS, keep only rows whose average passes the cutoff
      if "_BS" in colname:
         if "DeletionCt" in colname:
            df_filtered = df_filtered.loc[df_filtered[colname].ge(5)]
         elif "DeletionRate" in colname:
            df_filtered = df_filtered.loc[df_filtered[colname].ge(0.02)]

      return df_filtered

   def filtered_output(self, df_merged, rep_list):
      """
      PURPOSE:
      * Applies cutoffs from BID-Pipe protocol, in this order:
         1. Pvalue across all replicates <= 0.0001
         2. RealRate across all replicates >= 0.3
         3. Total sequencing coverage for each BS and NBS replicate >= 20
         4. Average Deletions across all BS replicates >= 5
         5. Average DeletionRate across all BS replicates >= 0.02
         6. Average DeletionRate is 2x higher in BS compared to NBS

      ARGS:
      * df_merged: dataframe containing the Pvalue, RealRate, TotalBases,
        Deletions and DeletionRate columns of every replicate
      * rep_list: replicate prefixes, e.g. ["Rep1", "Rep2"]

      RETURNS:
      * Dataframe with only the rows that pass every cutoff, plus the
        Avg*/Std* columns added by filter_means()

      RAISES:
      * Re-raises any exception after printing the message and traceback
      """
      try:
         ## Cutoff 1: Pvalue
         pval_list = [col for col in df_merged.columns
                      if re.search(r"Pvalue$", col)]
         cutoff1 = df_merged[pval_list].le(0.0001).all(axis=1)
         df_filtered = df_merged.loc[cutoff1]

         ## Cutoff 2: RealRate
         realrate_list = [col for col in df_filtered.columns
                          if re.search(r"RealRate", col)]
         cutoff2 = df_filtered[realrate_list].ge(0.3).all(axis=1)
         df_filtered = df_filtered.loc[cutoff2]

         ## Cutoff 3: Total sequencing coverage
         ## (coverage = TotalBases + Deletions of one replicate/sample pair)
         for rep in rep_list:
            for sample in ["BS", "NBS"]:
               coverage_list = [col for col in df_filtered.columns if
                                re.match(fr"{rep}_(TotalBases|Deletions)_{sample}", col)]
               total_sum = df_filtered[coverage_list].sum(axis = 1)
               cutoff3 = total_sum.ge(20)
               df_filtered = df_filtered.loc[cutoff3]

         ## Cutoff 4: Average Deletions (BS)
         ## NOTE: "$" anchors the end of the name; it cannot take a quantifier
         avg_del_bs = "AvgDeletionCt_BS"
         del_col_bs = [col for col in df_filtered.columns
                       if re.search(r"_Deletions_BS$", col)]
         df_filtered = self.filter_means(df_filtered, avg_del_bs, del_col_bs)

         ## Cutoff 5: Average DeletionRate (BS)
         avg_dr_bs = "AvgDeletionRate_BS"
         dr_col_bs = [col for col in df_filtered.columns
                      if re.search(r"_DeletionRate_BS$", col)]
         df_filtered = self.filter_means(df_filtered, avg_dr_bs, dr_col_bs)

         ## Cutoff 6: Average DeletionRate is 2x higher in BS compared to NBS
         avg_dr_nbs = "AvgDeletionRate_NBS"
         dr_col_nbs = [col for col in df_filtered.columns
                       if re.search(r"_DeletionRate_NBS$", col)]
         df_filtered = self.filter_means(df_filtered, avg_dr_nbs, dr_col_nbs)

         cutoff6 = df_filtered[avg_dr_bs] >= 2 * df_filtered[avg_dr_nbs]
         df_filtered = df_filtered[cutoff6]

         print("Successfully applied cutoffs.")

         return df_filtered

      except Exception as e:
         print(f"Failed to apply cutoffs from BID-Pipe protocol: {e}")
         traceback.print_exc()
         raise

## Instantiate the helper class so its methods can be called as filtertsv.<method>()
filtertsv = FilterTSV()

## Loading in data

In [4]:
## Locate the "calculations" folder inside the current working directory
current_path = Path.cwd()
input_dir = current_path.joinpath("calculations")

## Helper instance whose methods are exercised below
filtertsv = FilterTSV()

## Testing helper functions in FilterTSV()

### Testing create_mask()

In [5]:
## Testing del_list variable
## Fixture rows, in order: a row, its exact duplicate, a row with nulls,
## a row with one zero deletion count, and a row with two zero deletion counts

fixture_columns = ["Chrom",
                   "GenomicModBase",
                   "Rep1_A_BS",
                   "Rep1_C_BS",
                   "Rep1_G_BS",
                   "Rep1_T_BS",
                   "Rep1_Deletions_BS",
                   "Rep2_Deletions_BS"]

fixture_rows = [("NW233434", 231, 4, 1, 1, 1, 2, 5),
                ("NW233434", 231, 4, 1, 1, 1, 2, 5),
                ("NW233435", 331, None, None, None, None, 3, 7),
                ("NW233433", 230, 3, 1, 1, 1, 0, 5),
                ("NW233432", 229, 2, 6, 4, 7, 0, 0)]

## One dict per row, keyed by column name
random_list = [dict(zip(fixture_columns, row)) for row in fixture_rows]
random_df = pd.DataFrame(random_list)

random_df

Unnamed: 0,Chrom,GenomicModBase,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep2_Deletions_BS
0,NW233434,231,4.0,1.0,1.0,1.0,2,5
1,NW233434,231,4.0,1.0,1.0,1.0,2,5
2,NW233435,331,,,,,3,7
3,NW233433,230,3.0,1.0,1.0,1.0,0,5
4,NW233432,229,2.0,6.0,4.0,7.0,0,0


In [6]:
## Drop rows that are exact duplicates of an earlier row (first occurrence is kept)
random_df = random_df.drop_duplicates()
random_df

Unnamed: 0,Chrom,GenomicModBase,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep2_Deletions_BS
0,NW233434,231,4.0,1.0,1.0,1.0,2,5
2,NW233435,331,,,,,3,7
3,NW233433,230,3.0,1.0,1.0,1.0,0,5
4,NW233432,229,2.0,6.0,4.0,7.0,0,0


In [88]:
#### CHANGE: added list()
## Unique column names that contain "Deletions", converted to a list for indexing
del_list = list({col for col in random_df.columns if re.search("Deletions", col)})

del_list_report = ["NOW TESTING DEL_LIST VARIABLE (order in list doesn't matter)",
                   "Expected: ['Rep2_Deletions_BS', 'Rep1_Deletions_BS']",
                   f"Actual: {del_list}"]
print(*del_list_report, sep = "\n")

## Testing mask variable
#### CHANGE: fixed mask
## Keep a row only if no Deletions column equals 0 and no column is null
has_zero_deletion = (random_df[del_list] == 0).any(axis = 1)
is_complete = random_df.notna().all(axis = 1)
mask = ~has_zero_deletion & is_complete

NOW TESTING DEL_LIST VARIABLE (order in list doesn't matter)
Expected: ['Rep2_Deletions_BS', 'Rep1_Deletions_BS']
Actual: ['Rep2_Deletions_BS', 'Rep1_Deletions_BS']


In [89]:
## Show the de-duplicated dataframe before the mask is applied, for comparison
print("\nNOW PRINTING ORIGINAL RANDOM_DF")
random_df


NOW PRINTING ORIGINAL RANDOM_DF


Unnamed: 0,Chrom,GenomicModBase,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep2_Deletions_BS
0,NW233434,231,4.0,1.0,1.0,1.0,2,5
2,NW233435,331,,,,,3,7
3,NW233433,230,3.0,1.0,1.0,1.0,0,5
4,NW233432,229,2.0,6.0,4.0,7.0,0,0


In [90]:
## Apply the corrected mask and display the surviving rows
filtered_banner = ("\nNOW PRINTING FILTERED RANDOM_DF",
                   "Expected: Rows with 0 deletions or nulls should be removed",
                   "Actual:")
print(*filtered_banner, sep = "\n")

good_df = random_df[mask]
good_df


NOW PRINTING FILTERED RANDOM_DF
Expected: Rows with 0 deletions or nulls should be removed
Actual:


Unnamed: 0,Chrom,GenomicModBase,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep2_Deletions_BS
0,NW233434,231,4.0,1.0,1.0,1.0,2,5


In [None]:
## Negative control: index random_df with the boolean frame first, then
## reduce with all() -- the result is compared against good_df below
masked_values = random_df[random_df[del_list] != 0]
bad_mask = masked_values.all(axis = 1) & random_df.notna().all(axis = 1)
bad_df = random_df.loc[bad_mask]

## bad_df is expected to still contain a 0 in the Deletions columns
bad_has_zero = 0 in bad_df[del_list].values
print("EXPECTED: Failed to filter out DeletionRate == 0 in bad_df"
      if bad_has_zero
      else "NOT EXPECTED: Successfully filtered out DeletionRate == 0 in bad_df")

## good_df is expected to contain no 0 in the Deletions columns
good_has_zero = 0 in good_df[del_list].values
print("NOT EXPECTED: Failed to filter out DeletionRate == 0 in good_df"
      if good_has_zero
      else "EXPECTED: Successfully filtered out DeletionRate == 0 in good_df")

EXPECTED: Failed to filter out DeletionRate == 0 in bad_df
EXPECTED: Successfully filtered out DeletionRate == 0 in good_df


## Testing iterative merging

In [13]:
## Verifying tsv_folder and tsv_list
for subfolder in input_dir.iterdir():
    tsv_folder = input_dir/subfolder/"individual_tsv"

    print(f"\nNOW PRINTING TSV_FOLDER FOR SUBFOLDER: {subfolder.name}", 
          "Expected: Directory for individual_tsv folder", 
          f"Actual: {tsv_folder}", sep = "\n")
    
    if subfolder.is_dir():
        tsv_list = sorted(
            tsv_folder.glob("*.tsv"),
            key = lambda x: int(re.search(r"Rep(\d+)", x.name).group(1))
        )

        print(f"\nNOW PRINTING TSV_LIST FOR SUBFOLDER: {subfolder.name}")
        if (subfolder.name) == "7KO-Cyto":
            print("Expected amount of files: 2")
        else:
            print("Expected amount of files: 1")
        print(f"Actual: {len(tsv_list)}")


NOW PRINTING TSV_FOLDER FOR SUBFOLDER: 7KO-Cyto
Expected: Directory for individual_tsv folder
Actual: c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Cyto\individual_tsv

NOW PRINTING TSV_LIST FOR SUBFOLDER: 7KO-Cyto
Expected amount of files: 2
Actual: 2

NOW PRINTING TSV_FOLDER FOR SUBFOLDER: WT-Cyto
Expected: Directory for individual_tsv folder
Actual: c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\WT-Cyto\individual_tsv

NOW PRINTING TSV_LIST FOR SUBFOLDER: WT-Cyto
Expected amount of files: 1
Actual: 1


In [14]:
## For simplicity's sake, we'll stick to the 7KO-Cyto subfolder for testing
tsv_folder = input_dir/"7KO-Cyto/individual_tsv"

tsv_list = sorted(
    tsv_folder.glob("*.tsv"),
    key = lambda x: int(re.search(r"Rep(\d+)", x.name).group(1))
)

df_list = [pd.read_csv(str(file), sep = "\t") for file in tsv_list]
df1_colnames = df_list[0].columns.tolist()

print(df1_colnames)

['TranscriptID', 'Motif', 'Region', 'Chrom', 'Strand', 'TranscriptPosStart', 'TranscriptPosEnd', 'TranscriptModBase', 'GenomicModBase', 'TranscriptLength', 'DistFromAUG', 'DistFromSTOP', 'DistFromExonStart', 'DistFromExonEnd', 'fit_c', 'fit_s', 'fit_b', 'Rep1_A_BS', 'Rep1_C_BS', 'Rep1_G_BS', 'Rep1_T_BS', 'Rep1_Deletions_BS', 'Rep1_DeletionRate_BS', 'Rep1_RealRate_BS']


In [15]:
# selected_colnames = df1_colnames[0:17]
# init_mask = filtertsv.create_mask(df_list[0], df1_colnames)
# # df_full = df_list[0].loc[init_mask]

In [16]:

    # df1_colnames = df_list[0].columns.tolist()
    # selected_colnames = df1_colnames[0:17]
    # init_mask = filtertsv.create_mask(df_list[0], df1_colnames)
    # df_full = df_list[0].loc[init_mask]

    # """
    # PART II: Iteratively merge remaining dfs
    # * The range [1:] means 'Start from 2nd item in list, and continue 
    #     looping until you reach the end'
    # * Output is df_full, which contains rows from all merged dfs
    # """
    # for i in df_list[1:]:
    #     colnames = df_list[i].columns.tolist()
    #     mask = filtertsv.create_mask(df_list[i], colnames)
    #     df_list[i] = df_list[i].loc[mask]
    #     df_full = pd.merge(df_full, df_list[i], on = selected_colnames, how = "outer")

    # ## Sort column names
    # """
    # PART I: Collect all column names
    # """
    # merged_colnames = df_full.columns.tolist()

    # """
    # PART II: Sort by replicate order
    # * Select columns that contain "Rep\d+", such as Rep1, Rep2, Rep3, etc.,
    #     and put them in a list
    # * Use set() to remove duplicates, since sets can only contain unique vals
    # * On those columns, select the first RegEx match group. In this case,
    #     it'd be "\d+" or the digit, such as 1, 2, 3, etc.
    # * Sort columns by those digits in ascending order using sorted() 
    # """
    # rep_list = sorted(
    #     set([re.search(r"(Rep\d+)", col).group(1) for col in merged_colnames 
    #         if re.search(r"(Rep\d+)", col)]), 
    #         key = lambda x: int(re.search(r"Rep(\d+)", x).group(1))
    # )

    # """
    # NOTES:
    # * Drop nulls to ensure deletion rates for all 3 replicates
    # * Calculate p-vals with Fisher's Exact Test
    # * Sort DeletionRate by descending order
    # * Keep only first 50 rows
    # """
    # ## Group corresponding BS/NBS into separate lists
    # df_merged = df_full.dropna()
    # for rep in rep_list:
    #     bs_base_pattern = re.compile(fr"{rep}_(A|C|G|T)_BS$")
    #     nbs_base_pattern = re.compile(fr"{rep}_(A|C|G|T)_NBS$")
    #     pattern_dict = {f"{rep}_Bases_BS": [col for col in merged_colnames 
    #                                         if bs_base_pattern.match(col)],
    #                     f"{rep}_Bases_NBS": [col for col in merged_colnames 
    #                                         if nbs_base_pattern.match(col)]}

    # ## Run Fisher's Exact Test (p-values)
    # filtertsv.merged_output(df_merged, rep_list, pattern_dict)