In [3]:
## test file paired with KIRA_v2_clean_tsv.py

from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re
from scipy.stats import fisher_exact

## Class initialization

In [5]:
class FilterTSV:
   def create_mask(self, df, colnames):
      """
      NOTES:
      * Select columns that contain "Deletions" and put them in a list
      * Use set() to remove duplicates, since sets can only contain unique vals
      * Pass column names in list to dataframe to create a mask that drops rows
        where Deletions == 0 and there are nulls
      """
      del_list = list(set([col for col in colnames if re.search(r"Deletions", col)]))
      mask = ~(df[del_list] == 0).any(axis = 1) & (df.notna().all(axis = 1))
      return mask

   def merge_reps(self, suffix, tsv_list, subfolder, reps_dir):
      """
      1. Search TSVs for matching suffix in filename
      2. Put them in list
      3. Read in as pandas dataframes
      """
      matches = [tsv for tsv in tsv_list if re.search(suffix, tsv)]
      df_list = {pd.read_csv(str(file), sep = "\t") for file in matches}

      """
      Copy + paste iterative merging code from original clean_tsv
      because there are 3 replicates
      """
      df1_colnames = df_list[0].columns.tolist()
      selected_colnames = df1_colnames[0:17]
      init_mask = self.create_mask(df_list[0]. df1_colnames)
      merged = df_list[0].loc[init_mask]

      for df in df_list[1:]:
         if not df.empty:
            colnames = df.columns.tolist()
            mask = self.create_mask(df, colnames)
            df = df.loc[mask]
            merged = pd.merge(merged, df,
                              on = selected_colnames,
                              how = "outer")
      
      """
      1. Define col_start and col_end so that concatenation
         results in examples like:
         a. 7KO_AvgDeletionRate_BS
         b. 7KO_StdDeletionRate_BS
      2. Create AvgDeletionRate and StdDeletionRate columns
         in merged df
      """
      col_start = subfolder.name.split("-")[0]
      col_end = suffix.split("-")[0]
      avg_col = col_start + "_AvgDeletionRate_" + col_end
      std_col = col_start + "_StdDeletionRate_" + col_end

      calc_merged = self.calc_avg_sd(merged, avg_col, std_col)

      """
      Save merged dataframe as TSV
      """
      merged_dir = reps_dir/f"{subfolder.name}{suffix}.tsv"
      calc_merged.to_csv(merged_dir, sep = "\t", index = False)

   def merge_WT_7KO(matching_name, merged_reps_tsv, wt_7ko_dir):
      matches = [tsv for tsv in merged_reps_tsv if re.search(matching_name, tsv)]
      df_list = {pd.read_csv(str(file), sep = "\t") for file in matches}

      """
      1. Ensure 7KO is merged with WT, so WT columns appear first
      2. If either dataframe is not empty, then merge w/ inner join
      3. No need to iteratively merge because there are only 2 files
      """
      first_cols = df_list[0].columns

      if re.search("WT", first_cols):
         df1 = df_list[0]
         df2 = df_list[1]
      else:
         df1 = df_list[1]
         df2 = df_list[0]
      
      selected_colnames = df1[0:17]

      if not df1.empty and df2.empty:
         merged = pd.merge(df1, df2, on = selected_colnames, how = "inner")
      elif df1.empty:
         merged = df2
      else:
         merged = df1
      
      """
      1. Create output name
         e.g., 7KO-Cyto-BS -> Cyto-BS
      2. Save merged dataframe as TSV
      """
      separator = "-"
      base = (matches[0].stem).split(separator) ## Obtain ['7KO', 'Cyto', 'BS']
      output_name = separator.join(item for item in base[1:]) ## Obtain Cyto-BS
      
      merged_dir = wt_7ko_dir/f"{output_name}.tsv"
      merged.to_csv(merged_dir, sep = "\t", index = False)

   def merge_BS_NBS(fraction, merged_wt_7ko_tsv, bs_nbs_dir):
      matches = [tsv for tsv in merged_wt_7ko_tsv if re.search(fraction, tsv)]
      df_list = {pd.read_csv(str(file), sep = "\t") for file in matches}

      """
      1. Ensure NBS is merged with BS, so BS columns appear first
      2. If either dataframe is not empty, then merge w/ inner join
      3. No need to iteratively merge because there are only 2 files
      """
      first_cols = df_list[0].columns

      if re.search("_BS", first_cols):
         df1 = df_list[0]
         df2 = df_list[1]
      else:
         df1 = df_list[1]
         df2 = df_list[0]
      
      selected_colnames = df1[0:17]

      if not df1.empty and df2.empty:
         merged = pd.merge(df1, df2, on = selected_colnames, how = "inner")
      elif df1.empty:
         merged = df2
      else:
         merged = df1
      
      """
      1. Create output name
         e.g., Cyto-BS -> Cyto
      2. Save merged dataframe as TSV
      """
      output_name = (matches[0].stem).split("-")[0]
      merged_dir = bs_nbs_dir/f"{output_name}.tsv"
      merged.to_csv(merged_dir, sep = "\t", index = False)

## Loading in data

In [None]:
current_path = Path.cwd()
input_dir = current_path/"calculations"
filtertsv = FilterTSV()

In [7]:
## Verifying tsv_folder and tsv_list
for subfolder in input_dir.iterdir():
    tsv_folder = input_dir/subfolder/"individual_tsv"

    print(f"\nNOW PRINTING TSV_FOLDER FOR SUBFOLDER: {subfolder.name}", 
          "Expected: Directory for individual_tsv folder", 
          f"Actual: {tsv_folder}", sep = "\n")
    
    if subfolder.is_dir():
        tsv_list = sorted(
            tsv_folder.glob("*.tsv"),
            key = lambda x: int(re.search(r"Rep(\d+)", x.name).group(1))
        )

        print(f"\nNOW PRINTING TSV_LIST FOR SUBFOLDER: {subfolder.name}")
        if (subfolder.name) == "7KO-Cyto":
            print("Expected amount of files: 6")
        else:
            print("Expected amount of files: 6")
        print(f"Actual: {len(tsv_list)}")


NOW PRINTING TSV_FOLDER FOR SUBFOLDER: 7KO-Cyto
Expected: Directory for individual_tsv folder
Actual: c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Cyto\individual_tsv

NOW PRINTING TSV_LIST FOR SUBFOLDER: 7KO-Cyto
Expected amount of files: 6
Actual: 6

NOW PRINTING TSV_FOLDER FOR SUBFOLDER: 7KO-Nuc
Expected: Directory for individual_tsv folder
Actual: c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv

NOW PRINTING TSV_LIST FOR SUBFOLDER: 7KO-Nuc
Expected amount of files: 6
Actual: 6


## Testing helper functions in FilterTSV()

### merge_reps()

In [None]:
### NOTE: Not working as expected! Should only return paths with BS, not NBS
reps_dir = current_path/"merged_reps"
suffix = "BS"
matches = [tsv for tsv in tsv_list if re.search(suffix, tsv.stem)]
# TODO
# filtertsv.merge_reps(suffix, tsv_list, subfolder, reps_dir)

[WindowsPath('c:/Users/Sonia Ling/Desktop/calculate_dr/clean_tsv/testing/calculations/7KO-Nuc/individual_tsv/KEH-Rep1-7KO-HEK293T-Nuc-BS.sorted.tsv'),
 WindowsPath('c:/Users/Sonia Ling/Desktop/calculate_dr/clean_tsv/testing/calculations/7KO-Nuc/individual_tsv/KEH-Rep1-7KO-HEK293T-Nuc-NBS.sorted.tsv'),
 WindowsPath('c:/Users/Sonia Ling/Desktop/calculate_dr/clean_tsv/testing/calculations/7KO-Nuc/individual_tsv/KEH-Rep2-7KO-HEK293T-Nuc-BS.sorted.tsv'),
 WindowsPath('c:/Users/Sonia Ling/Desktop/calculate_dr/clean_tsv/testing/calculations/7KO-Nuc/individual_tsv/KEH-Rep2-7KO-HEK293T-Nuc-NBS.sorted.tsv'),
 WindowsPath('c:/Users/Sonia Ling/Desktop/calculate_dr/clean_tsv/testing/calculations/7KO-Nuc/individual_tsv/KEH-Rep3-7KO-HEK293T-Nuc-BS.sorted.tsv'),
 WindowsPath('c:/Users/Sonia Ling/Desktop/calculate_dr/clean_tsv/testing/calculations/7KO-Nuc/individual_tsv/KEH-Rep3-7KO-HEK293T-Nuc-NBS.sorted.tsv')]

def merge_reps(tsv_list, subfolder, reps_dir):
    """
    1. Search TSVs for matching suffix in filename
    2. Put them in list
    3. Read in as pandas dataframes
    """
    matches = [tsv for tsv in tsv_list if re.search(suffix, tsv)]
    df_list = {pd.read_csv(str(file), sep = "\t") for file in matches}

    """
    Copy + paste iterative merging code from original clean_tsv
    because there are 3 replicates
    """
    df1_colnames = df_list[0].columns.tolist()
    selected_colnames = df1_colnames[0:17]
    init_mask = self.create_mask(df_list[0]. df1_colnames)
    merged = df_list[0].loc[init_mask]

    for df in df_list[1:]:
        if not df.empty:
        colnames = df.columns.tolist()
        mask = self.create_mask(df, colnames)
        df = df.loc[mask]
        merged = pd.merge(merged, df,
                            on = selected_colnames,
                            how = "outer")
    
    """
    1. Define col_start and col_end so that concatenation
        results in examples like:
        a. 7KO_AvgDeletionRate_BS
        b. 7KO_StdDeletionRate_BS
    2. Create AvgDeletionRate and StdDeletionRate columns
        in merged df
    """
    col_start = subfolder.name.split("-")[0]
    col_end = suffix.split("-")[0]
    avg_col = col_start + "_AvgDeletionRate_" + col_end
    std_col = col_start + "_StdDeletionRate_" + col_end

    calc_merged = self.calc_avg_sd(merged, avg_col, std_col)

    """
    Save merged dataframe as TSV
    """
    merged_dir = reps_dir/f"{subfolder.name}{suffix}.tsv"
    calc_merged.to_csv(merged_dir, sep = "\t", index = False)

In [None]:
suffix = "BS"