In [1]:
## test file paired with KIRA_v2_clean_tsv.py

from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re
from scipy.stats import fisher_exact

## Class initialization

In [2]:
class FilterTSV:
   """
   Helpers that filter per-replicate TSVs and merge them step by step:
   replicates -> WT/7KO -> BS/NBS. Every merge joins on the leading
   N_KEY_COLS columns of the first dataframe involved.
   """

   # Number of leading columns used as the join keys in every merge.
   N_KEY_COLS = 17

   def create_mask(self, df, colnames):
      """
      Build a boolean row mask for `df`.

      NOTES:
      * Select columns that contain "Deletions" (duplicates removed)
      * A row is kept only if none of its "Deletions" columns equals 0
        AND the row contains no nulls in any column

      Parameters:
         df:       pandas DataFrame to filter
         colnames: iterable of column names to search for "Deletions"

      Returns:
         Boolean Series aligned with df.index (True = keep row)
      """
      del_list = sorted(set(col for col in colnames if re.search(r"Deletions", col)))
      has_zero_deletions = (df[del_list] == 0).any(axis = 1)
      has_no_nulls = df.notna().all(axis = 1)
      return ~has_zero_deletions & has_no_nulls

   def calc_avg_sd(self, merged, avg_col, std_col):
      """
      Add row-wise mean and standard deviation of every column whose
      name contains "_DeletionRate_".

      Parameters:
         merged:  pandas DataFrame holding the per-replicate columns
         avg_col: name of the column that receives the row-wise mean
         std_col: name of the column that receives the row-wise std

      Returns:
         A copy of `merged` with the two columns added; the input
         dataframe is left untouched.
      """
      ## Select the rate columns before adding the new ones
      dr_cols = [col for col in merged.columns if re.search("_DeletionRate_", col)]
      result = merged.copy()
      result[avg_col] = merged[dr_cols].mean(axis = 1)
      result[std_col] = merged[dr_cols].std(axis = 1)
      return result

   def merge_reps(self, suffix, tsv_list, subfolder, reps_dir):
      """
      1. Search TSVs for matching suffix in filename
      2. Read the matches in as pandas dataframes
      3. Filter each with create_mask() and outer-merge them on the key columns
      4. Add Avg/Std DeletionRate columns and save as
         <reps_dir>/<subfolder name><suffix>.tsv

      Parameters:
         suffix:    pattern searched in each filename, e.g. "-BS"
         tsv_list:  iterable of TSV paths (Path or str)
         subfolder: path whose name starts the output name, e.g. 7KO-Cyto
         reps_dir:  output directory (created if missing)

      Raises:
         ValueError: if no filename matches `suffix`
      """
      ## Match on the filename only; re.search() cannot take a Path object
      matches = [Path(tsv) for tsv in tsv_list if re.search(suffix, Path(tsv).name)]
      if not matches:
         raise ValueError(f"No TSV filename matches suffix {suffix!r}")

      ## Must be a list (ordered + indexable); dataframes cannot go in a set
      df_list = [pd.read_csv(str(file), sep = "\t") for file in matches]

      ## Iterative merging because there can be more than 2 replicates
      df1_colnames = df_list[0].columns.tolist()
      selected_colnames = df1_colnames[0:FilterTSV.N_KEY_COLS]
      init_mask = self.create_mask(df_list[0], df1_colnames)
      merged = df_list[0].loc[init_mask]

      for rep_df in df_list[1:]:
         if not rep_df.empty:
            rep_mask = self.create_mask(rep_df, rep_df.columns.tolist())
            merged = pd.merge(merged, rep_df.loc[rep_mask],
                              on = selected_colnames,
                              how = "outer")

      ## Column names look like 7KO_AvgDeletionRate_BS / 7KO_StdDeletionRate_BS.
      ## Take the LAST piece of the suffix: "-BS".split("-") is ['', 'BS'].
      subfolder = Path(subfolder)
      col_start = subfolder.name.split("-")[0]
      col_end = suffix.split("-")[-1]
      avg_col = col_start + "_AvgDeletionRate_" + col_end
      std_col = col_start + "_StdDeletionRate_" + col_end

      calc_merged = self.calc_avg_sd(merged, avg_col, std_col)

      ## Save merged dataframe as TSV
      reps_dir = Path(reps_dir)
      reps_dir.mkdir(parents = True, exist_ok = True)
      merged_path = reps_dir/f"{subfolder.name}{suffix}.tsv"
      calc_merged.to_csv(merged_path, sep = "\t", index = False)

   @staticmethod
   def _merge_pair(pattern, tsv_paths, first_marker):
      """
      Shared helper for the two-file merges.

      Reads the first two TSVs whose filename matches `pattern`, orders
      them so the dataframe with a column containing `first_marker`
      comes first, and inner-joins them on the key columns. If one
      dataframe is empty, the other one is returned unmerged.

      Returns:
         (matches, merged): list of matching Paths and the dataframe

      Raises:
         ValueError: if fewer than two filenames match `pattern`
      """
      matches = [Path(tsv) for tsv in tsv_paths if re.search(pattern, Path(tsv).name)]
      if len(matches) < 2:
         raise ValueError(
            f"Expected 2 TSVs matching {pattern!r}, found {len(matches)}"
         )

      df_list = [pd.read_csv(str(file), sep = "\t") for file in matches[:2]]

      ## Test each column NAME; re.search() cannot take an Index object
      marker_in_first = any(re.search(first_marker, col) for col in df_list[0].columns)
      if marker_in_first:
         df1, df2 = df_list[0], df_list[1]
      else:
         df1, df2 = df_list[1], df_list[0]

      ## Column names, not rows: df1[0:17] would slice the first 17 rows
      selected_colnames = df1.columns[0:FilterTSV.N_KEY_COLS].tolist()

      if not df1.empty and not df2.empty:
         merged = pd.merge(df1, df2, on = selected_colnames, how = "inner")
      elif df1.empty:
         merged = df2
      else:
         merged = df1

      return matches, merged

   @staticmethod
   def merge_WT_7KO(matching_name, merged_reps_tsv, wt_7ko_dir):
      """
      Merge the WT and 7KO replicate-merged TSVs whose filename matches
      `matching_name`, with the WT columns appearing first.

      1. If both dataframes are non-empty, merge w/ inner join;
         otherwise keep whichever one is non-empty
      2. Create output name, e.g., 7KO-Cyto-BS -> Cyto-BS
      3. Save merged dataframe as <wt_7ko_dir>/<output name>.tsv

      Declared as a staticmethod so it can be called on the class or
      on an instance with the same three arguments.

      Raises:
         ValueError: if fewer than two filenames match `matching_name`
      """
      matches, merged = FilterTSV._merge_pair(matching_name, merged_reps_tsv, "WT")

      separator = "-"
      base = (matches[0].stem).split(separator) ## Obtain ['7KO', 'Cyto', 'BS']
      output_name = separator.join(base[1:]) ## Obtain Cyto-BS

      wt_7ko_dir = Path(wt_7ko_dir)
      wt_7ko_dir.mkdir(parents = True, exist_ok = True)
      merged.to_csv(wt_7ko_dir/f"{output_name}.tsv", sep = "\t", index = False)

   @staticmethod
   def merge_BS_NBS(fraction, merged_wt_7ko_tsv, bs_nbs_dir):
      """
      Merge the BS and NBS TSVs whose filename matches `fraction`,
      with the BS columns appearing first.

      1. If both dataframes are non-empty, merge w/ inner join;
         otherwise keep whichever one is non-empty
      2. Create output name, e.g., Cyto-BS -> Cyto
      3. Save merged dataframe as <bs_nbs_dir>/<output name>.tsv

      Declared as a staticmethod so it can be called on the class or
      on an instance with the same three arguments.

      Raises:
         ValueError: if fewer than two filenames match `fraction`
      """
      matches, merged = FilterTSV._merge_pair(fraction, merged_wt_7ko_tsv, "_BS")

      output_name = (matches[0].stem).split("-")[0]

      bs_nbs_dir = Path(bs_nbs_dir)
      bs_nbs_dir.mkdir(parents = True, exist_ok = True)
      merged.to_csv(bs_nbs_dir/f"{output_name}.tsv", sep = "\t", index = False)

## Loading in data

In [3]:
# Everything is resolved relative to the directory the notebook is run from.
current_path = Path.cwd()
input_dir = current_path.joinpath("calculations")

# Single helper instance shared by the test cells below.
filtertsv = FilterTSV()

In [4]:
## Verifying tsv_folder and tsv_list
EXPECTED_TSV_COUNT = 6  # same expectation for every subfolder


def rep_sort_key(path):
    """Sort key: replicate number from 'RepN' in the filename, then the name.

    Files without a 'RepN' token sort last instead of raising AttributeError.
    """
    found = re.search(r"Rep(\d+)", path.name)
    rep = int(found.group(1)) if found else float("inf")
    return (rep, path.name)


# iterdir() and glob() yield entries in arbitrary order. Later cells reuse the
# `subfolder` and `tsv_list` left over from the LAST iteration, so sort to make
# that leftover state identical on every run.
for subfolder in sorted(input_dir.iterdir()):
    # Skip stray files before reporting anything about them.
    if not subfolder.is_dir():
        continue

    # `subfolder` already includes input_dir, so it is not joined again.
    tsv_folder = subfolder/"individual_tsv"

    print(f"\nNOW PRINTING TSV_FOLDER FOR SUBFOLDER: {subfolder.name}", 
          "Expected: Directory for individual_tsv folder", 
          f"Actual: {tsv_folder}", sep = "\n")

    tsv_list = sorted(tsv_folder.glob("*.tsv"), key = rep_sort_key)

    print(f"\nNOW PRINTING TSV_LIST FOR SUBFOLDER: {subfolder.name}")
    print(f"Expected amount of files: {EXPECTED_TSV_COUNT}")
    print(f"Actual: {len(tsv_list)}")
    if len(tsv_list) != EXPECTED_TSV_COUNT:
        # Make a mismatch visible rather than leaving it to be spotted by eye.
        print(f"WARNING: file count mismatch in {subfolder.name}")


NOW PRINTING TSV_FOLDER FOR SUBFOLDER: 7KO-Cyto
Expected: Directory for individual_tsv folder
Actual: c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Cyto\individual_tsv

NOW PRINTING TSV_LIST FOR SUBFOLDER: 7KO-Cyto
Expected amount of files: 6
Actual: 6

NOW PRINTING TSV_FOLDER FOR SUBFOLDER: 7KO-Nuc
Expected: Directory for individual_tsv folder
Actual: c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv

NOW PRINTING TSV_LIST FOR SUBFOLDER: 7KO-Nuc
Expected amount of files: 6
Actual: 5


## Testing helper functions in FilterTSV()

### merge_reps()

In [5]:
## Testing 'matches' variable
print("NOW TESTING MATCHES VARIABLE")
reps_dir = current_path.joinpath("merged_reps")

# Split the last subfolder's TSVs into BS and NBS groups by filename stem.
bs_suffix = "-BS"
nbs_suffix = "-NBS"
bs_matches = list(filter(lambda tsv: re.search(bs_suffix, tsv.stem), tsv_list))
nbs_matches = list(filter(lambda tsv: re.search(nbs_suffix, tsv.stem), tsv_list))

# One line per item: the expectation, the "Actual:" label, then each path.
print("\n".join(["Expected: List of paths where filename contains \"-BS\"",
                 "Actual:",
                 *map(str, bs_matches)]))
print("\n".join(["\nExpected: List of paths where filename contains \"-NBS\"",
                 "Actual:",
                 *map(str, nbs_matches)]))


NOW TESTING MATCHES VARIABLE
Expected: List of paths where filename contains "-BS"
Actual:
c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv\KEH-Rep1-7KO-HEK293T-Nuc-BS.sorted.tsv
c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv\KEH-Rep2-7KO-HEK293T-Nuc-BS.sorted.tsv
c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv\KEH-Rep3-7KO-HEK293T-Nuc-BS.sorted.tsv

Expected: List of paths where filename contains "-NBS"
Actual:
c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv\KEH-Rep1-7KO-HEK293T-Nuc-NBS.sorted.tsv
c:\Users\Sonia Ling\Desktop\calculate_dr\clean_tsv\testing\calculations\7KO-Nuc\individual_tsv\KEH-Rep3-7KO-HEK293T-Nuc-NBS.sorted.tsv


In [None]:
## Testing 'df_list' variable
# Read every BS replicate, then seed `merged` with the filtered first replicate.
bs_list = list(map(lambda file: pd.read_csv(str(file), sep = "\t"), bs_matches))
df1_colnames = list(bs_list[0].columns)
selected_colnames = df1_colnames[:17]  # leading columns used as merge keys
init_mask = filtertsv.create_mask(df = bs_list[0], colnames = df1_colnames)
merged = bs_list[0][init_mask]

In [None]:
## Testing iterative merging
# Re-seed `merged` from the first replicate so this cell gives the same result
# every time it is run. Without this, re-running the cell merges Rep2/Rep3 into
# an already merged frame and pandas adds _x/_y suffixed duplicate columns.
merged = bs_list[0].loc[init_mask]

for df in bs_list[1:]:
    if df.empty:
        continue  # an empty replicate contributes nothing to the merge
    colnames = df.columns.tolist()
    mask = filtertsv.create_mask(df, colnames)
    df = df.loc[mask]
    merged = pd.merge(merged, df,
                      on = selected_colnames,
                      how = "outer")

merged.columns

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,Rep2_RealRate_BS,Rep2_TotalCoverage_BS,Rep3_A_BS,Rep3_C_BS,Rep3_G_BS,Rep3_T_BS,Rep3_Deletions_BS,Rep3_DeletionRate_BS,Rep3_RealRate_BS,Rep3_TotalCoverage_BS
0,rna-NM_000071.3,UCUAA,3UTR,NC_000021.9,-,2017,2022,2019,43053666,2495,...,,,182.0,0.0,0.0,0.0,1.0,0.005464,0.040133,183.0
1,rna-NM_000077.5,UUUAA,3UTR,NC_000009.12,-,647,652,649,21968080,978,...,0.296106,135.0,,,,,,,,
2,rna-NM_000084.5,UCUAA,3UTR,NC_000023.11,+,7495,7500,7497,50097256,9472,...,,,,,,,,,,
3,rna-NM_000135.4,UAUAG,3UTR,NC_000016.10,-,5180,5185,5182,89737818,5452,...,,,76.0,8.0,0.0,0.0,1.0,0.011765,0.249398,85.0
4,rna-NM_000135.4,UUUAG,3UTR,NC_000016.10,-,4509,4514,4511,89738489,5452,...,,,65.0,1.0,0.0,2.0,1.0,0.014493,0.125808,69.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9238,rna-XM_054333307.1,UAUAG,3UTR,NW_025791813.1,-,9215,9220,9217,19873,10213,...,,,,,,,,,,
9239,rna-XM_054333618.1,UGUAA,3UTR,NT_187693.1,+,4154,4159,4156,442661,5806,...,,,,,,,,,,
9240,rna-XM_054333619.1,UGUAA,3UTR,NT_187693.1,+,4136,4141,4138,442661,5788,...,,,,,,,,,,
9241,rna-XM_054333620.1,UGUAA,3UTR,NT_187693.1,+,4140,4145,4142,442661,5792,...,,,,,,,,,,


### Testing calc_avg_sd()

In [None]:
# Build the output column names, e.g. 7KO_AvgDeletionRate_BS / 7KO_StdDeletionRate_BS.
# `subfolder` is whatever the folder-verification loop left behind.
col_start = subfolder.name.partition("-")[0]
col_end = "-BS".rpartition("-")[2]
avg_col = "".join([col_start, "_AvgDeletionRate_", col_end])
std_col = "".join([col_start, "_StdDeletionRate_", col_end])

In [96]:
# Row-wise mean and standard deviation across the per-replicate DeletionRate columns.
dr_col = list(filter(lambda col: re.search("_DeletionRate_", col), merged.columns))
merged[avg_col] = merged.loc[:, dr_col].mean(axis = "columns")
merged[std_col] = merged.loc[:, dr_col].std(axis = "columns")

# Show only the columns after the 17 leading key columns.
merged.iloc[:, 17:].head(5)

Unnamed: 0,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep1_DeletionRate_BS,Rep1_RealRate_BS,Rep1_TotalCoverage_BS,Rep2_A_BS,Rep2_C_BS,...,Rep3_A_BS,Rep3_C_BS,Rep3_G_BS,Rep3_T_BS,Rep3_Deletions_BS,Rep3_DeletionRate_BS,Rep3_RealRate_BS,Rep3_TotalCoverage_BS,7KO_AvgDeletionRate_BS,7KO_StdDeletionRate_BS
0,,,,,,,,,,,...,182.0,0.0,0.0,0.0,1.0,0.005464,0.040133,183.0,0.005464,
1,,,,,,,,,128.0,0.0,...,,,,,,,,,0.051852,
2,0.0,0.0,0.0,21.0,1.0,0.045455,0.304155,22.0,,,...,,,,,,,,,0.045455,
3,,,,,,,,,,,...,76.0,8.0,0.0,0.0,1.0,0.011765,0.249398,85.0,0.011765,
4,73.0,0.0,0.0,0.0,3.0,0.039474,0.297856,76.0,,,...,65.0,1.0,0.0,2.0,1.0,0.014493,0.125808,69.0,0.026983,0.017664


In [None]:
# Inspect one merged row (positional index 5) with its missing values dropped.
# Interpretation: a row has a null standard deviation when fewer than two of its
# DeletionRate columns are populated (presumably because pandas' default sample
# standard deviation is undefined for a single value; confirm against .std() docs).
merged.iloc[5].dropna()

TranscriptID              rna-NM_000141.5
Motif                               UUUAA
Region                               3UTR
Chrom                        NC_000010.11
Strand                                  -
TranscriptPosStart                   4321
TranscriptPosEnd                     4326
TranscriptModBase                    4323
GenomicModBase                  121478632
TranscriptLength                     4624
DistFromAUG                          3690
DistFromSTOP                         1225
DistFromExonStart                    1389
DistFromExonEnd                       300
fit_c                            0.967359
fit_s                            0.866394
fit_b                               0.001
Rep3_A_BS                            47.0
Rep3_C_BS                             0.0
Rep3_G_BS                             0.0
Rep3_T_BS                             0.0
Rep3_Deletions_BS                     4.0
Rep3_DeletionRate_BS             0.078431
Rep3_RealRate_BS                 0