In [1]:
from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re

## Silence pandas' chained-assignment warning (SettingWithCopyWarning), which
## is raised e.g. when a column is assigned into a filtered slice of a DataFrame.
## NOTE(review): this is a notebook-wide switch, so it also hides any genuine
## chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None

### Loading in data

In [2]:
## Read the sorted BS TSV located in the current working directory
current_path = Path.cwd()
tsv_path = current_path / "KEH-Rep2-7KO-HEK293T-Cyto-BS.sorted.tsv"
df = pd.read_csv(tsv_path, sep = "\t")

## Preview the first five rows
df.head(n = 5)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,fit_s,fit_b,Rep2_A_BS,Rep2_C_BS,Rep2_G_BS,Rep2_T_BS,Rep2_Deletions_BS,Rep2_DeletionRate_BS,Rep2_RealRate_BS,TotalCoverage
0,rna-XM_005245453.2,UUUAG,3UTR,NC_000001.11,-,3857,3862,3859,205715358,6396,...,0.901845,0.001,249,0,0,0,24,0.087912,0.506265,273
1,rna-NM_022731.5,UUUAG,3UTR,NC_000001.11,-,3860,3865,3862,205715358,6399,...,0.901845,0.001,249,0,0,0,24,0.087912,0.506265,273
2,rna-NM_012458.4,UAUAG,3UTR,NC_000019.10,-,1250,1255,1252,2426036,1664,...,0.966038,0.001,8,15,0,0,2,0.08,0.735916,25
3,rna-NM_001322267.2,UCUAA,3UTR,NC_000009.12,+,1730,1735,1732,113291699,2939,...,0.890419,0.001,0,0,0,25,2,0.074074,0.426971,27
4,rna-NM_004697.5,UCUAA,3UTR,NC_000009.12,+,1691,1696,1693,113291699,2900,...,0.890419,0.001,0,0,0,25,2,0.074074,0.426971,27


In [19]:
## The first 14 columns are the ones that are always the same
## across all TSVs (the best-fit values are not among them)
constant_cols = list(df.columns[:14])

In [20]:
## Drop every row that has at least one missing value
df_clean = df.dropna(axis = "index", how = "any")

In [26]:
## Example names used by the tests below:
## subfolder feeds make_key(), folder_name feeds get_sample_group()
subfolder = "KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned"
folder_name = "7KO-Cyto-BS_processed_fastqs"
print(f"{subfolder}\n{folder_name}")

KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned
7KO-Cyto-BS_processed_fastqs


### Testing make_key()

In [27]:
## Function definition
def make_key(subfolder, base_key):
    """
    PURPOSE:
    Builds a dictionary key / column name of the form
    '<Rep>_<base_key>_<Sample>' from the Rep # (detected via RegEx)
    and Sample Type (BS, NBS) in a given subfolder name.
    ---
    EXAMPLE:
    make_key('KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned', 'A') -> 'Rep2_A_BS'
    ---
    NOTES:
    * subfolder is converted with str(), so Path objects are accepted
    * rep_list: Unique 'Rep<N>' tokens sorted by their integer N
      (Rep2 before Rep10); the first one delimited as '-Rep<N>-' is used
    * Sample type is taken from a '-BS_' or '-NBS_' marker
    ---
    RAISES:
    ValueError if no '-Rep<N>-' token or no '-BS_'/'-NBS_' marker is found
    """
    folder_str = str(subfolder)

    rep_matches = re.findall(r"Rep\d+", folder_str)
    ## Sort numerically; a plain string sort would put Rep10 before Rep2
    rep_list = sorted(set(rep_matches), key = lambda rep: int(rep[3:]))

    ## Replicate prefix: first rep that appears as its own '-'-delimited token
    prefix = next((rep + "_" for rep in rep_list
                   if f"-{rep}-" in folder_str), None)
    if prefix is None:
        raise ValueError(f"No '-Rep<N>-' token found in subfolder name: {folder_str!r}")

    ## Sample-type suffix: '-NBS_' never contains '-BS_', so the order is safe
    suffix = next(("_" + sample for sample in ["BS", "NBS"]
                   if f"-{sample}_" in folder_str), None)
    if suffix is None:
        raise ValueError(f"No '-BS_' or '-NBS_' marker found in subfolder name: {folder_str!r}")

    return prefix + base_key + suffix

"""
PURPOSE:
* Showcases how rep_list is supposed to behave
* Reps should be listed in integer order
---
NOTES: 
* These variables are not used outside of
  this cell; for demonstration purposes only
"""

## Each case: (banner to print, folder names to scan, expected list as text)
## Test 1: Rep1, Rep2, Rep3
## Test 2: Rep1, Rep5, Rep10
rep_cases = [
    ("NOW TESTING REP_LIST",
     ["KEH-Rep3-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned"],
     "['Rep1', 'Rep2', 'Rep3']"),
    ("\nNOW TESTING REP_LIST",
     ["KEH-Rep10-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep5-7KO-HEK293T-Cyto-BS_star_realigned"],
     "['Rep1', 'Rep5', 'Rep10']"),
]

for banner, folder_list, expected_text in rep_cases:
    ## Collect every 'Rep<N>' token, de-duplicate, then order by the integer N
    rep_matches = re.findall(r"Rep\d+", str(folder_list))
    rep_list = list(set(rep_matches))
    rep_list.sort(key = lambda x: int(x[3:]))

    print(banner)
    print(f"Expected: {expected_text}, Actual: {rep_list}")

NOW TESTING REP_LIST
Expected: ['Rep1', 'Rep2', 'Rep3'], Actual: ['Rep1', 'Rep2', 'Rep3']

NOW TESTING REP_LIST
Expected: ['Rep1', 'Rep5', 'Rep10'], Actual: ['Rep1', 'Rep5', 'Rep10']


In [28]:
## Map each base column name to its replicate/sample-specific name
base_keys = ["A", "C", "G", "T", "Deletions", "DeletionRate", "RealRate", "TotalCoverage"]
key = {base_key: make_key(subfolder, base_key) for base_key in base_keys}

## Derive the labels from a generated name ('<Rep>_A_<Sample>') so the printed
## expectation always matches the current subfolder instead of a hard-coded rep
rep_label, sample_label = key["A"].split("_A_")
base_key_options = "|".join(base_keys)

print("NOW TESTING KEY VARIABLE",
      "---", sep = "\n")
print(f"1. Recall that subfolder: {subfolder}",
      f"2. From here, we select out {rep_label} and {sample_label}",
      f"3. Expected format: {rep_label}_({base_key_options})_{sample_label}", sep = "\n")
key

NOW TESTING KEY VARIABLE
---
1. Recall that subfolder: KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned
2. From here, we select out Rep1 and BS
3. Expected format: Rep1_(A|C|G|T|Deletions|DeletionRate|RealRate|TotalCoverage)_BS


{'A': 'Rep2_A_BS',
 'C': 'Rep2_C_BS',
 'G': 'Rep2_G_BS',
 'T': 'Rep2_T_BS',
 'Deletions': 'Rep2_Deletions_BS',
 'DeletionRate': 'Rep2_DeletionRate_BS',
 'RealRate': 'Rep2_RealRate_BS',
 'TotalCoverage': 'Rep2_TotalCoverage_BS'}

In [29]:
## Dynamically rename columns in df_clean: each base column name is
## swapped for its replicate/sample-specific name looked up in `key`
rename_map = {base_col: key[base_col]
              for base_col in ["A", "C", "G", "T", "Deletions",
                               "DeletionRate", "RealRate", "TotalCoverage"]}
df_final = df_clean.rename(columns = rename_map)

## Verify that it worked: show one row of the non-constant columns,
## skipping the first three of them
diff_cols = df_final.columns.difference(constant_cols, sort = False)
renamed_cols = diff_cols[3:]
df_final[renamed_cols].head(1)

Unnamed: 0,Rep2_A_BS,Rep2_C_BS,Rep2_G_BS,Rep2_T_BS,Rep2_Deletions_BS,Rep2_DeletionRate_BS,Rep2_RealRate_BS,Rep2_TotalCoverage_BS
0,249,0,0,0,24,0.087912,0.506265,273


### Testing get_sample_group()

In [30]:
## Function definition
def get_sample_group(folder_name):
    """
    PURPOSE:
    Given an input folder name, extract the group name
    by returning the first capture group in RegEx.
    ---
    EXAMPLE: 
    '7KO-Cyto-BS_processed_fastqs' -> '7KO-Cyto'
    ---
    NOTES:
    * folder_name is converted with str(), so Path objects are accepted
    ---
    RAISES:
    ValueError if folder_name does not start with
    '<group>-BS_processed_fastqs' or '<group>-NBS_processed_fastqs'
    """
    folder_str = str(folder_name)
    match = re.match(r"(.+)-(?:BS|NBS)_processed_fastqs", folder_str)
    ## re.match returns None on failure; fail with a clear message
    ## instead of an AttributeError on None
    if match is None:
        raise ValueError("Folder name does not look like "
                         f"'<group>-(BS|NBS)_processed_fastqs': {folder_str!r}")
    return match.group(1)

In [31]:
## Banner, followed by one "expected vs. actual" line per test folder
banner_lines = ["NOW TESTING GET_SAMPLE_GROUP()",
                f"Recall that folder_name: {folder_name}",
                "---"]
print("\n".join(banner_lines))

group_cases = [("Expected: 7KO-Cyto, Actual:", folder_name),
               ("Expected: 7KO-Cyto, Actual:", "7KO-Cyto-NBS_processed_fastqs"),
               ("Expected: WT-Nuc, Actual:", "WT-Nuc-BS_processed_fastqs"),
               ("Expected: WT-Nuc, Actual:", "WT-Nuc-NBS_processed_fastqs")]
for label, test_folder in group_cases:
    print(label, get_sample_group(test_folder))

NOW TESTING GET_SAMPLE_GROUP()
Recall that folder_name: 7KO-Cyto-BS_processed_fastqs
---
Expected: 7KO-Cyto, Actual: 7KO-Cyto
Expected: 7KO-Cyto, Actual: 7KO-Cyto
Expected: WT-Nuc, Actual: WT-Nuc
Expected: WT-Nuc, Actual: WT-Nuc


### Testing filtering & cutoffs

In [33]:
## Filter by DeletionRate based on keywords in: 
## (a) folder_name
## (b) DeletionRate column (accessed via key)
##
## WT:
## * BS files must have DeletionRate values of >= 0.8
## * NBS files must have DeletionRate values of <= 0.1
## ---
## Mutation (PUS7KO):
## * BS files must have DeletionRate values of <= 0.1
print("\n".join(["NOW TESTING COLUMN NAME RETRIEVAL", "---"]))

## Column holding the deletion rate for this replicate/sample
dr_pattern = key["DeletionRate"]
print("\n".join(["Expected: Rep2_DeletionRate_BS", f"Actual: {dr_pattern}"]))

## Column holding the real rate for this replicate/sample
rr_pattern = key["RealRate"]
print("\n".join(["\nExpected: Rep2_RealRate_BS", f"Actual: {rr_pattern}"]))

NOW TESTING COLUMN NAME RETRIEVAL
---
Expected: Rep2_DeletionRate_BS
Actual: Rep2_DeletionRate_BS

Expected: Rep2_RealRate_BS
Actual: Rep2_RealRate_BS


In [34]:
## Testing RealRate filtering: split df_final at a RealRate of 0.3
print("\n".join(["NOW TESTING REALRATE FILTERING", "---"]))

rr_values = df_final[rr_pattern]
n_original = len(df_clean)

## Rows at or above the cutoff are kept ...
kept_rr = df_final[rr_values.ge(0.3)]
print(f"Size of original df: {n_original}")
print(f"Size of kept_rr: {len(kept_rr)}")

## ... rows below it are dropped
dropped_rr = df_final[rr_values.lt(0.3)]
print(f"\nSize of original df: {n_original}")
print(f"Size of dropped_rr: {len(dropped_rr)}")

## Compare the dropped count with the original size minus the kept count
print(f"\nExpected size of dropped_rr: {n_original - len(kept_rr)}")
print(f"Actual: {len(dropped_rr)}")

NOW TESTING REALRATE FILTERING
---
Size of original df: 35
Size of kept_rr: 35

Size of original df: 35
Size of dropped_rr: 0

Expected size of dropped_rr: 0
Actual: 0


In [35]:
## Testing coverage filtering
print("NOW TESTING COVERAGE FILTERING",
      "---", sep = "\n")
## Count columns whose names contain 'A_', 'C_', 'G_', 'T_' or 'Deletions_'
coverage_list = [col for col in kept_rr.columns 
                 if re.search("(A|C|G|T|Deletions)_.*", col)]
## kept_rr is a filtered selection of df_final, so the new column is added
## with assign() (which returns a new DataFrame) rather than by assigning into
## the selection; that avoids the chained-assignment pattern pandas warns about
kept_rr = kept_rr.assign(TotalCoverage = kept_rr[coverage_list].sum(axis = 1))
## Keep only rows with a summed coverage of at least 20
kept_cov = kept_rr[kept_rr["TotalCoverage"].ge(20)]

print(f"Size of kept_rr: {len(kept_rr)}",
      f"Size of kept_cov: {len(kept_cov)}", sep = "\n")

NOW TESTING COVERAGE FILTERING
---
Size of kept_rr: 35
Size of kept_cov: 35


In [36]:
## Testing DeletionRate filtering
print("NOW TESTING DELETIONRATE FILTERING",
      "---", sep = "\n")
folder_str = str(folder_name)
is_bs = "_BS" in dr_pattern

## Start from the RealRate/coverage-filtered frame. Cases without a
## DeletionRate rule (e.g. 7KO NBS) then still carry the earlier filters
## instead of silently keeping whatever df_final held before this cell.
df_final = kept_cov

if folder_str.startswith("WT"):
    ## WT: BS must be >= 0.8, NBS must be <= 0.1
    if is_bs:
        df_final = kept_cov[kept_cov[dr_pattern].ge(0.8)]
    else: 
        df_final = kept_cov[kept_cov[dr_pattern].le(0.1)]
elif folder_str.startswith("7KO") and is_bs:
    ## THIS LINE SHOULD EXECUTE
    ## Recall: folder_name = 7KO-Cyto-BS_processed_fastqs
    df_final = kept_cov[kept_cov[dr_pattern].le(0.1)]

## Only recognised sample groups are sorted by DeletionRate (highest first)
if folder_str.startswith(("WT", "7KO")):
    df_final = df_final.sort_values(by = dr_pattern, ascending = False)

print(f"Size of original df: {len(df_clean)}",
      f"Size of kept_rr: {len(kept_rr)}",
      f"Size of kept_cov: {len(kept_cov)}",
      f"Size of final filtered df: {len(df_final)}", sep = "\n")

rows_removed = (len(df_clean) - len(df_final))
## Guard against an empty df_clean, which would otherwise divide by zero
percent_change = round((rows_removed/len(df_clean)) * 100) if len(df_clean) else 0
print(f"\nTotal rows removed: {rows_removed}",
      f"The dataset shrunk by approximately {percent_change}% after filtering",
      sep = "\n")

NOW TESTING DELETIONRATE FILTERING
---
Size of original df: 35
Size of kept_rr: 35
Size of kept_cov: 35
Size of final filtered df: 35

Total rows removed: 0
The dataset shrunk by approximately 0% after filtering


### Investigating results of df_final

In [37]:
## Preview the first 10 rows of df_final
df_final.head(10)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,fit_b,Rep2_A_BS,Rep2_C_BS,Rep2_G_BS,Rep2_T_BS,Rep2_Deletions_BS,Rep2_DeletionRate_BS,Rep2_RealRate_BS,Rep2_TotalCoverage_BS,TotalCoverage
0,rna-XM_005245453.2,UUUAG,3UTR,NC_000001.11,-,3857,3862,3859,205715358,6396,...,0.001,249,0,0,0,24,0.087912,0.506265,273,273
1,rna-NM_022731.5,UUUAG,3UTR,NC_000001.11,-,3860,3865,3862,205715358,6399,...,0.001,249,0,0,0,24,0.087912,0.506265,273,273
2,rna-NM_012458.4,UAUAG,3UTR,NC_000019.10,-,1250,1255,1252,2426036,1664,...,0.001,8,15,0,0,2,0.08,0.735916,25,25
3,rna-NM_001322267.2,UCUAA,3UTR,NC_000009.12,+,1730,1735,1732,113291699,2939,...,0.001,0,0,0,25,2,0.074074,0.426971,27,27
4,rna-NM_004697.5,UCUAA,3UTR,NC_000009.12,+,1691,1696,1693,113291699,2900,...,0.001,0,0,0,25,2,0.074074,0.426971,27,27
5,rna-NM_001322266.2,UCUAA,3UTR,NC_000009.12,+,1727,1732,1729,113291699,2936,...,0.001,0,0,0,25,2,0.074074,0.426971,27,27
6,rna-NM_001244926.2,UCUAA,3UTR,NC_000009.12,+,1688,1693,1690,113291699,2897,...,0.001,0,0,0,25,2,0.074074,0.426971,27,27
12,rna-XM_017021870.3,UCUAA,3UTR,NC_000015.10,-,4590,4595,4592,52547741,5289,...,0.001,26,0,0,0,2,0.071429,0.417143,28,28
15,rna-NM_001306195.1,UCUAA,3UTR,NC_000015.10,-,4645,4650,4647,52547741,5344,...,0.001,26,0,0,0,2,0.071429,0.417143,28,28
14,rna-NM_006628.6,UCUAA,3UTR,NC_000015.10,-,4667,4672,4669,52547741,5366,...,0.001,26,0,0,0,2,0.071429,0.417143,28,28


In [38]:
## Motifs present before filtering vs. after filtering
original_motifs = df_clean["Motif"].unique()
final_motifs = df_final["Motif"].unique()
## Sorted so the report is deterministic (set ordering is arbitrary)
set_diff = sorted(set(original_motifs) - set(final_motifs))

## Report every missing motif; "the only" is used solely when exactly one is missing
if len(set_diff) == 1:
    print(f"The only UNUAR motif missing in the final dataframe is {set_diff[0]}.")
elif set_diff:
    print(f"The UNUAR motifs missing in the final dataframe are {', '.join(map(str, set_diff))}.")

In [None]:
## For priority_filtered
# df_final = df.drop(columns = constant_cols)

In [None]:
## Find counts for each UNUAR motif
# df["Motif"].groupby()

### Verifying DeletionRate and RealRate

In [39]:
## Testing DeletionRate calculations
print("NOW TESTING DELETIONRATE CALCULATIONS",
      "---", sep = "\n")
## Only the first row of df_final is verified
first_row = df_final[diff_cols].head(1)
if first_row.empty:
    raise ValueError("df_final has no rows, so there is nothing to verify")

## Sum of the five columns at positions 3-7 of diff_cols
## (presumably the A/C/G/T/Deletions counts; confirm against diff_cols)
count_cols = diff_cols[3:8].to_list()
total_sum = int(first_row[count_cols].sum().sum())
## Hand-computed total, using the counts shown for the first row in the preview above
manual_sum = 249 + 0 + 0 + 0 + 24
print(f"Expected total sum: {manual_sum}", 
      f"Actual total sum: {total_sum}", sep = "\n")

## First column whose name contains 'Deletions'
deletion_col = [col for col in diff_cols if re.search("Deletions.*", col)]
deletion_ct = int(first_row[deletion_col[0]].iloc[0])
print("Expected deletion ct: 24",
      f"Actual deletion ct: {deletion_ct}", sep = "\n")

## Recompute the rate as deletions / total and compare with the stored value
dr_col = [col for col in diff_cols if re.search("DeletionRate.*", col)]
expected_dr = float(deletion_ct/total_sum)
actual_dr = float(first_row[dr_col[0]].iloc[0])
print(f"Expected deletion rate: {expected_dr}",
      f"Actual deletion rate: {actual_dr}", sep = "\n")

## The two values are compared after rounding to 5 decimal places
if round(expected_dr, 5) == round(actual_dr, 5):
    print("\nDeletion rate is correct!")
else:
    print("\nDeletion rate not calculated correctly")

NOW TESTING DELETIONRATE CALCULATIONS
---
Expected total sum: 273
Actual total sum: 273
Expected deletion ct: 24
Actual deletion ct: 24
Expected deletion rate: 0.08791208791208792
Actual deletion rate: 0.0879120879120879

Deletion rate is correct!


In [40]:
## Testing RealRate calculations against the stored value in the first row
print("\n".join(["NOW TESTING REALRATE CALCULATIONS", "---"]))
rr_col = list(filter(lambda col: re.search("RealRate", col), diff_cols))
actual_rr = first_row[rr_col[0]].iloc[0]

## Fit parameters stored alongside the first row
fit_c, fit_s, fit_b = (first_row[fit_name].iloc[0]
                       for fit_name in ("fit_c", "fit_s", "fit_b"))

## expected_rr = (fit_b - dr) / (fit_c * (fit_b + fit_s - fit_s * dr - 1)),
## with dr being the deletion rate recomputed in the previous cell
num = fit_b - expected_dr
denom = fit_c * (fit_b + fit_s - fit_s * expected_dr - 1)
expected_rr = num/denom

print(f"Expected real rate: {expected_rr}")
print(f"Actual real rate: {actual_rr}")

## The two values are compared after rounding to 5 decimal places
rates_agree = round(expected_rr, 5) == round(actual_rr, 5)
print("\nReal rate is correct!" if rates_agree
      else "\nReal rate not calculated correctly")

NOW TESTING REALRATE CALCULATIONS
---
Expected real rate: 0.5062647045046491
Actual real rate: 0.5062647045046491

Real rate is correct!
