In [1]:
from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re

## Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.set_option("mode.chained_assignment", None)

### Loading in data

In [2]:
## Read the tab-separated input table into a DataFrame
tsv_path = "C:/Users/Sonia Ling/Desktop/calculate_dr/KEH-Rep2-7KO-HEK293T-Cyto-BS.sorted.tsv"
df = pd.read_csv(tsv_path, sep = "\t")

## Preview the first rows
df.head(n = 5)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,fit_c,fit_s,fit_b,A,C,G,T,Deletions,DeletionRate,RealRate
0,rna-NM_021254.4,UGUAA,3UTR,NC_000021.9,-,1102,1107,1104,32601765,3516,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
1,rna-NM_001350335.2,UGUAA,3UTR,NC_000021.9,-,2193,2198,2195,32601765,2294,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
2,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
3,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
4,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417


In [3]:
## The first 17 columns are shared by every TSV
constant_cols = list(df.columns[:17])
constant_cols

['TranscriptID',
 'Motif',
 'Region',
 'Chrom',
 'Strand',
 'TranscriptPosStart',
 'TranscriptPosEnd',
 'TranscriptModBase',
 'GenomicModBase',
 'TranscriptLength',
 'DistFromAUG',
 'DistFromSTOP',
 'DistFromExonStart',
 'DistFromExonEnd',
 'fit_c',
 'fit_s',
 'fit_b']

In [4]:
## Keep only the rows that have no missing value in any column
df_clean = df.dropna(axis = 0, how = "any")

In [5]:
## Example folder names used as inputs for the tests in this notebook
## NOTE(review): subfolder says Rep1, but the TSV read into df is named Rep2 -- confirm this is intended
subfolder, folder_name = ("KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned",
                          "7KO-Cyto-BS_processed_fastqs")
print(f"{subfolder}\n{folder_name}")

KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned
7KO-Cyto-BS_processed_fastqs


### Testing make_key()

In [6]:
## Function definition
def make_key(subfolder, base_key):
    """
    PURPOSE:
    Modifies names of dictionary keys based on the Rep # (detected via RegEx)
    and Sample Type (BS, NBS) in a given subfolder name.
    ---
    ARGS:
    * subfolder: Subfolder name (str or Path-like; converted with str()),
      e.g. 'KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned'
    * base_key: Base column name to decorate, e.g. 'DeletionRate'
    ---
    RETURNS:
    '<Rep#>_<base_key>_<BS|NBS>', e.g. 'Rep1_DeletionRate_BS'
    ---
    RAISES:
    * ValueError: if the subfolder name contains no '-Rep<N>-' token or
      no '-BS_' / '-NBS_' token (previously this surfaced as an
      UnboundLocalError on the return line)
    ---
    NOTES:
    * sorted(set(rep_matches), key = ...): Removes duplicate reps, sorts by
      replicate NUMBER so that Rep2 comes before Rep10
    * The first rep (in that order) that appears as '-Rep<N>-' becomes the prefix
    * The first of 'BS', 'NBS' that appears as '-<sample>_' becomes the suffix
    """
    subfolder_name = str(subfolder)

    rep_matches = re.findall(r"Rep\d+", subfolder_name)
    ## rep[3:] strips the literal 'Rep' so the sort is numeric, not lexicographic
    rep_list = sorted(set(rep_matches), key = lambda rep: int(rep[3:]))

    prefix = next((rep + "_" for rep in rep_list
                   if f"-{rep}-" in subfolder_name), None)
    if prefix is None:
        raise ValueError(
            f"No '-Rep<N>-' token found in subfolder name: {subfolder_name!r}")

    suffix = next(("_" + sample for sample in ["BS", "NBS"]
                   if f"-{sample}_" in subfolder_name), None)
    if suffix is None:
        raise ValueError(
            f"No '-BS_' or '-NBS_' token found in subfolder name: {subfolder_name!r}")

    return prefix + base_key + suffix

In [7]:
"""
PURPOSE:
* Showcases how rep_list is supposed to behave
* Reps should be listed in integer order
"""

## Each case pairs the message to print with the folder names to scan
rep_test_cases = [
    ## Test 1: Rep1, Rep2, Rep3
    ("Expected: ['Rep1', 'Rep2', 'Rep3'], Actual: ",
     ["KEH-Rep3-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned"]),
    ## Test 2: Rep1, Rep5, Rep10
    ("Expected: ['Rep1', 'Rep5', 'Rep10'], Actual: ",
     ["KEH-Rep10-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned",
      "KEH-Rep5-7KO-HEK293T-Cyto-BS_star_realigned"]),
]

for expected_msg, folder_list in rep_test_cases:
    ## Collect every Rep<N> label, de-duplicate, then order by the number N
    rep_matches = re.findall(r"Rep\d+", str(folder_list))
    rep_list = sorted(set(rep_matches), key = lambda rep: int(rep[3:]))

    print("NOW TESTING REP_LIST")
    print(expected_msg, rep_list)

NOW TESTING REP_LIST
Expected: ['Rep1', 'Rep2', 'Rep3'], Actual:  ['Rep1', 'Rep2', 'Rep3']
NOW TESTING REP_LIST
Expected: ['Rep1', 'Rep5', 'Rep10'], Actual:  ['Rep1', 'Rep5', 'Rep10']


In [8]:
## Map each per-sample column name to its replicate/sample-specific name
base_keys = ["A", "C", "G", "T", "Deletions", "DeletionRate", "RealRate"]
renamed_keys = [make_key(subfolder, name) for name in base_keys]
key = dict(zip(base_keys, renamed_keys))

print("NOW TESTING KEY VARIABLE")
key

NOW TESTING KEY VARIABLE


{'A': 'Rep1_A_BS',
 'C': 'Rep1_C_BS',
 'G': 'Rep1_G_BS',
 'T': 'Rep1_T_BS',
 'Deletions': 'Rep1_Deletions_BS',
 'DeletionRate': 'Rep1_DeletionRate_BS',
 'RealRate': 'Rep1_RealRate_BS'}

In [9]:
## Dynamically rename columns in df_clean:
## key already maps every old column name to its new name
df_final = df_clean.rename(columns = key)

df_final.head(n = 5)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,fit_c,fit_s,fit_b,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep1_DeletionRate_BS,Rep1_RealRate_BS
0,rna-NM_021254.4,UGUAA,3UTR,NC_000021.9,-,1102,1107,1104,32601765,3516,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
1,rna-NM_001350335.2,UGUAA,3UTR,NC_000021.9,-,2193,2198,2195,32601765,2294,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
2,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
3,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
4,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417


### Testing get_sample_group()

In [10]:
## Function definition
def get_sample_group(folder_name):
    """
    PURPOSE:
    Given input folder names, extract the group name
    by returning the first capture group in RegEx.
    ---
    ARGS:
    * folder_name: Folder name (str or Path-like; converted with str())
      that starts with '<group>-BS_processed_fastqs' or
      '<group>-NBS_processed_fastqs'
    ---
    RETURNS:
    The '<group>' part of the folder name
    ---
    RAISES:
    * ValueError: if folder_name does not follow the pattern above
      (previously this surfaced as an AttributeError on None)
    ---
    EXAMPLE: 
    '7KO-Cyto-BS_processed_fastqs' -> '7KO-Cyto'
    """
    name = str(folder_name)
    match = re.match(r"(.+)-(?:BS|NBS)_processed_fastqs", name)
    if match is None:
        ## re.match returns None on failure; fail with a message naming the input
        raise ValueError(
            f"Folder name does not match '<group>-(BS|NBS)_processed_fastqs': {name!r}")
    return match.group(1)

In [11]:
print("FOLDER NAME:", folder_name)

## (expected group, folder name) pairs covering BS and NBS for both groups
group_checks = [("7KO-Cyto", folder_name),
                ("7KO-Cyto", "7KO-Cyto-NBS_processed_fastqs"),
                ("WT-Nuc", "WT-Nuc-BS_processed_fastqs"),
                ("WT-Nuc", "WT-Nuc-NBS_processed_fastqs")]

for expected_group, checked_name in group_checks:
    print(f"Expected: {expected_group}, Actual:", get_sample_group(checked_name))

FOLDER NAME: 7KO-Cyto-BS_processed_fastqs
Expected: 7KO-Cyto, Actual: 7KO-Cyto
Expected: 7KO-Cyto, Actual: 7KO-Cyto
Expected: WT-Nuc, Actual: WT-Nuc
Expected: WT-Nuc, Actual: WT-Nuc


### Testing filtering & cutoffs

In [12]:
## Look up the renamed DeletionRate / RealRate column names (accessed via key);
## the DeletionRate cutoffs listed below depend on keywords in folder_name
"""
WT:
* BS files must have DeletionRate values of >= 0.8
* NBS files must have DeletionRate values of <= 0.1
---
Mutation (PUS7KO):
* BS files must have DeletionRate values of <= 0.1
"""
dr_pattern, rr_pattern = key["DeletionRate"], key["RealRate"]

print("Expected: Rep1_DeletionRate_BS, Actual:", dr_pattern)
print("Expected: Rep1_RealRate_BS, Actual:", rr_pattern)

Expected: Rep1_DeletionRate_BS, Actual: Rep1_DeletionRate_BS
Expected: Rep1_RealRate_BS, Actual: Rep1_RealRate_BS


In [13]:
## Testing RealRate filtering: split df_final at the 0.3 RealRate cutoff
rr_values = df_final[rr_pattern]
kept_rr = df_final.loc[rr_values.ge(0.3)]
dropped_rr = df_final.loc[rr_values.lt(0.3)]

n_original = len(df_clean)
n_kept = len(kept_rr)
n_dropped = len(dropped_rr)

print(f"Size of original df: {n_original}",
      f"Size of kept_rr: {n_kept}", sep = "\n")

print(f"\nSize of original df: {n_original}",
      f"Size of dropped_rr: {n_dropped}", sep = "\n")

## Kept and dropped rows should add up to the original row count
print(f"\nExpected size of dropped_rr: {n_original - n_kept}",
      f"Actual: {n_dropped}", sep = "\n")

Size of original df: 79249
Size of kept_rr: 70129

Size of original df: 79249
Size of dropped_rr: 9120

Expected size of dropped_rr: 9120
Actual: 9120


In [14]:
## Testing coverage filtering
## Minimum summed count (A + C + G + T + Deletions) a row needs to be kept
MIN_TOTAL_COVERAGE = 20

## Select the count columns only: the base name must be a whole
## underscore-delimited token followed by "_" (e.g. "Rep1_A_BS", "Rep1_Deletions_BS"),
## so unrelated columns that merely contain "A_", "C_", "G_" or "T_" are not summed
coverage_list = [col for col in kept_rr.columns
                 if re.search(r"(?:^|_)(?:A|C|G|T|Deletions)_", col)]

## .assign() returns a new frame instead of writing a column onto a filtered
## slice, so this cell does not rely on the chained-assignment warning being disabled
kept_rr = kept_rr.assign(TotalCoverage = kept_rr[coverage_list].sum(axis = 1))
kept_cov = kept_rr[kept_rr["TotalCoverage"].ge(MIN_TOTAL_COVERAGE)]

print(f"Size of kept_rr: {len(kept_rr)}",
      f"Size of kept_cov: {len(kept_cov)}", sep = "\n")

Size of kept_rr: 70129
Size of kept_cov: 63439


In [15]:
## Testing DeletionRate filtering
## Cutoffs: WT + BS keeps DeletionRate >= DR_MIN_CUTOFF;
## WT + NBS and 7KO + BS keep DeletionRate <= DR_MAX_CUTOFF
DR_MIN_CUTOFF = 0.8
DR_MAX_CUTOFF = 0.1

group_name = str(folder_name)
is_bs = "_BS" in dr_pattern

if group_name.startswith(("WT", "7KO")):
    ## Start from the coverage-filtered frame so that a combination without a
    ## DeletionRate rule (7KO + NBS) still carries the RealRate and coverage
    ## filters instead of silently reusing the stale, unfiltered df_final
    df_final = kept_cov

    if group_name.startswith("WT"):
        if is_bs:
            df_final = kept_cov[kept_cov[dr_pattern].ge(DR_MIN_CUTOFF)]
        else:
            df_final = kept_cov[kept_cov[dr_pattern].le(DR_MAX_CUTOFF)]
    elif is_bs:
        ## THIS LINE SHOULD EXECUTE
        ## folder_name = 7KO-Cyto-BS_processed_fastqs
        df_final = kept_cov[kept_cov[dr_pattern].le(DR_MAX_CUTOFF)]

    ## Highest DeletionRate first
    df_final = df_final.sort_values(by = dr_pattern, ascending = False)

print(f"Size of original df: {len(df_clean)}",
      f"Size of kept_rr: {len(kept_rr)}",
      f"Size of kept_cov: {len(kept_cov)}",
      f"Size of final filtered df: {len(df_final)}", sep = "\n")

## Summarize how much of the NaN-free table was removed by all filters combined
rows_removed = (len(df_clean) - len(df_final))
percent_change = round((rows_removed/len(df_clean)) * 100)
print(f"\nTotal rows removed: {rows_removed}",
      f"The dataset shrunk by approximately {percent_change}% after filtering",
      sep = "\n")

Size of original df: 79249
Size of kept_rr: 70129
Size of kept_cov: 63439
Size of final filtered df: 54052

Total rows removed: 25197
The dataset shrunk by approximately 32% after filtering


### Investigating results of df_final

In [16]:
## Preview the first ten rows of the filtered frame
df_final.head(n = 10)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,fit_s,fit_b,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep1_DeletionRate_BS,Rep1_RealRate_BS,TotalCoverage
16319,rna-XM_005245453.2,UUUAG,3UTR,NC_000001.11,-,3857,3862,3859,205715358,6396,...,0.901845,0.001,249.0,0.0,0.0,0.0,24.0,0.087912,0.506265,273.0
16318,rna-XM_005245453.2,UUUAG,3UTR,NC_000001.11,-,3857,3862,3859,205715358,6396,...,0.901845,0.001,249.0,0.0,0.0,0.0,24.0,0.087912,0.506265,273.0
16320,rna-NM_022731.5,UUUAG,3UTR,NC_000001.11,-,3860,3865,3862,205715358,6399,...,0.901845,0.001,249.0,0.0,0.0,0.0,24.0,0.087912,0.506265,273.0
16321,rna-NM_022731.5,UUUAG,3UTR,NC_000001.11,-,3860,3865,3862,205715358,6399,...,0.901845,0.001,249.0,0.0,0.0,0.0,24.0,0.087912,0.506265,273.0
15814,rna-NM_012458.4,UAUAG,3UTR,NC_000019.10,-,1250,1255,1252,2426036,1664,...,0.966038,0.001,8.0,15.0,0.0,0.0,2.0,0.08,0.735916,25.0
16330,rna-NM_001244926.2,UCUAA,3UTR,NC_000009.12,+,1688,1693,1690,113291699,2897,...,0.890419,0.001,0.0,0.0,0.0,25.0,2.0,0.074074,0.426971,27.0
16331,rna-NM_001244926.2,UCUAA,3UTR,NC_000009.12,+,1688,1693,1690,113291699,2897,...,0.890419,0.001,0.0,0.0,0.0,25.0,2.0,0.074074,0.426971,27.0
16338,rna-NM_004697.5,UCUAA,3UTR,NC_000009.12,+,1691,1696,1693,113291699,2900,...,0.890419,0.001,0.0,0.0,0.0,25.0,2.0,0.074074,0.426971,27.0
16337,rna-NM_004697.5,UCUAA,3UTR,NC_000009.12,+,1691,1696,1693,113291699,2900,...,0.890419,0.001,0.0,0.0,0.0,25.0,2.0,0.074074,0.426971,27.0
16336,rna-NM_004697.5,UCUAA,3UTR,NC_000009.12,+,1691,1696,1693,113291699,2900,...,0.890419,0.001,0.0,0.0,0.0,25.0,2.0,0.074074,0.426971,27.0


In [31]:
## Compare the motifs present before and after filtering
original_motifs = df_clean["Motif"].unique()
final_motifs = df_final["Motif"].unique()
## sorted() gives a deterministic order; a bare set has arbitrary ordering
set_diff = sorted(set(original_motifs) - set(final_motifs))

## Report zero, one or several missing motifs instead of assuming exactly one
if not set_diff:
    print("No UNUAR motif is missing in the final dataframe.")
elif len(set_diff) == 1:
    print(f"The only UNUAR motif missing in the final dataframe is {set_diff[0]}.")
else:
    print(f"The UNUAR motifs missing in the final dataframe are {', '.join(set_diff)}.")

The only UNUAR motif missing in the final dataframe is UGUAA.


In [18]:
## For priority_filtered
# df_final = df.drop(columns = constant_cols)

In [None]:
## TODO: Find counts for each UNUAR motif
# df["Motif"].value_counts()

array(['UGUAA', 'UUUAA', 'UUUAG', 'UAUAG', 'UAUAA', 'UCUAG', 'UCUAA',
       'UGUAG'], dtype=object)