In [2]:
import pandas as pd
import numpy as np
import re

### Loading in data

In [3]:
## Load in file
df = pd.read_csv("C:/Users/Sonia Ling/Desktop/calculate_dr/KEH-Rep2-7KO-HEK293T-Cyto-BS.sorted.tsv", sep = "\t")

df.head(5)

Unnamed: 0,TranscriptID,Motif,Region,Chrom,Strand,TranscriptPosStart,TranscriptPosEnd,TranscriptModBase,GenomicModBase,TranscriptLength,...,fit_c,fit_s,fit_b,A,C,G,T,Deletions,DeletionRate,RealRate
0,rna-NM_021254.4,UGUAA,3UTR,NC_000021.9,-,1102,1107,1104,32601765,3516,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
1,rna-NM_001350335.2,UGUAA,3UTR,NC_000021.9,-,2193,2198,2195,32601765,2294,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
2,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
3,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
4,rna-NM_001350337.2,UGUAA,3UTR,NC_000021.9,-,1400,1405,1402,32601765,1501,...,0.944807,0.779573,0.001,0.0,0.0,0.0,0.0,2.0,1.0,1.058417


In [4]:
## Columns that are always the same across all TSVs
constant_cols = df.columns.tolist()[0:17]
constant_cols

['TranscriptID',
 'Motif',
 'Region',
 'Chrom',
 'Strand',
 'TranscriptPosStart',
 'TranscriptPosEnd',
 'TranscriptModBase',
 'GenomicModBase',
 'TranscriptLength',
 'DistFromAUG',
 'DistFromSTOP',
 'DistFromExonStart',
 'DistFromExonEnd',
 'fit_c',
 'fit_s',
 'fit_b']

In [None]:
df_clean = df.drop(columns = constant_cols)

In [14]:
subfolder = "KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned"
folder_name = "7KO-Cyto-BS_processed_fastqs"
print(subfolder, folder_name, sep = "\n")

KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned
7KO-Cyto-BS_processed_fastqs


### Testing make_key()

In [7]:
## Function definition
def make_key(subfolder, base_key):
    """
    PURPOSE:
    Modifies names of dictionary keys based on the Rep # (detected via RegEx)
    and Sample Type (BS, NBS) in a given subfolder name.
    ---
    NOTES:
    * sorted(set(rep_matches)): Removes duplicate reps, sorts in ascending order
    * for rep in rep_list: Adds replicate prefix to dict key names
    * for sample in ['BS', 'NBS']: Adds sample type suffix to dict key names
    """
    rep_matches = re.findall(r"Rep\d+", str(subfolder))
    rep_list = sorted(set(rep_matches))
    
    for rep in rep_list:
        if f"-{rep}-" in str(subfolder):
            prefix = rep + "_"
            break
    
    for sample in ["BS", "NBS"]:
        if f"-{sample}_" in str(subfolder):
            suffix = "_" + sample
            break
    
    return prefix + base_key + suffix

In [15]:
## Showcasing how rep_list is supposed to behave
## Reps should be listed in integer order
folder_list = ["KEH-Rep3-7KO-HEK293T-Cyto-BS_star_realigned",
               "KEH-Rep1-7KO-HEK293T-Cyto-BS_star_realigned",
               "KEH-Rep2-7KO-HEK293T-Cyto-BS_star_realigned"]
rep_matches = re.findall(r"Rep\d+", str(folder_list))
rep_list = sorted(set(rep_matches))

print("NOW TESTING REP_LIST")
print("Expected: ['Rep1', 'Rep2', 'Rep3'], Actual: ", rep_list)

NOW TESTING REP_LIST
Expected: ['Rep1', 'Rep2', 'Rep3'], Actual:  ['Rep1', 'Rep2', 'Rep3']


In [16]:
key = {base_key: make_key(subfolder, base_key) for base_key 
       in ["A", "C", "G", "T", "Deletions", "DeletionRate", "RealRate"]}

print("NOW TESTING KEY VARIABLE")
key

NOW TESTING KEY VARIABLE


{'A': 'Rep1_A_BS',
 'C': 'Rep1_C_BS',
 'G': 'Rep1_G_BS',
 'T': 'Rep1_T_BS',
 'Deletions': 'Rep1_Deletions_BS',
 'DeletionRate': 'Rep1_DeletionRate_BS',
 'RealRate': 'Rep1_RealRate_BS'}

In [22]:
## Dynamically rename columns in df_clean
df_final = df_clean.rename(columns = {"A": key["A"], 
                                      "C": key["C"], 
                                      "G": key["G"], 
                                      "T": key["T"], 
                                      "Deletions": key["Deletions"], 
                                      "DeletionRate": key["DeletionRate"], 
                                      "RealRate": key["RealRate"]})

df_final.head(5)

Unnamed: 0,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,Rep1_Deletions_BS,Rep1_DeletionRate_BS,Rep1_RealRate_BS
0,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
1,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
2,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
3,0.0,0.0,0.0,0.0,2.0,1.0,1.058417
4,0.0,0.0,0.0,0.0,2.0,1.0,1.058417


### Testing DeletionRate filtering

In [40]:
## Filter by DeletionRate based on keywords in 
## folder_name and DeletionRate column (accessed via key)
"""
WT:
* BS files must have DeletionRate values of >= 0.8
* NBS files must have DeletionRate values of <= 0.1
---
Mutation (PUS7KO):
* BS files must have DeletionRate values of <= 0.1
"""
dr_pattern = key["DeletionRate"]
print("Expected: Rep1_DeletionRate_BS, Actual:", dr_pattern)

if re.match(fr"(WT|7KO).*", str(folder_name)):
    if re.match(fr"WT.*", str(folder_name)):
        if "_BS" in dr_pattern:
            df_final = df_final[df_final[dr_pattern].ge(0.8)]
        else: 
            df_final = df_final[df_final[dr_pattern].le(0.1)]

    if re.match(fr"7KO.*", str(folder_name)):
        if "_BS" in dr_pattern:
            df_final = df_final[df_final[dr_pattern].le(0.1)]

print(f"Size of original df: {df_clean.shape[0]}",
      f"Size of filtered df: {df_final.shape[0]}", sep = "\n")

Expected: Rep1_DeletionRate_BS, Actual: Rep1_DeletionRate_BS
Size of original df: 79249
Size of filtered df: 63189


### Grouping outputs by UNUAR motif

In [46]:
## Find counts for each UNUAR motif
df["Motif"].unique()
# df["Motif"].groupby()

array(['UGUAA', 'UUUAA', 'UUUAG', 'UAUAG', 'UAUAA', 'UCUAG', 'UCUAA',
       'UGUAG'], dtype=object)