In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import re

In [None]:
#Directory holding the count TSV files for the ATG library variants
counts_path = "../Data/ATG_lib_data/counts/for_rna"

#Excel workbook holding the annotations for the X1A variants
x1a_annotations = "../Data/ATG_lib_data/X1A_annotations.xlsx"

In [None]:
def read_counts(path): #Reads all counts
    """Read the recognised count TSV files found directly inside ``path``.

    Files are classified by substrings of their *file name* (not of the
    whole path, so directory names cannot cause a misclassification):

    * names containing ``RNA`` are RNA counts and are keyed
      ``RNA_D05_rep1`` / ``RNA_D05_rep2`` / ``RNA_D05_rep3``;
    * every other file is a DNA count file and is keyed ``lib``, ``NC``,
      ``D05_rep1..3`` or ``D13_rep1..3``.

    Files whose name matches none of the markers are ignored.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory containing the ``*tsv`` count files.

    Returns
    -------
    counts_dict : dict[str, pandas.DataFrame]
        One dataframe per recognised file.
    keys : list[str]
        Keys of ``counts_dict`` in insertion order.
    """
    #Use the argument that was passed in (the global counts_path used to be read instead)
    data_path = Path(path)

    #(marker, key) pairs, checked in order; the first marker found in the file name wins
    rna_markers = (
        ('RNA_R1R2R3', 'RNA_D05_rep1'),
        ('RNA_R4R5R6', 'RNA_D05_rep2'),
        ('RNA_R7R8R9', 'RNA_D05_rep3'),
    )
    dna_markers = (
        ('lib_counts', 'lib'),
        ('NC', 'NC'),
        ('R1R2R3_D05', 'D05_rep1'),
        ('R4R5R6_D05', 'D05_rep2'),
        ('R7R8R9_D05', 'D05_rep3'),
        ('R3_D13', 'D13_rep1'),
        ('R6_D13', 'D13_rep2'),
        ('R9_D13', 'D13_rep3'),
    )

    counts_dict = {} #Collects one dataframe per recognised file

    #Sorted so that the key order does not depend on filesystem listing order
    for tsv_path in sorted(data_path.glob('*tsv')):
        file_name = tsv_path.name
        markers = rna_markers if 'RNA' in file_name else dna_markers

        key = next((label for marker, label in markers if marker in file_name), None)
        if key is None:
            continue #Unrecognised file, nothing to store

        counts_dict[key] = pd.read_csv(tsv_path, sep = '\t')

    keys = list(counts_dict.keys())

    return counts_dict, keys

In [None]:
def add_freq(dict): #Adds DNA/RNA frequency columns
    """Return new dataframes with a ``freq`` column added.

    ``freq`` is each row's ``Count`` divided by the sum of the ``Count``
    column of the same dataframe. The ``'NC'`` entry is skipped and is
    therefore absent from the result.

    The input dataframes are not modified; each output dataframe is a new
    object carrying the extra column.

    Parameters
    ----------
    dict : dict[str, pandas.DataFrame]
        Dataframes keyed by sample name; each needs a ``Count`` column.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Same keys as the input minus ``'NC'``.
    """
    freq_dicts = {} #New dictionary to hold dataframes with frequency columns

    for key, counts in dict.items():
        if key == 'NC': #NC skipped
            continue

        total_count = counts['Count'].sum() #Sum of all counts in this sample
        #assign() builds a new dataframe, so the caller's dataframe is left untouched
        freq_dicts[key] = counts.assign(freq = counts['Count'] / total_count)

    return freq_dicts

In [None]:
def mutate_snvs(dna_sequence): #Mutates all possible SNVs of provided DNA sequence
    """Return every single-base substitution of ``dna_sequence``.

    Positions are visited left to right and each contributes three
    variants. A, T and C are replaced by the three other bases; any other
    character (G included) is replaced by A, T and C.
    """
    #Replacement bases per reference base, in the order they are emitted
    replacements = {"A": "TCG", "T": "ACG", "C": "ATG"}
    fallback = "ATC" #Used for G and for any character that is not A/T/C

    return [
        dna_sequence[:pos] + alt + dna_sequence[pos + 1 :]
        for pos, base in enumerate(dna_sequence)
        for alt in replacements.get(base, fallback)
    ]

In [None]:
def compare_strings(str1, str2, coord_offset): #Special function to compare strings and return position with difference
    """Locate the first position at which ``str2`` differs from ``str1``.

    Parameters
    ----------
    str1 : str
        Reference string.
    str2 : str
        String compared against ``str1``.
    coord_offset : int
        Value added to the 0-based index of the mismatch (used by the
        caller to turn the index into a coordinate).

    Returns
    -------
    str or None
        ``'<index + coord_offset>:<character of str2 at the mismatch>'`` for
        the first differing position, or ``None`` when no difference is
        found. Only the positions present in both strings are compared, so
        strings of unequal length no longer raise ``IndexError``.
    """
    for index, (ref_char, alt_char) in enumerate(zip(str1, str2)):
        if ref_char != alt_char:
            #First mismatch: report shifted position and the character found in str2
            return str(index + coord_offset) + ':' + alt_char

    return None

In [None]:
def reverse_complement_string(seq_string): #Reverse complement and returns string
    """Return the reverse complement of ``seq_string`` as a string.

    A, G and C are complemented to T, C and G; every other character
    (T included) becomes A.
    """
    complement = {"A": "T", "G": "C", "C": "G"}
    #Walk the sequence backwards, falling back to "A" for anything not in the table
    return "".join(complement.get(base, "A") for base in reversed(seq_string))

In [None]:
def annotate_snv_lib(): #Annotates all SNVs in SNV lib with position IDs
    """Map every SNV of the 7 bp library sequence to its position ID.

    Each SNV of ``'CCATGGA'`` is reverse complemented and compared with
    ``'TCCATGG'``; the position ID is whatever ``compare_strings`` returns
    for that pair with an offset of 214809490.

    Returns
    -------
    dict[str, str]
        SNV sequence (as generated, not reverse complemented) -> position ID.
    """
    wt_seq = 'CCATGGA' #original WT 7bp library around Met26
    wt_rev_seq = 'TCCATGG' #Sequence the reverse-complemented SNVs are compared against
    first_coord = 214809490 #Offset added to the mismatch index by compare_strings

    return {
        snv: compare_strings(wt_rev_seq, reverse_complement_string(snv), first_coord)
        for snv in mutate_snvs(wt_seq)
    }

In [None]:
def annotate_vars(dict,keys, snv_map, annotations): #Annotates variants with Consequences and AA substitution
    """Annotate each counts dataframe with start codons, position ID and consequence.

    For every key, the ``Sequence`` column is sliced into the canonical
    start codon (``[15:18]``), the second start codon (``[90:93]``) and the
    SNV library window (``[88:95]``). The window is translated to a
    ``pos_id`` through ``snv_map`` and the annotation table is left-merged
    on ``pos_id``. ``Count`` and ``freq`` are renamed to ``<key>_count`` and
    ``<key>_freq``.

    The input dataframes are not modified.

    Parameters
    ----------
    dict : dict[str, pandas.DataFrame]
        Dataframes with ``Sequence``, ``Count`` and ``freq`` columns.
    keys : iterable of str
        Keys of ``dict`` to annotate.
    snv_map : dict[str, str]
        SNV library sequence -> position ID.
    annotations : str, path-like or pandas.DataFrame
        Path of the Excel annotation file, or the already-loaded table. It
        must provide ``pos_id``, ``pos``, ``allele``, ``AAsub`` and
        ``Consequence`` columns.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Annotated dataframe per key.

    Raises
    ------
    KeyError
        If a key is missing from ``dict`` or an SNV library window is not a
        key of ``snv_map``.
    """
    #Accept an already-loaded table as well as a path to the Excel file
    if isinstance(annotations, pd.DataFrame):
        annotation_df = annotations
    else:
        annotation_df = pd.read_excel(annotations) #Reads annotations file

    column_order = ['pos', 'allele', 'pos_id', 'Consequence', 'AAsub',
                    'canonical_start', 'second_start', 'Count', 'start_pos_id', 'freq']

    annotated_dfs = {} #Dictionary to hold annotated data frames

    for key in keys: #Annotates all dataframes
        counts = dict[key] #Gets one dataframe
        sequences = counts['Sequence']

        canonical_start = sequences.str[15:18] #Canonical start codon
        snv_lib = sequences.str[88:95] #SNV library window

        #Fail loudly, naming the sample, when a window has no position ID
        unmapped = [seq for seq in snv_lib.unique() if seq not in snv_map]
        if unmapped:
            raise KeyError(f"{key}: SNV library sequences missing from snv_map: {unmapped[:5]}")

        pos_id = snv_lib.map(snv_map)

        #assign() returns a new dataframe, so the caller's dataframe keeps its original columns
        annotated = counts.assign(
            canonical_start = canonical_start,
            second_start = sequences.str[90:93], #Second start codon
            snv_lib = snv_lib,
            pos_id = pos_id,
            start_pos_id = canonical_start + ':' + pos_id, #Canonical start codon combined with pos_id
        )
        annotated = pd.merge(annotated, annotation_df, how = 'left', on = 'pos_id')

        #Keeps the important columns in their final order and gives counts/frequency replicate-specific names
        annotated = annotated[column_order].rename(
            columns = {'Count': key + '_count', 'freq': key + '_freq'}
        )

        annotated_dfs[key] = annotated #Annotated df added to dictionary

    return annotated_dfs

In [None]:
def save_annotated(dfs,keys, out_root = '../Data/ATG_lib_data/counts/for_rna'):
    """Write each annotated dataframe to its tab-separated counts file.

    Keys containing ``'RNA'`` are written under ``<out_root>/RNA_counts``,
    all others under ``<out_root>/DNA_counts``. Before writing, the
    ``<key>_count`` column is renamed to ``count``; the index is not
    written. Output directories are created when missing.

    Parameters
    ----------
    dfs : dict[str, pandas.DataFrame]
        Annotated dataframes keyed by sample name.
    keys : iterable of str
        Keys to save; each must have an entry in the file-name table below.
    out_root : str or path-like, optional
        Directory that contains the ``RNA_counts`` / ``DNA_counts`` folders.
        Defaults to the location that was previously hard-coded.

    Raises
    ------
    KeyError
        If a key has no output file name or is missing from ``dfs``.
    """
    file_names = {'RNA_D05_rep1': 'BARD1_X1A_RNA_R1R2R3_ATG_SGE049_counts.tsv',
                  'RNA_D05_rep2': 'BARD1_X1A_RNA_R4R5R6_ATG_SGE049_counts.tsv',
                  'RNA_D05_rep3': 'BARD1_X1A_RNA_R7R8R9_ATG_SGE049_counts.tsv',
                  'D05_rep1': 'BARD1_X1A_ATG_SGE049_R1R2R3_D05_counts.tsv',
                  'D05_rep2': 'BARD1_X1A_ATG_SGE049_R4R5R6_D05_counts.tsv',
                  'D05_rep3': 'BARD1_X1A_ATG_SGE049_R7R8R9_D05_counts.tsv'
                 }

    out_root = Path(out_root)

    for elem in keys:
        file_name = file_names[elem] #Looked up first so an unknown key fails before anything is created
        df = dfs[elem]

        #RNA and DNA counts differ only in the destination folder
        out_dir = out_root / ('RNA_counts' if 'RNA' in elem else 'DNA_counts')
        out_dir.mkdir(parents = True, exist_ok = True)

        df = df.rename(columns = {elem + '_count': 'count'})
        df.to_csv(out_dir / file_name, sep = '\t', index = False)


In [None]:
def main():
    """Run the pipeline: read counts, add frequencies, annotate, save.

    The key list used downstream is taken from the output of ``add_freq``
    rather than from ``read_counts``: ``add_freq`` drops the ``'NC'`` entry,
    so reusing the original key list would make ``annotate_vars`` look up a
    key that no longer exists whenever an NC file is present.
    """
    counts_dict, _ = read_counts(counts_path)
    counts_dict = add_freq(counts_dict)
    keys = list(counts_dict.keys()) #Only the samples that still exist after add_freq
    mapped_snvs = annotate_snv_lib()
    annotated = annotate_vars(counts_dict, keys, mapped_snvs, x1a_annotations)
    save_annotated(annotated, keys)

In [None]:
main() #Runs the whole pipeline: reads counts, adds frequencies, annotates variants and saves the annotated TSVs