In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import altair as alt

In [None]:
# Directory (relative path) that read_counts() scans for '*tsv' count tables.
counts_path = '../Data/ATG_lib_data/counts'

In [None]:
def read_counts(path):
    """Load the tab-separated count tables found directly inside ``path``.

    Each file whose name ends in ``tsv`` is matched against a fixed, ordered
    list of filename tags; the first tag contained in the file name decides
    the label under which the table is stored. Files matching no tag are
    skipped without being parsed.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory to scan (non-recursive).

    Returns
    -------
    dict[str, pandas.DataFrame]
        Mapping of sample label ('lib', 'NC', 'D05_rep1', ...) to the parsed
        table. If several files match the same label, the one that sorts
        last by path wins.
    """
    # Use the argument rather than the notebook-level global so the function
    # honours whatever directory the caller passes in.
    data_dir = Path(path)

    # Ordered (tag, label) pairs: the first tag found in the file name wins,
    # which keeps the precedence of the original if/elif chain.
    tag_to_label = (
        ('lib_counts', 'lib'),
        ('NC', 'NC'),
        ('R1R2R3_D05', 'D05_rep1'),
        ('R4R5R6_D05', 'D05_rep2'),
        ('R7R8R9_D05', 'D05_rep3'),
        ('R3_D13', 'D13_rep1'),
        ('R6_D13', 'D13_rep2'),
        ('R9_D13', 'D13_rep3'),
    )

    counts_dict = {}

    # glob() yields files in arbitrary order; sorting makes the result
    # reproducible across runs and platforms.
    for tsv_path in sorted(data_dir.glob('*tsv')):
        # Match on the file name only: every globbed file shares the same
        # parent directory, so a tag occurring in the directory part would
        # otherwise mislabel all files.
        file_name = tsv_path.name
        label = next((lbl for tag, lbl in tag_to_label if tag in file_name), None)
        if label is None:
            continue

        counts_dict[label] = pd.read_csv(tsv_path, sep='\t')

    return counts_dict

In [None]:
def add_freq(dict):
    """Attach a per-table relative frequency column to every non-'NC' table.

    For each entry whose key is not 'NC', ``freq`` is computed as
    ``Count / Count.sum()`` within that table.

    Parameters
    ----------
    dict : dict[str, pandas.DataFrame]
        Tables keyed by sample label; each table needs a 'Count' column.
        (The parameter name shadows the builtin and is kept only so existing
        callers keep working.)

    Returns
    -------
    tuple[dict[str, pandas.DataFrame], list[str]]
        New tables carrying the extra 'freq' column, and the list of their
        keys in insertion order. The 'NC' entry is left out.

    Notes
    -----
    The input tables are not modified; ``DataFrame.assign`` returns a new
    frame, so calling this function repeatedly on the same input is safe.
    A table whose counts sum to zero yields NaN/inf frequencies.
    """
    freq_dicts = {}
    for key, df in dict.items():
        if key == 'NC':
            continue

        total_count = df['Count'].sum()
        # assign() builds a new frame instead of writing into the caller's.
        freq_dicts[key] = df.assign(freq=df['Count'] / total_count)

    return freq_dicts, list(freq_dicts)

In [None]:
def mutate_snvs(dna_sequence):
    """Return every single-position substitution of ``dna_sequence``.

    Positions are visited left to right and three variants are emitted per
    position, so the result has ``3 * len(dna_sequence)`` strings.

    Parameters
    ----------
    dna_sequence : str
        Sequence to mutate.

    Returns
    -------
    list[str]
        Variant sequences, grouped by position.
    """
    # Replacement bases per reference base, in emission order. Any character
    # other than A/T/C (G included) falls back to "ATC".
    replacements = {"A": "TCG", "T": "ACG", "C": "ATG"}

    return [
        dna_sequence[:pos] + alt + dna_sequence[pos + 1:]
        for pos, ref in enumerate(dna_sequence)
        for alt in replacements.get(ref, "ATC")
    ]

In [None]:
def reverse_complement_string(seq_string):
    """Return the reverse complement of ``seq_string`` as a string.

    The input is read back to front and each character is swapped:
    A -> T, G -> C, C -> G. Every other character (T included) becomes A.

    Parameters
    ----------
    seq_string : str
        Sequence to reverse-complement.

    Returns
    -------
    str
        Reverse-complemented sequence of the same length.
    """
    pairing = {"A": "T", "G": "C", "C": "G"}
    # The "A" default covers T as well as any unexpected character.
    return "".join(pairing.get(base, "A") for base in reversed(seq_string))

In [None]:
def compare_strings(str1, str2, coord_offset):
    """Locate the first position at which ``str2`` differs from ``str1``.

    Parameters
    ----------
    str1 : str
        Reference string; its length bounds the comparison.
    str2 : str
        String compared against ``str1``. Characters beyond ``len(str1)``
        are ignored.
    coord_offset : int
        Value added to the 0-based index of the mismatch.

    Returns
    -------
    str or None
        ``"<index + coord_offset>:<character of str2 at that index>"`` for
        the first mismatch, or ``None`` when no position differs.

    Raises
    ------
    IndexError
        If ``str2`` is shorter than ``str1`` and no mismatch occurs within
        the shared prefix.
    """
    # Strings are indexable, so no per-character list copies are needed.
    for i, ref_char in enumerate(str1):
        alt_char = str2[i]
        if ref_char != alt_char:
            return str(i + coord_offset) + ':' + alt_char

    # Made explicit: callers get None when the strings agree.
    return None

In [None]:
def annotate_snv_lib(seq='CCATGGA', coord_offset=214809490):
    """Map every single-base variant of ``seq`` to a position identifier.

    Each variant produced by ``mutate_snvs(seq)`` is reverse-complemented
    and compared with the reverse complement of ``seq``; the first differing
    position (plus ``coord_offset``) and the differing base form the
    identifier returned by ``compare_strings``.

    Parameters
    ----------
    seq : str, optional
        Sequence whose single-base variants are annotated.
    coord_offset : int, optional
        Offset added to the 0-based mismatch index on the
        reverse-complemented sequence. NOTE(review): presumably a genomic
        coordinate -- confirm.

    Returns
    -------
    dict[str, str]
        Variant sequence -> ``"<position>:<base>"`` identifier.
    """
    # Derive the reverse strand from seq instead of keeping a second literal
    # in sync by hand; for the default this yields 'TCCATGG'.
    rev_seq = reverse_complement_string(seq)

    mapped_snvs = {}
    for snv in mutate_snvs(seq):
        rev_snv = reverse_complement_string(snv)
        mapped_snvs[snv] = compare_strings(rev_seq, rev_snv, coord_offset)

    # Kept so the notebook still shows the mapping when this runs.
    print(mapped_snvs)

    return mapped_snvs
    

In [None]:
def annotate_vars(dict, keys, snv_map):
    """Add sequence-window and position-id columns to the selected tables.

    For every key in ``keys`` the table ``dict[key]`` gains four columns,
    written in place:

    * ``canonical_start`` -- characters 15..17 (0-based) of 'Sequence'
    * ``second_start``    -- characters 90..92 of 'Sequence'
    * ``snv_lib``         -- characters 88..94 of 'Sequence'
    * ``pos_id``          -- ``snv_map`` lookup of ``snv_lib``

    Parameters
    ----------
    dict : dict[str, pandas.DataFrame]
        Tables keyed by sample label; each needs a string 'Sequence' column.
        (The parameter name shadows the builtin and is kept only so existing
        callers keep working.)
    keys : iterable of str
        Keys of the tables to annotate.
    snv_map : dict[str, str]
        Lookup from the 7-character ``snv_lib`` window to a position id.

    Returns
    -------
    dict[str, pandas.DataFrame]
        The same mapping that was passed in, now annotated.
    """
    for key in keys:
        df = dict[key]
        sequences = df['Sequence']

        # Vectorised string slicing instead of a per-row lambda.
        df['canonical_start'] = sequences.str[15:18]
        df['second_start'] = sequences.str[90:93]
        df['snv_lib'] = sequences.str[88:95]

        # Series.map leaves NaN for windows missing from snv_map (e.g. the
        # unmutated sequence) instead of raising KeyError on the first one.
        df['pos_id'] = df['snv_lib'].map(snv_map)
        print(df)

    return dict

        
        

In [None]:
def main():
    """Run the notebook pipeline end to end.

    Loads the count tables from ``counts_path``, adds frequency columns,
    builds the SNV position map and annotates the tables with it.
    """
    raw_counts = read_counts(counts_path)
    with_freq, sample_keys = add_freq(raw_counts)
    snv_positions = annotate_snv_lib()
    annotate_vars(with_freq, sample_keys, snv_positions)


In [None]:
# Execute the full pipeline defined in main().
main()