In [None]:
import pandas as pd
import re
from Bio import SeqIO
from Bio.Seq import Seq
import statistics
import warnings

In [None]:
# Directory that holds every input and output file of this filtering run.
_input_dir = '../Data/SNV_filtering_inputs/PillarProject_filtering/PALB2'

gene = 'PALB2'  # name of your gene :)
# Sense of the reference file you provide: with 0 the codons are read from the
# reverse complement of the reference, with 1 the reference is read as given.
ref_sense = 0

# Annotated reference file (exon coordinates have to be entered manually).
# The sequence was originally downloaded from Benchling.
ref_path = _input_dir + '/20250822_PALB2_PillarProjectRef.xlsx'
# SGE score table (tab-separated).
sge_scores = _input_dir + '/20250825.PALB2.snvscores.tsv'
# Table of targets and their required edits (tab-separated).
targets_input = _input_dir + '/20250825_PALB2_targets.tsv'
# Name of the saved, filtered output file.
filtered_file_name = _input_dir + '/20250825_PALB2snvscores_filtered.xlsx'

In [None]:
def reverse_complement_string(seq_string): #Reverse complement and returns string
    """Return the reverse complement of a DNA string.

    Parameters
    ----------
    seq_string : str
        DNA sequence made of A, C, G, T (or N). Case is ignored.

    Returns
    -------
    str
        The reverse complement, always in upper case. ``N`` stays ``N``.

    Raises
    ------
    ValueError
        If the sequence contains a character other than A, C, G, T or N.
        Previously every unrecognised character was silently emitted as "A",
        which produced a plausible-looking but wrong sequence.
    """
    complement = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
    bases = seq_string.upper()
    invalid = sorted(set(bases) - set(complement))
    if invalid:
        raise ValueError(
            'Cannot reverse complement ' + repr(seq_string)
            + ': unexpected character(s) ' + ', '.join(repr(c) for c in invalid)
        )
    return "".join(complement[base] for base in reversed(bases))

In [None]:
def mutate_snvs(dna_sequence): #Mutates all possible SNVs of provided DNA sequence
    """Return every single-base substitution of ``dna_sequence``.

    The result is ordered by position; within a position the three
    alternative bases appear in the order given by the table below.

    Parameters
    ----------
    dna_sequence : str
        Sequence to mutate.

    Returns
    -------
    list of str
        ``3 * len(dna_sequence)`` sequences, each differing from the input at
        exactly one position (empty list for an empty input).
    """
    # Alternative bases to substitute in, keyed by the base being replaced.
    alternatives = {"A": "TCG", "T": "ACG", "C": "ATG"}
    # Any other character (normally "G") is replaced by A, T and C.
    fallback = "ATC"
    return [
        dna_sequence[:idx] + alt + dna_sequence[idx + 1 :]
        for idx, base in enumerate(dna_sequence)
        for alt in alternatives.get(base, fallback)
    ]

In [None]:
def string_compare(string1, string2):
    """Locate the first position at which two equal-length strings differ.

    Parameters
    ----------
    string1, string2 : str
        Strings to compare; they must have the same length.

    Returns
    -------
    tuple or None
        ``(index, character)`` where ``index`` is the first differing position
        and ``character`` is the character of ``string1`` at that position.
        ``None`` when the strings are identical, so callers that unpack the
        result must only pass strings known to differ.

    Raises
    ------
    ValueError
        If the strings have different lengths (both are printed first).
    """
    if len(string1) != len(string2):
        print(len(string1), len(string2))
        print('String1: ', string1, ' ', 'String2: ', string2)
        raise ValueError('Different Length Strings')

    for idx, (left, right) in enumerate(zip(string1, string2)):
        if left != right:
            return idx, left
    return None

In [None]:
def identify_same_codon(numbers):
    """Find all pairs of values that lie exactly two apart.

    Every unordered pair of entries is checked once, in input order.

    Parameters
    ----------
    numbers : list of int
        Positions to compare.

    Returns
    -------
    list of list
        ``[earlier, later]`` for each pair (by position in ``numbers``) whose
        absolute difference is 2; empty if there is none.
    """
    return [
        [first, second]
        for idx, first in enumerate(numbers)
        for second in numbers[idx + 1:]
        if abs(first - second) == 2
    ]

In [None]:
def read_inputs(reference, sge, targets):
    """Load the reference, SGE score and target tables.

    Parameters
    ----------
    reference : path to the Excel reference file; it must have a
        'Reference' column of strings.
    sge : path to the tab-separated SGE score file; it must have a
        'target' column.
    targets : path to the tab-separated targets file; it must have a
        'target' column.

    Returns
    -------
    tuple of DataFrame
        ``(reference, sge, targets)`` where the reference's 'Reference' column
        is upper-cased and the targets table is restricted to targets that
        occur in the SGE data, re-indexed from 0.

    The distinct SGE targets and the retained targets table are printed.
    """
    ref_table = pd.read_excel(reference)
    ref_table['Reference'] = ref_table['Reference'].transform(lambda base: base.upper())
    sge_table = pd.read_csv(sge, sep = '\t')
    target_table = pd.read_csv(targets, sep = '\t')

    sge_targets = list(set(sge_table['target'].tolist()))
    print(sge_targets)

    # Keep only targets that actually appear in the SGE data.
    present_in_sge = target_table['target'].isin(sge_targets)
    target_table = target_table.loc[present_in_sge].reset_index(drop = True)
    print(target_table)
    return ref_table, sge_table, target_table

In [None]:
def get_edits(df):
    """Map each target to the list of its required edits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'target' column and a 'required_edits' column holding
        comma-separated edit strings.

    Returns
    -------
    dict
        ``{target: [edit, ...]}``. If a target occurs more than once, the
        last row wins.

    Rows are walked positionally, so the frame does not need a default
    0..n-1 index (label-based ``df['target'][i]`` lookups fail or pick the
    wrong row on a filtered or re-ordered frame).
    """
    edits_dict = {}
    for target, all_edits in zip(df['target'], df['required_edits']):
        edits_dict[target] = all_edits.split(',')

    return edits_dict

In [None]:
def filter(edit_dict, data, ref, sense):
    """Collect ``'<pos>:<alt>'`` ids of SNVs that must be filtered out.

    Each required edit (``'<pos><base>'``) is treated as a fixed change to the
    third base of a codon. For the two remaining bases of that codon, every
    SNV is combined with the fixed edit and translated. If the resulting amino
    acid cannot be produced by any single SNV of the wild-type codon, the SNV
    is flagged.

    Parameters
    ----------
    edit_dict : dict
        ``{target: ['<pos><base>', ...]}``; the last character of each entry
        is the edited base, everything before it the integer position.
    data : pandas.DataFrame
        SGE data; only its 'target' column is used, to decide which targets
        to process.
    ref : pandas.DataFrame
        Reference table with 'target', 'pos' and 'Reference' columns.
    sense : int
        0: the codon is the reverse complement of reference positions
        ``pos .. pos + 2`` and edits/ids are complemented accordingly.
        1: the codon is reference positions ``pos - 2 .. pos`` read as given.

    Returns
    -------
    list of str
        Flagged ids in discovery order (may contain duplicates).

    Raises
    ------
    ValueError
        If ``sense`` is neither 0 nor 1 (previously this silently produced an
        empty filter list).
    KeyError
        If a target in ``data`` has no entry in ``edit_dict``.
    """
    if sense not in (0, 1):
        raise ValueError('sense must be 0 or 1, got ' + repr(sense))

    to_filter = []
    # groupby is used only to walk the distinct targets present in the data.
    for target, _ in data.groupby('target'):
        target_ref = ref.loc[ref['target'].isin([target])]
        edits_split = {int(s[:-1]): s[-1] for s in edit_dict[target]}
        edit_pos = list(edits_split.keys())

        # NOTE(review): pairs of edits two positions apart are only reported.
        # The earlier code took statistics.median() of these pairs (TypeError
        # for an even number of pairs) to build a list of ids that was never
        # added to the filter. Confirm whether such positions should be
        # filtered as well.
        same_codon = identify_same_codon(edit_pos)
        if len(same_codon) > 0:
            print('Doubles: ', same_codon)

        for pos in edit_pos:
            # The edited base is the codon's third base, so the codon spans
            # pos..pos+2 on the reverse strand and pos-2..pos on the forward
            # strand.
            # NOTE(review): the forward-strand window used to be
            # range(pos - 3, pos), which excluded the edited base and did not
            # match the pos-2 / pos-1 ids assigned below.
            if sense == 0:
                edited_codon_coords = list(range(pos, pos + 3))
            else:
                edited_codon_coords = list(range(pos - 2, pos + 1))

            print('1 - ', edited_codon_coords)

            # Assumes the reference rows of a target are ordered by ascending
            # 'pos'; the codon is assembled in row order.
            wt_codon_df = target_ref.loc[target_ref['pos'].isin(edited_codon_coords)]
            wt_codon = ''.join(wt_codon_df['Reference'].tolist())
            if sense == 0:
                wt_codon = reverse_complement_string(wt_codon)
            print("WT", wt_codon)

            # Applies to both senses: a codon that is not fully covered by the
            # reference cannot be translated meaningfully.
            if len(wt_codon) != 3:
                warnings.warn('Incomplete Codon')
                error_string = 'Incomplete Codon at pos ' + str(pos)
                print(error_string)
                continue

            # Amino acids reachable from the wild-type codon by one SNV.
            possible_aa = [str(Seq(elem).translate()) for elem in mutate_snvs(wt_codon)]

            fixed_edit = edits_split[pos]
            if sense == 0:
                fixed_edit = reverse_complement_string(fixed_edit)

            fixed_edit_aa = []
            impossible = []

            # Every SNV of the first two codon bases, combined with the fixed
            # third-base edit.
            for elem in mutate_snvs(wt_codon[0:2]):
                full_mut_codon = elem + fixed_edit

                normalized_pos, mut_codon_snv = string_compare(full_mut_codon[0:2], wt_codon[0:2])
                # Codon index 0 is two bases away from the edit, index 1 one
                # base; the direction depends on the strand.
                offset = 2 - normalized_pos
                if sense == 0:
                    mut_codon_pos = pos + offset
                    alt_base = reverse_complement_string(mut_codon_snv)
                else:
                    mut_codon_pos = pos - offset
                    alt_base = mut_codon_snv

                print('mut codon', full_mut_codon, mut_codon_pos, normalized_pos)
                mut_codon_pos_id = str(mut_codon_pos) + ':' + alt_base

                aa = str(Seq(full_mut_codon).translate())
                fixed_edit_aa.append(aa)
                if aa not in possible_aa:
                    impossible.append(aa)
                    to_filter.append(mut_codon_pos_id)

            print('2 - ', target, pos, possible_aa, fixed_edit_aa, impossible)
            print('3 - ', len(impossible), len(to_filter))

    return to_filter

In [None]:
def filter_data(sge, to_filter):
    """Drop flagged variants from the SGE table.

    Parameters
    ----------
    sge : pandas.DataFrame
        Must have 'pos', 'alt' and 'amino_acid_change' columns. It is not
        modified; the work is done on a copy.
    to_filter : list of str
        Ids of the form ``'<pos>:<alt>'`` to remove.

    Returns
    -------
    pandas.DataFrame
        Copy of ``sge`` with 'pos' cast to int, an added 'pos_id' column
        (``'<pos>:<alt>'``), and without the rows whose 'pos_id' is in
        ``to_filter``. Rows whose 'amino_acid_change' is ``'---'`` are always
        kept.
    """
    annotated = sge.copy()

    # Normalise to int before building the id so that float positions
    # (e.g. 123.0) yield '123:A' rather than '123.0:A'.
    annotated['pos'] = annotated['pos'].astype(int)
    annotated['pos_id'] = annotated['pos'].astype(str) + ':' + annotated['alt']

    flagged = annotated['pos_id'].isin(to_filter) & (annotated['amino_acid_change'] != '---')
    filtered = annotated.loc[~flagged]

    return filtered

In [None]:
def main():
    """Run the whole workflow using the notebook-level settings.

    Reads the three input tables, derives the ids to filter, removes them
    from the SGE data, writes the result to ``filtered_file_name`` and prints
    the filtered ids together with before/after row counts.
    """
    ref, raw_data, targets = read_inputs(ref_path, sge_scores, targets_input)
    to_filter = filter(get_edits(targets), raw_data, ref, ref_sense)
    filtered_data = filter_data(raw_data, to_filter)

    filtered_data.to_excel(filtered_file_name, index = False)

    print(to_filter)
    summary = (
        'Filtering statistics: ', '\n',
        'RAW DATA: ', len(raw_data), '\n',
        'VARS. TO FILTER: ', len(to_filter), '\n',
        'POST FILTER: ', len(filtered_data), '\n',
    )
    print(*summary)

In [None]:
# Run the full filtering workflow with the settings defined in the configuration cell.
main()