In [None]:
import pandas as pd
import re
from Bio import SeqIO
from Bio.Seq import Seq
import statistics
import warnings

In [None]:
ref_path = '../Data/SNV_filtering_inputs/PillarProject_filtering/XRCC2/20250822_XRCC2_PillarProjectRef.xlsx' #Path to the annotated (non-coding/coding) reference for each library; loaded with pd.read_excel and expected to carry 'target', 'pos' and 'Reference' columns
sge_scores = '../Data/SNV_filtering_inputs/PillarProject_filtering/XRCC2/20250825.XRCC2.snvscores.tsv'  #Path to the tab-separated SGE datafile; the filtering cells read its 'target', 'pos', 'alt', 'amino_acid_change', 'gmm_density_abnormal' and 'gmm_density_normal' columns
gene = 'XRCC2' #Name of your gene (not referenced by any other cell visible in this notebook)
ref_sense = 0 #Sense of the reference file you provide; passed to filter(): 0 = antisense gene (codons are reverse complemented), 1 = coding strand is the sense strand
filtered_file_name = '../Data/SNV_filtering_inputs/PillarProject_filtering/XRCC2/20250825_XRCC2snvscores_filtered.xlsx' #Path of the Excel file that main() writes the filtered data to
targets_input = '../Data/SNV_filtering_inputs/PillarProject_filtering/XRCC2/202507_XRCC2_targets.tsv' #Tab-separated targets input with 'target' and 'required_edits' columns; contains positions and base changed for all fixed edits on the positive sense strand

In [None]:
def reverse_complement_string(seq_string): #Reverse complement and returns string
    """Return the reverse complement of a DNA string.

    Parameters
    ----------
    seq_string : str
        DNA sequence made of A, C, G, T or N. Lower-case letters are accepted.

    Returns
    -------
    str
        The reverse complement, always in upper case. "N" stays "N".

    Raises
    ------
    ValueError
        If the sequence contains a character other than A, C, G, T or N.
    """
    complement = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}

    reverse_comp_list = []
    for char in reversed(seq_string.upper()):
        #An explicit lookup instead of a catch-all branch: an unexpected
        #character must not be silently turned into "A"
        if char not in complement:
            raise ValueError('Cannot reverse complement character ' + repr(char) + ' in ' + repr(seq_string))
        reverse_comp_list.append(complement[char])

    return "".join(reverse_comp_list)

In [None]:
def mutate_snvs(dna_sequence): #Mutates all possible SNVs of provided DNA sequence
    """Build every single-base substitution of a DNA string.

    For each position, three variants are produced, in a fixed order that
    depends on the base found there. The result is ordered by position first.

    Parameters
    ----------
    dna_sequence : str
        Sequence to mutate.

    Returns
    -------
    list of str
        Three variants per position, each the same length as the input.
    """
    #Substitution order for each reference base
    replacements = {"A": "TCG", "T": "ACG", "C": "ATG"}
    #Any base that is not A, T or C is handled like G
    fallback = "ATC"

    return [
        dna_sequence[:idx] + alt + dna_sequence[idx + 1 :]
        for idx, base in enumerate(dna_sequence)
        for alt in replacements.get(base, fallback)
    ]

In [None]:
def string_compare(string1, string2): #Function to compare strings and return the position and character that is different
    """Locate the first position at which two equal-length strings differ.

    Parameters
    ----------
    string1, string2 : str
        Strings to compare; they must have the same length.

    Returns
    -------
    tuple or None
        (index, character of string1 at that index) for the first mismatch.
        None when the strings are identical.

    Raises
    ------
    ValueError
        If the strings have different lengths. Both lengths and both strings
        are printed first.
    """
    if len(string1) == len(string2):
        #Walk both strings in step and stop at the first mismatch
        for idx, (left, right) in enumerate(zip(string1, string2)):
            if left != right:
                return idx, left
        return None #No difference found

    #Function expects strings of same length
    print(len(string1), len(string2))
    print('String1: ', string1, ' ', 'String2: ', string2)
    raise ValueError('Different Length Strings')

In [None]:
def identify_same_codon(numbers): #Filtering function to identify if any fixed edits are within the same codon
    """Find every pair of positions that lie exactly two apart.

    Fixed edits with a distance of 2 are treated as sharing a codon.

    Parameters
    ----------
    numbers : list of int
        Positions of all fixed edits.

    Returns
    -------
    list of list
        [earlier_listed, later_listed] pairs in the order they are encountered.
        Empty when no two positions are two apart.
    """
    count = len(numbers)
    return [
        [numbers[first], numbers[second]]
        for first in range(count)
        for second in range(first + 1, count)
        if abs(numbers[first] - numbers[second]) == 2
    ]

In [None]:
def read_inputs(reference, sge, targets): #Reads all inputs
    """Load the reference, the SGE data and the targets table.

    Parameters
    ----------
    reference : path
        Excel file with a 'Reference' column; its bases are upper-cased.
    sge : path
        Tab-separated SGE data with a 'target' column.
    targets : path
        Tab-separated targets table with a 'target' column.

    Returns
    -------
    tuple
        (reference DataFrame, SGE DataFrame, targets DataFrame). The targets
        table keeps only targets present in the SGE data and has a fresh index.
    """
    ref_table = pd.read_excel(reference) #Reads provided reference file
    ref_table['Reference'] = ref_table['Reference'].transform(lambda base: base.upper()) #All reference bases to upper case
    score_table = pd.read_csv(sge, sep = '\t') #Reads raw data
    target_table = pd.read_csv(targets, sep = '\t') #Reads targets file

    sge_targets = list(set(score_table['target'].tolist())) #Unique SGE targets found in the data
    print(sge_targets)

    #Keeps only the targets that appear in the SGE data, then renumbers the rows
    present_in_data = target_table['target'].isin(sge_targets)
    target_table = target_table.loc[present_in_data].reset_index(drop = True)

    print(target_table)
    return ref_table, score_table, target_table

In [None]:
def get_edits(df): #Gets all fixed edits from targets.tsv
    """Collect the fixed edits listed for every target.

    Parameters
    ----------
    df : pandas.DataFrame
        Targets table with a 'target' column and a 'required_edits' column
        holding comma-separated edits.

    Returns
    -------
    dict
        {target: [edit, ...]} with surrounding whitespace removed from each
        edit. A target whose 'required_edits' cell is missing or empty maps
        to an empty list.
    """
    edits_dict = {} #Dictionary to hold edits

    #Rows are read by position rather than by label, so the frame does not
    #need a default 0..n-1 index
    for target, all_edits in zip(df['target'], df['required_edits']):
        if pd.isna(all_edits): #No fixed edits recorded for this target
            edits_dict[target] = []
            continue

        #Empty tokens (e.g. from a trailing comma) are dropped
        tokens = (edit.strip() for edit in str(all_edits).split(','))
        edits_dict[target] = [edit for edit in tokens if edit]

    return edits_dict

In [None]:
def filter(edit_dict, data, ref, sense): #Filters out amino acid changes possible only in context of required fixed edits
    """Build the list of variants that should be removed for every target.

    A variant is listed when the amino acid it gives together with the fixed
    edit cannot be reached by any single-base change of the wild-type codon.
    The fixed edit is treated as the third base of its codon. When two fixed
    edits of a target are two positions apart, all four bases at the position
    between them are listed as well.

    Parameters
    ----------
    edit_dict : dict
        {target: [edit, ...]} where each edit is a position followed by one
        base, e.g. '12345A'.
    data : pandas.DataFrame
        SGE data; only its 'target' column is used here.
    ref : pandas.DataFrame
        Reference with 'target', 'pos' and 'Reference' columns. The rows of a
        codon are joined in the order they appear in this frame.
    sense : int
        0 for antisense genes (codons are reverse complemented and run towards
        higher positions from the fixed edit), 1 when the coding strand is the
        sense strand.

    Returns
    -------
    list of str
        Identifiers of the form 'target:pos:alt'.

    Raises
    ------
    ValueError
        If sense is neither 0 nor 1.
    """
    if sense not in (0, 1): #Any other value used to skip the codon-level filtering without notice
        raise ValueError('sense must be 0 (antisense gene) or 1 (coding strand is sense strand)')

    antisense = sense == 0
    #Other codon bases sit at higher positions than the fixed edit for antisense genes, lower otherwise
    direction = 1 if antisense else -1

    grouped = data.groupby('target') #Groups by target

    to_filter = [] #List to hold variants to remove

    for target, _ in grouped: #Iterates through each target present in the data
        target_ref = ref.loc[ref['target'].isin([target])] #Gets reference oligo for target

        if target not in edit_dict: #Nothing can be derived without the fixed edits of this target
            warnings.warn('No fixed edits provided for target ' + str(target))
            continue

        edits_split = {int(s[:-1]): s[-1] for s in edit_dict[target]} #Splits edits into dictionary that is {position: edit}
        edit_pos = list(edits_split.keys()) #Gets all edit positions

        #Every pair of fixed edits in the same codon is handled, not only the first one
        for pair in identify_same_codon(edit_pos):
            pos_to_remove = str(int(statistics.median(pair))) #Basepair in the middle that is not a fixed edit

            for base in ('A', 'T', 'G', 'C'): #All variants at the middle basepair are removed
                to_filter.append(target + ':' + pos_to_remove + ':' + base)

            print('Doubles: ', pos_to_remove) #Prints position removed due to two fixed edits in same codon

        for pos in edit_pos: #Iterates through each fixed edit and determines variants to remove
            if antisense:
                edited_codon_coords = list(range(pos, pos + 3)) #Fixed edit is the lowest coordinate of the codon
            else:
                edited_codon_coords = list(range(pos - 2, pos + 1)) #Fixed edit is the highest coordinate of the codon

            print('1 - Edited Codon ', edited_codon_coords) #Prints edited codon coords

            wt_codon_df = target_ref.loc[target_ref['pos'].isin(edited_codon_coords)] #Gets WT codon from reference in df form
            wt_codon = ''.join(wt_codon_df['Reference'].tolist()) #WT reference in string form
            if antisense:
                wt_codon = reverse_complement_string(wt_codon) #Reverse complementation for antisense gene
            print('2 - WT ', wt_codon) #Prints WT codon

            if len(wt_codon) != 3: #Error checking for fixed edits that are at very edges of libraries
                warnings.warn('Incomplete Codon')
                error_string = 'Incomplete Codon at pos ' + str(pos)
                print(error_string)
                continue

            #Amino acids reachable with a single-base change of the WT codon
            possible_aa = [str(Seq(elem).translate()) for elem in mutate_snvs(wt_codon)]

            fixed_edit = edits_split[pos]
            if antisense:
                fixed_edit = reverse_complement_string(fixed_edit) #Reverse complementation for antisense gene

            fixed_edit_aa = [] #List to hold amino acid changes with the fixed edit
            impossible = [] #List to hold missense changes possible only in context of the fixed edit

            for elem in mutate_snvs(wt_codon[0:2]): #All SNVs of the two bases that are not the fixed edit
                full_mut_codon = elem + fixed_edit #Builds full codon with fixed edit

                normalized_pos, mut_codon_snv = string_compare(full_mut_codon[0:2], wt_codon[0:2]) #Position and base that differ from WT (only first 2 bases checked)
                #First codon base is 2bp away from the fixed edit, second base is 1bp away
                mut_codon_pos = pos + direction * (2 - normalized_pos)

                print('mut codon', full_mut_codon, mut_codon_pos, normalized_pos) #Prints full codon with fixed edit, and positions

                #For antisense genes the changed base is reverse complemented for the identifier
                alt_base = reverse_complement_string(mut_codon_snv) if antisense else mut_codon_snv
                mut_codon_pos_id = target + ':' + str(mut_codon_pos) + ':' + alt_base #Builds unique identifier for filtering

                aa = str(Seq(full_mut_codon).translate()) #Variant translated
                fixed_edit_aa.append(aa)

                if aa not in possible_aa: #Amino acid change is not possible with WT SNVs
                    impossible.append(aa)
                    to_filter.append(mut_codon_pos_id)

            print('3 - ', target, pos, possible_aa, fixed_edit_aa, impossible) #Prints the changes
            print('4 - ', len(impossible), len(to_filter)) #Running counter for variants to filter

    return to_filter

In [None]:
def filter_data(sge, to_filter, cutoffs = (0.95, 0.99)): #Filters out variants
    """Remove the listed coding variants and add functional classifications.

    The input frame is left untouched; all work is done on a copy.

    Parameters
    ----------
    sge : pandas.DataFrame
        SGE data with 'target', 'pos', 'alt', 'amino_acid_change',
        'gmm_density_abnormal' and 'gmm_density_normal' columns.
    to_filter : list of str
        Identifiers of the form 'target:pos:alt' to remove.
    cutoffs : iterable of float, optional
        GMM density cutoffs. One 'gmm_consequence_<cutoff>' column is added
        per cutoff. Defaults to 0.95 and 0.99.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with 'pos_id', 'target_pos_id' and the
        classification columns added. Rows whose 'amino_acid_change' is '---'
        are kept even when their identifier is listed.
    """
    #Builds unique identifier on a copy so the caller's dataframe keeps its columns and dtypes
    annotated = sge.copy()
    annotated['pos_id'] = annotated['pos'].astype(str) + ':' + annotated['alt'] #Builds position ID
    annotated['target_pos_id'] = annotated['target'] + ':' + annotated['pos_id'] #Target is part of the ID so variants covered in overlapping targets are told apart

    flagged = annotated['target_pos_id'].isin(to_filter)
    in_coding_sequence = annotated['amino_acid_change'] != '---'
    filtered = annotated.loc[~(flagged & in_coding_sequence)].copy() #Only flagged variants in coding sequence are removed

    #Adds one functional classification column per GMM density cutoff
    for cutoff in cutoffs:
        column = 'gmm_consequence_' + str(cutoff)
        filtered[column] = 'indeterminate'
        filtered.loc[filtered['gmm_density_abnormal'] >= cutoff, column] = 'functionally_abnormal'
        filtered.loc[filtered['gmm_density_normal'] >= cutoff, column] = 'functionally_normal' #Applied last, as before

    return filtered

In [None]:
def main():
    """Run the filtering pipeline end to end.

    Reads the inputs named in the configuration cell, determines the variants
    to filter, writes the filtered data to filtered_file_name as an Excel file
    and prints the filtered identifiers followed by row-count statistics.
    """
    reference_df, scores_df, targets_df = read_inputs(ref_path, sge_scores, targets_input) #Reads inputs
    fixed_edits = get_edits(targets_df) #Gets edits
    variants_to_drop = filter(fixed_edits, scores_df, reference_df, ref_sense) #Gets variants to filter
    kept_scores = filter_data(scores_df, variants_to_drop) #Variants filtered

    kept_scores.to_excel(filtered_file_name, index = False) #Filtered data saved
    print(variants_to_drop) #Variants to filter printed

    #Filtering statistics, printed as one space-separated call
    summary = [
        'Filtering statistics: ', '\n',
        'RAW DATA: ', len(scores_df), '\n',
        'VARS. TO FILTER: ', len(variants_to_drop), '\n',
        'POST FILTER: ', len(kept_scores), '\n',
    ]
    print(*summary)

In [None]:
main() #Runs the whole pipeline with the paths and settings from the configuration cell