In [None]:
# Import necessary modules
import pandas as pd
from Bio import SeqIO

# Folder that holds every input and output of this PALB2 Pillar Project filtering run
_pillar_dir = '../Data/SNV_filtering_inputs/PillarProject_filtering/PALB2/'

# Path to the .fa file
fasta_path = _pillar_dir + '20250822_PALB2.fasta'

# Entry file contains 3 sheets:
#   - SGE targets with start/end coordinates
#   - start/end coordinates for exons of the gene
#   - start/end coordinates and sense (0 - anti, 1 - sense) of provided fasta file
entry_file = _pillar_dir + '20250822_PALB2_filter_entry.xlsx'

# Final path for reference
final_name = _pillar_dir + '20250822_PALB2_PillarProjectRef' + '.xlsx'

In [None]:
def read_inputs(file):
    """Load the entry workbook and unpack it into plain Python containers.

    Parameters
    ----------
    file : path of the Excel workbook. It is read three times, once per
        sheet: 'targets', 'exons' and 'gene'.

    Returns
    -------
    tuple ``(libs, exons, start, end, sense)`` where
        libs  : list of (SGE target, target start coordinate, target end coordinate)
        exons : list of (exon start coordinate, exon end coordinate)
        start : 'start' value from the first row of the 'gene' sheet (fasta start)
        end   : 'end' value from the first row of the 'gene' sheet (fasta end)
        sense : 'sense' value from the first row of the 'gene' sheet
    """
    target_sheet = pd.read_excel(file, sheet_name = 'targets') #one row per SGE target
    exon_sheet = pd.read_excel(file, sheet_name = 'exons') #one row per exon of the gene
    gene_sheet = pd.read_excel(file, sheet_name = 'gene') #single row describing the provided fasta

    #One tuple per row of the targets sheet
    libs = [
        (target_sheet['target'][row], target_sheet['start'][row], target_sheet['end'][row])
        for row in range(len(target_sheet))
    ]

    #One tuple per row of the exons sheet
    exons = [
        (exon_sheet['start'][row], exon_sheet['end'][row])
        for row in range(len(exon_sheet))
    ]

    #Only the first row of the gene sheet is used
    fasta_start = gene_sheet['start'][0]
    fasta_end = gene_sheet['end'][0]
    gene_sense = gene_sheet['sense'][0]

    return libs, exons, fasta_start, fasta_end, gene_sense

In [None]:
# Parse the .fa file
def _extract_target_sequence(fasta_seq, fasta_start, lib_start, lib_end, antisense):
    """Slice one target out of the fasta sequence and return it as a string."""
    if antisense:
        #Offsets are measured downwards from the fasta start coordinate
        slice_from = fasta_start - lib_start - 1
        slice_to = fasta_start - lib_end
        #Reverse complement so the bases line up with ascending genomic coordinates
        return str(fasta_seq[slice_from:slice_to].complement())[::-1]

    #Sense strand: offsets are measured upwards from the fasta start coordinate
    slice_from = lib_start - fasta_start
    slice_to = lib_end - fasta_start + 1
    return str(fasta_seq[slice_from:slice_to])


def parse(fasta_path, libs, start, end, sense):
    """Build a per-base reference table for every SGE target.

    Parameters
    ----------
    fasta_path : fasta file to read. Only its first record is used.
    libs : list of (target name, target start coordinate, target end coordinate).
        For an antisense gene (sense == 0) positions are generated from the
        target's end coordinate up to its start coordinate, so start is
        expected to be the larger of the two.
    start : coordinate of the fasta that the slice offsets are computed from.
    end : not used by this function; kept so existing callers keep working.
    sense : 0 for a gene on the antisense strand (slices are reverse
        complemented), 1 for a gene on the sense strand.

    Returns
    -------
    pandas.DataFrame with one row per base and the columns 'target',
    'Reference' and 'pos'. The per-target frames are concatenated without
    resetting the index.

    Raises
    ------
    ValueError
        If sense is neither 0 nor 1, if libs is empty, if the fasta has no
        records, or if the sequence pulled for a target does not have one
        base per coordinate.
    """
    if sense not in (0, 1):
        raise ValueError(f"sense must be 0 (antisense) or 1 (sense), got {sense!r}")
    antisense = sense == 0

    if len(libs) == 0:
        raise ValueError("No SGE targets were provided")

    #Read every record so the file is fully consumed, then keep the first one
    records = list(SeqIO.parse(fasta_path, "fasta"))
    if not records:
        raise ValueError(f"No fasta records found in {fasta_path!r}")
    fasta_seq = records[0].seq

    #One dataframe per target: reference base and genomic coordinate at each bp
    df_list = []
    for region, lib_start, lib_end in libs:
        sequence = _extract_target_sequence(fasta_seq, start, lib_start, lib_end, antisense)

        if antisense:
            #end and start flipped for gene on antisense strand after reverse complement
            coords = list(range(lib_end, lib_start + 1))
        else:
            coords = list(range(lib_start, lib_end + 1))

        print(region, sequence)

        #A mismatch means the target is not fully inside the fasta or the
        #coordinates do not follow the expected order for this strand
        if len(sequence) != len(coords):
            raise ValueError(
                f"Target {region!r}: pulled {len(sequence)} bases from the fasta "
                f"for {len(coords)} coordinates"
            )

        df_list.append(pd.DataFrame({'target': region, 'Reference': list(sequence), 'pos': coords}))

    return pd.concat(df_list)

In [None]:
def annotate_introns(df, exons):
    """Label every row of ``df`` as 'Exon' or 'Intron' based on its 'pos' value.

    Parameters
    ----------
    df : DataFrame with a 'pos' column. It is modified in place: an
        'Intron/Exon' column is added (or overwritten).
    exons : iterable of (exon start coordinate, exon end coordinate). Both
        ends are inclusive and may be given in either order.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    exon_coords = set()
    for exon_start, exon_end in exons:
        #Normalise the order so an exon written high-to-low still covers its interval
        low, high = sorted((exon_start, exon_end))
        exon_coords.update(range(low, high + 1))

    in_exon = df['pos'].isin(exon_coords)

    df['Intron/Exon'] = None
    df.loc[in_exon, 'Intron/Exon'] = 'Exon'
    df.loc[~in_exon, 'Intron/Exon'] = 'Intron'

    return df


In [None]:
#saves to excel
def save_to_xlsx(df, path = None):
    """Write ``df`` to an Excel file without its index.

    Parameters
    ----------
    df : DataFrame to save.
    path : destination file. When omitted, the module-level ``final_name``
        is used.
    """
    destination = final_name if path is None else path
    df.to_excel(destination, index = False)


In [None]:
def main():
    """Run the whole pipeline: read inputs, build the reference, annotate it, save it.

    Reads the module-level ``entry_file`` and ``fasta_path``, writes the
    annotated reference through ``save_to_xlsx`` and prints it.
    """
    libs, exons, start, end, sense = read_inputs(entry_file)
    ref_seqs = parse(fasta_path, libs, start, end, sense)
    annotated_ref = annotate_introns(ref_seqs, exons)
    #Save the annotated result itself rather than counting on ref_seqs having been modified in place
    save_to_xlsx(annotated_ref)
    print(annotated_ref)

In [None]:
# Run the pipeline end to end when this cell is executed
main()