In [None]:
# Import necessary modules
import pandas as pd
from Bio import SeqIO

# Path to the FASTA (.fa) file that parse() reads with SeqIO.parse(file_path, "fasta").
# NOTE(review): this is an absolute, machine-specific path - adjust it before running elsewhere.
file_path = '/Users/ivan/Downloads/bard1.fasta'

# Library regions, one tuple per region: ("Region Name", start, end).
# Coordinates are genomic positions and both ends are inclusive (parse() slices
# through `end` and emits one position per base from `end` up to `start`).
# `start` is higher than `end` for genes on the antisense strand and/or when the
# FASTA holds the antisense-strand sequence.
libs = [('BARD1_X1A',214809584,214809476),('BARD1_X1B',214809500,214809379),
        ('BARD1_X2',214797156,214797050),('BARD1_X3A',214792474, 214792354),
        ('BARD1_X3B',214792400,214792286),('BARD1_X4A',214781543,214781431),
        ('BARD1_X4B',214781448,214781334),('BARD1_X4C',214781383,214781206),
        ('BARD1_X4D',214781326,214781202),('BARD1_X4E',214781206,214781087),
        ('BARD1_X4F',214781131,214781012),('BARD1_X4G',214781020,214780906),
        ('BARD1_X4H',214780958,214780857),('BARD1_X4I',214780866,214780767),
        ('BARD1_X4J',214780821,214780697),('BARD1_X4K',214780740,214780627),
        ('BARD1_X4L',214780646,214780532),('BARD1_X5A',214769383,214769267),
        ('BARD1_X5B',214769286,214769166),('BARD1_X6A',214767689,214767579),
        ('BARD1_X6B',214767615,214767507),('BARD1_X6C',214767549,214767430),
        ('BARD1_X7A',214752680,214752442),('BARD1_X7B',214752491,214752370),
        ('BARD1_X8A',214745888,214745778),('BARD1_X8B',214745791,214745675),
        ('BARD1_X9A',214745199,214745103),('BARD1_X9B',214745123,214745004),
        ('BARD1_X10A',214730531,214730428),('BARD1_X10B',214730469,214730353),
        ('BARD1_X11A',214729018,214728899),('BARD1_X11B',214728917,214728818),
        ('BARD1_X11C',214728835,214728716),('BARD1_X11D',214728735,214728632)]


# Exon intervals as (start, end) genomic coordinates in ascending order, both
# ends inclusive. annotate_introns() labels every position inside one of these
# intervals 'Exon' and every other position 'Intron'.
exons = [(214809412,214809569),(214797061,214797117),(214792297,214792445),
         (214780560,214781509),(214769232,214769312),(214767482,214767654),
         (214752447,214752555),(214745722,214745854),(214745067,214745159),
         (214730411,214730508),(214728676,214729008)
        ]

# Name of the Excel workbook written by save_to_xlsx().
final_name = '20240809_BARD1_SNVlib_ref_seqs_intron_annotated' + '.xlsx'
# `start` matches the value parse() treats as the genomic coordinate of FASTA
# sequence index 0 (offsets are computed as start - coordinate).
# `end` is presumably the genomic coordinate of the last FASTA base - TODO confirm.
start = 214809683
end = 214725646

In [None]:
# Parse the .fa file
def parse(fasta_path=None, regions=None, fasta_start=None):
    """Build a per-base reference table for every library region.

    The first record of the FASTA file is sliced once per region, the slice is
    reverse complemented, and each base is paired with its genomic coordinate
    (ascending from the region's end coordinate to its start coordinate).

    Parameters
    ----------
    fasta_path : str or path-like, optional
        FASTA file to read. Defaults to the notebook-level ``file_path``.
    regions : iterable of (name, start, end), optional
        Library regions with inclusive genomic coordinates where
        ``start >= end``. Defaults to the notebook-level ``libs``.
    fasta_start : int, optional
        Genomic coordinate of the first base (index 0) of the FASTA sequence.
        Defaults to the notebook-level ``start``.

    Returns
    -------
    pandas.DataFrame
        Columns ``target`` (region name), ``Reference`` (one base per row) and
        ``pos`` (genomic coordinate). The per-region frames are concatenated
        without resetting the index, so row labels restart at 0 for each region.

    Raises
    ------
    ValueError
        If the FASTA file has no records, a region has ``start < end``, a
        region begins before ``fasta_start``'s position in the FASTA, the FASTA
        is too short to cover a region, or ``regions`` is empty
        (raised by ``pd.concat``).
    """
    # Fall back to the configuration cell so parse() can still be called bare.
    if fasta_path is None:
        fasta_path = file_path
    if regions is None:
        regions = libs
    if fasta_start is None:
        fasta_start = start

    # Only the first FASTA record is ever used for the output, so read just
    # that one; the context manager guarantees the file handle is closed.
    with open(fasta_path) as handle:
        record = next(SeqIO.parse(handle, "fasta"), None)
    if record is None:
        raise ValueError(f"No FASTA records found in {fasta_path!r}")

    frames = []
    for region_name, lib_start, lib_end in regions:
        if lib_start < lib_end:
            raise ValueError(
                f"Region {region_name!r}: start ({lib_start}) must be >= end ({lib_end})"
            )

        # The FASTA runs from high to low genomic coordinates, so the offset of
        # a coordinate into the sequence is (fasta_start - coordinate).
        first_offset = fasta_start - lib_start
        last_offset = fasta_start - lib_end
        if first_offset < 0:
            # A negative index would silently wrap around in the slice below.
            raise ValueError(
                f"Region {region_name!r} starts at {lib_start}, which lies before "
                f"the FASTA start coordinate {fasta_start}"
            )

        # Inclusive slice, then reverse complement for the antisense sequence.
        window = record.seq[first_offset:last_offset + 1]
        bases = list(str(window.reverse_complement()))

        # After the reverse complement the bases run from end up to start.
        positions = list(range(lib_end, lib_start + 1))
        if len(bases) != len(positions):
            raise ValueError(
                f"Region {region_name!r}: expected {len(positions)} bases but the "
                f"FASTA record only provided {len(bases)}"
            )

        frames.append(
            pd.DataFrame({'target': region_name, 'Reference': bases, 'pos': positions})
        )

    return pd.concat(frames)

In [None]:
def annotate_introns(df, exon_ranges=None):
    """Label every row of ``df`` as 'Exon' or 'Intron' based on its position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``pos`` column of genomic coordinates. It is modified
        in place: an ``Intron/Exon`` column is added (or overwritten).
    exon_ranges : iterable of (start, end), optional
        Exon intervals with ascending, inclusive coordinates. Defaults to the
        notebook-level ``exons`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying the
        ``Intron/Exon`` column.
    """
    if exon_ranges is None:
        exon_ranges = exons

    # Expand the inclusive intervals into the set of all exonic coordinates.
    exon_positions = {
        position
        for exon_start, exon_end in exon_ranges
        for position in range(exon_start, exon_end + 1)
    }

    # Compute the membership mask once and reuse it for both labels.
    in_exon = df['pos'].isin(exon_positions)

    df['Intron/Exon'] = None
    df.loc[in_exon, 'Intron/Exon'] = 'Exon'
    df.loc[~in_exon, 'Intron/Exon'] = 'Intron'

    return df


In [None]:
#saves to excel
def save_to_xlsx(df):
    """Write ``df`` to the Excel workbook named by the notebook-level ``final_name``."""
    output_path = final_name
    df.to_excel(output_path)


In [None]:
def main():
    """Run the pipeline: parse the reference, annotate introns/exons, save, print.

    The frame returned by annotate_introns() is the one that gets saved and
    printed, so the result does not depend on annotate_introns() modifying its
    argument in place.
    """
    ref_seqs = parse()
    annotated_ref = annotate_introns(ref_seqs)
    save_to_xlsx(annotated_ref)
    print(annotated_ref)

In [None]:
# Entry point: run the whole pipeline defined in main().
main()