In [None]:
import pandas as pd
import altair as alt
import numpy as np

In [None]:
# Input locations used by main().
# NOTE(review): `file` is an absolute, user-specific path; repoint it (ideally to a path
# relative to the repository, like the two below) before running on another machine.
file = '/Users/ivan/Downloads/BARD1.snvscores.tsv' #path to SGE scores
ref_path = '../Data/SNV_filtering_inputs/20240809_BARD1_SNVlib_ref_seqs_intron_annotated.xlsx' #path to reference sequence
coord_file = '../Data/SNV_filtering_inputs/20250415_BARD1_filter_entry.xlsx' #workbook whose 'targets' sheet gives target / start / end for each region
#coords = [(214809,214809500),(214797050,214797156)] #genomic coordinates for exon to make map

In [None]:
def get_region_coords(file):
    """List every target in the workbook together with its coordinate span.

    Args:
        file: Path to an Excel workbook that has a 'targets' sheet with
            'target', 'start' and 'end' columns.

    Returns:
        A list with one ``(target, (start, end))`` tuple per sheet row, in
        sheet order.
    """
    targets = pd.read_excel(file, sheet_name = 'targets')

    # Rows are addressed by their 0..n-1 labels, as produced by read_excel.
    return [
        (
            targets.at[row, 'target'],
            (targets.at[row, 'start'], targets.at[row, 'end']),
        )
        for row in range(len(targets))
    ]

In [None]:
def read_scores(file,region): #reads scores
    """Load the tab-separated SGE score table and keep the rows of one target.

    Args:
        file: Path (or file-like object) of the tab-separated score table.
        region: Value of the 'target' column to keep.

    Returns:
        DataFrame restricted to ``region`` with 'simplified_consequence' and
        'score' renamed to 'Consequence' and 'snv_score_minmax', 'pos' cast to
        str, and a 'pos_id' column of the form ``"<pos>:<allele>"``. The
        original row labels are retained.
    """
    kept_columns = [
        'exon', 'target', 'pos', 'pos_id', 'Consequence',
        'snv_score_minmax', 'amino_acid_change', 'functional_consequence',
    ]

    scores = (
        pd.read_csv(file, sep = '\t')
        .rename(columns = {'simplified_consequence': 'Consequence', 'score': 'snv_score_minmax'})
        .assign(pos = lambda frame: frame['pos'].astype(str))
        .assign(pos_id = lambda frame: frame['pos'] + ':' + frame['allele'])
    )

    in_region = scores['target'].isin([region])
    return scores.loc[in_region, kept_columns]

In [None]:
def get_reference(ref, coords,region): #pulls out reference sequence
    """Return the reference bases of one target inside a coordinate window.

    Args:
        ref: Path to the Excel workbook holding the reference sequence; it must
            provide 'target', 'Reference' and 'pos' columns.
        coords: ``(start, end)`` pair; both ends are included in the window.
        region: Value of the 'target' column to keep.

    Returns:
        DataFrame with the 'target', 'Reference' and 'pos' columns for rows of
        ``region`` whose 'pos' is one of the integers start..end.
    """
    first_pos, last_pos = coords
    wanted_positions = list(range(first_pos, last_pos + 1))

    sheet = pd.read_excel(ref)
    in_region = sheet['target'].isin([region])
    region_ref = sheet.loc[in_region, ['target', 'Reference', 'pos']]

    return region_ref.loc[region_ref['pos'].isin(wanted_positions)]
    

In [None]:
def reverse_complement(seq_string):
    """Return the reverse complement of a DNA sequence.

    Args:
        seq_string: String of nucleotide letters. ``A``, ``C``, ``G``, ``T`` and
            the ambiguity code ``N`` are accepted, in upper or lower case.

    Returns:
        The sequence read in the opposite direction with every base replaced by
        its complement. Case is preserved and ``N`` stays ``N``.

    Raises:
        ValueError: If the sequence contains any other character. Such
            characters used to be turned into ``"A"`` silently, which produced
            a plausible-looking but wrong sequence.
    """
    bases = "ACGTNacgtn"
    complements = "TGCANtgcan"

    unexpected = sorted(set(seq_string) - set(bases))
    if unexpected:
        raise ValueError(
            "cannot reverse-complement {!r}: unexpected character(s) {}".format(
                seq_string, ", ".join(repr(char) for char in unexpected)
            )
        )

    return seq_string.translate(str.maketrans(bases, complements))[::-1]

In [None]:
def reverse_comp_ref(x_ref): #reverse complements reference for antisense gene
    """Flip a reference table onto the opposite strand.

    Args:
        x_ref: DataFrame with a 'Reference' column holding one base per row.

    Returns:
        A new DataFrame with the rows in reverse order, a fresh 0..n-1 index,
        and 'Reference' replaced by the characters of the reverse complement of
        the original sequence. The frame passed in is left as it was.
    """
    forward_seq = ''.join(x_ref['Reference'].tolist())
    antisense_bases = list(reverse_complement(forward_seq))

    # Reversing the rows keeps each base next to its own coordinate.
    flipped = x_ref[::-1].reset_index(drop = True)
    flipped['Reference'] = antisense_bases

    return flipped

In [None]:
def row_enumerate_ref(ref_df): #enumerates each row for heat map and each column for the base pair number
    """Add the heat-map 'Row' and 'Column' coordinates to a reference table.

    Args:
        ref_df: DataFrame with a 'Reference' column. It is modified in place.

    Returns:
        The same DataFrame object. 'Row' repeats the reference base where it is
        A, C, G or T and is None otherwise; 'Column' numbers the rows 0..n-1 in
        their current order.
    """
    ref_df['Row'] = None
    for base in ('A', 'C', 'G', 'T'):
        ref_df.loc[ref_df['Reference'] == base, 'Row'] = base

    ref_df['Column'] = list(range(len(ref_df)))

    return ref_df
    

In [None]:
def reverse_posid(string): #to reverse complement pos_id for antisense gene
    """Reverse-complement the allele field of a colon-separated variant id.

    Args:
        string: Identifier whose second ':'-separated field is the allele,
            e.g. ``"<pos>:<allele>"``.

    Returns:
        The identifier with only its second field replaced by that field's
        reverse complement; all other fields are kept as they were.

    Raises:
        IndexError: If ``string`` contains no ':'.
    """
    fields = string.split(':')
    return ':'.join(fields[:1] + [reverse_complement(fields[1])] + fields[2:])

In [None]:
def process_data(df): #groups consequence of SNVs, adds reversed IDs
    """Prepare one target's SGE scores for plotting.

    Three things happen, all on a re-indexed copy (the frame passed in is not
    modified):

    1. 'Consequence' terms are collapsed into display groups (Missense,
       Synonymous, Intronic, Stop, Splice Region, UTR). Terms that are not
       listed below are kept unchanged.
    2. The allele in every 'pos_id' is reverse-complemented with
       ``reverse_posid``.
    3. A 'Function Type' label is derived from 'snv_score_minmax', row by row:
       'Depleted' below ``path_max``, 'Enriched' above ``unchanged_max`` and
       'Unchanged' in between. Scores exactly on a threshold count as
       'Unchanged' (they previously received no label at all). Missing scores
       stay NaN.

    Labels are assigned per row, so rows that happen to share a 'pos_id' no
    longer overwrite one another's label.

    Args:
        df: DataFrame with 'Consequence', 'pos_id' and 'snv_score_minmax'
            columns.

    Returns:
        A new DataFrame with a 0..n-1 index, the updated 'Consequence' and
        'pos_id' columns and the added 'Function Type' column.
    """
    consequence_groups = {
        'missense_variant': 'Missense',
        'synonymous_variant': 'Synonymous',
        'intron_variant': 'Intronic',
        'stop_gained': 'Stop',
        'stop_lost': 'Stop',
        'stop_retained_variant': 'Stop',
        'splice_polypyrimidine_tract_variant': 'Splice Region',
        'splice_region_variant': 'Splice Region',
        'splice_acceptor_variant': 'Splice Region',
        'splice_donor_region_variant': 'Splice Region',
        'splice_donor_5th_base_variant': 'Splice Region',
        'splice_donor_variant': 'Splice Region',
        '3_prime_UTR_variant': 'UTR',
    }

    path_max = 0.6 #SGE scores used to create each score group
    unchanged_max = 1.1

    df = df.reset_index(drop = True)

    df['Consequence'] = df['Consequence'].map(
        lambda term: consequence_groups.get(term, term)
    )
    df['pos_id'] = [reverse_posid(pos_id) for pos_id in df['pos_id']]

    scores = df['snv_score_minmax']
    # Object dtype so the column can hold the string labels next to NaN.
    function_type = pd.Series(np.nan, index = df.index, dtype = object)
    function_type[scores < path_max] = 'Depleted'
    function_type[(scores >= path_max) & (scores <= unchanged_max)] = 'Unchanged'
    function_type[scores > unchanged_max] = 'Enriched'
    df['Function Type'] = function_type

    return df
    
    

In [None]:
def row_enumerate_data(df): #enumerates row in SGE data for base change
    """Add the heat-map 'Row' (the variant base) and reverse the row order.

    The base is the second ':'-separated field of 'pos_id'. Rows whose base is
    not A, C, G or T get ``None``.

    The work is done in a single pass on a copy, so the frame passed in is not
    modified and its index does not have to be 0..n-1.

    Args:
        df: DataFrame with a 'pos_id' column of the form ``"<pos>:<allele>"``.

    Returns:
        A new DataFrame with the added 'Row' column, rows in reverse order and
        a fresh 0..n-1 index.
    """
    plotted_bases = {'A', 'C', 'G', 'T'}

    alleles = [pos_id.split(':')[1] for pos_id in df['pos_id']]
    rows = pd.Series(
        [allele if allele in plotted_bases else None for allele in alleles],
        index = df.index,
        dtype = object,  # keeps None as None instead of coercing it
    )

    enumerated = df.assign(Row = rows)
    return enumerated[::-1].reset_index(drop = True)
    

In [None]:
def column_enumerate_data(df,refdf): #enumerates column for basepair that was changed
    """Attach the heat-map 'Column' of each variant by looking up its position.

    The position is the integer before the first ':' in 'pos_id'; it is looked
    up in ``refdf`` ('pos' -> 'Column').

    Variants whose position is not present in ``refdf`` get NaN and a warning
    is issued, instead of the whole call failing with a bare ``KeyError``.
    The frame passed in is not modified.

    Args:
        df: DataFrame with a 'pos_id' column of the form ``"<pos>:<allele>"``.
        refdf: DataFrame with 'pos' and 'Column' columns.

    Returns:
        A copy of ``df`` with a float 'Column' column.
    """
    import warnings

    column_by_pos = dict(zip(refdf['pos'], refdf['Column']))

    positions = pd.Series(
        [int(pos_id.split(':')[0]) for pos_id in df['pos_id']],
        index = df.index,
        dtype = 'int64',
    )
    # Float keeps the dtype stable whether or not some lookups come back NaN.
    columns = positions.map(column_by_pos).astype(float)

    unmatched = int(columns.isna().sum())
    if unmatched:
        warnings.warn(
            "{} variant(s) have a position that is absent from the reference; "
            "their 'Column' is NaN".format(unmatched),
            stacklevel = 2,
        )

    return df.assign(Column = columns)

In [None]:
def heatmap(data, letters,region):
    """Draw, display and return the SGE score heat map for one target.

    Variant cells are coloured by 'snv_score_minmax' and outlined by
    'Consequence'; the reference base is written as text in the cell where the
    variant base would equal the reference.

    Args:
        data: Variant table with 'Row', 'Column', 'snv_score_minmax' and
            'Consequence' columns.
        letters: Reference table with 'Row', 'Column' and 'Reference' columns.
        region: Target name; the text after its first '_' is used in the
            "Exon ..." title.

    Returns:
        The layered Altair chart (it is also shown as a side effect).
    """
    # Drop every variant cell that coincides with a reference letter.
    merged = data.merge(letters, on=['Row', 'Column'], how='left', indicator=True)
    is_variant_cell = merged['_merge'] == 'left_only'
    variant_cells = merged[is_variant_cell].drop(columns=['Reference', '_merge'])

    # Score values and the colours they are mapped to.
    score_stops = [-2.1, 0, 0.6, 1, 2]
    score_colors = ['#d73027', '#fc8d59', '#fee08b', '#d9ef8b', '#1a9850']

    # Cell size and the gap between neighbouring cells.
    cell_size = 15
    gap = 7.5
    pitch = cell_size + gap

    chart_width = pitch * len(letters) - gap
    chart_height = pitch * len(data['Row'].unique()) - gap

    region_fields = region.split('_')
    chart_title = 'Exon ' + region_fields[1]

    def basepair_axis():
        # Built per layer so the two layers do not share one encoding object.
        return alt.X('Column:N', title='Basepair', axis=alt.Axis(labelAngle=0))

    def snv_axis():
        return alt.Y('Row:N', title='SNV')

    # Score rectangles, outlined by consequence.
    score_layer = (
        alt.Chart(variant_cells)
        .mark_rect(width=cell_size, height=cell_size, strokeWidth=2)
        .encode(
            x=basepair_axis(),
            y=snv_axis(),
            color=alt.Color(
                'snv_score_minmax:Q',
                title='SGE Score',
                scale=alt.Scale(domain=score_stops, range=score_colors),
            ),
            stroke=alt.Stroke(
                'Consequence:N',
                title='Consequence',
                legend = alt.Legend(symbolFillColor = 'white'),
            ),
        )
        .properties(width= chart_width, height= chart_height)
    )

    # Reference letters drawn on top.
    letter_layer = (
        alt.Chart(letters)
        .mark_text(align='center', baseline='middle', fontSize=14)
        .encode(
            x=basepair_axis(),
            y=snv_axis(),
            text='Reference:N',
            color=alt.value('black'),
        )
    )

    chart = (score_layer + letter_layer).properties(title = chart_title)

    # Display the chart
    chart.show()

    return chart

In [None]:
def main():
    """Build one heat map per target and display them stacked vertically.

    Targets and their coordinates come from ``coord_file``; scores come from
    ``file`` and the reference sequence from ``ref_path``. Each per-target chart
    is shown by ``heatmap`` as it is built, and the stacked figure is shown at
    the end with the view border removed.

    Stacking works for any number of targets, including exactly one, and the
    view configuration is now actually applied (it used to sit in a branch that
    could never run).

    Raises:
        ValueError: If ``coord_file`` lists no targets.
    """
    region_maps = []

    for region, ref_coords in get_region_coords(coord_file):
        data = read_scores(file,region)
        ref = get_reference(ref_path, ref_coords, region)
        ref_reversed = reverse_comp_ref(ref)
        ref_enumerated = row_enumerate_ref(ref_reversed)
        sge_data_clean = process_data(data)
        sge_row_enumerated = row_enumerate_data(sge_data_clean)
        sge_ready = column_enumerate_data(sge_row_enumerated, ref_enumerated)
        region_maps.append(heatmap(sge_ready, ref_enumerated,region))

    if not region_maps:
        raise ValueError("no targets found in {!r}; nothing to plot".format(coord_file))

    # Configuration may only be set on the outermost chart, hence after stacking.
    joined = alt.vconcat(*region_maps).configure_view(
        stroke = None
    )

    joined.show()

In [None]:
# Run the pipeline: one heat map per target, then the stacked figure.
main()