In [None]:
import pandas as pd
import altair as alt
from pathlib import Path
import re

In [None]:
#Lets Altair chart dataframes of any length by disabling its max-rows check
alt.data_transformers.disable_max_rows()

#Folder holding the input data files
data_path = '../Data/QC_dev_data/'

In [None]:
def read_data(data): #Loads every TSV file in a folder into its own dataframe
    """Read all ``*.tsv`` files found directly inside ``data``.

    Each file is loaded separately (the files are NOT merged). Only the
    columns used for QC plotting are kept, and rows without a
    'simplified_consequence' value are dropped. The original row index of
    each file is preserved, so it may have gaps after the drop.

    Parameters
    ----------
    data : str or Path
        Folder to search for TSV files (non-recursive).

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per file, ordered by file path. Empty list if the
        folder contains no TSV files.

    Raises
    ------
    KeyError
        If a file lacks one of the required columns.
    """
    kept_columns = ['pos', 'simplified_consequence', 'standard_error',
                    '95_ci_upper', '95_ci_lower', 'exon', 'score'] #Columns needed downstream

    data_path = Path(data) #Creates path variable

    #glob() yields files in arbitrary, filesystem-dependent order; sorting makes the output reproducible
    all_data = sorted(data_path.glob('*.tsv'))

    all_dfs = [] #Holds one dataframe per input file

    #Iterates through the sorted file list and builds a cleaned dataframe for each file
    for elem in all_data:
        df = pd.read_csv(elem, sep = '\t') #Reads TSV
        df = df[kept_columns] #Keeps useful columns
        df = df.dropna(subset = ['simplified_consequence']) #Drops rows with no consequence annotation
        all_dfs.append(df)

    return all_dfs

In [None]:
def std_error_vs_coord(list_df, output_dir = None): #Creates plot of std. error vs. genomic coordinate
    """Display one standard-error vs. genomic-coordinate chart per dataframe.

    Each chart is faceted by the 'exon' column (3 facets per row), with the
    facets ordered by the number that follows 'X' in the exon label.

    Parameters
    ----------
    list_df : iterable of pandas.DataFrame
        Dataframes to plot. Each needs the columns 'pos', 'standard_error',
        'score', 'simplified_consequence' and 'exon'. The dataframes are not
        modified. Empty dataframes are skipped.
    output_dir : str or Path, optional
        If given, each chart is also saved there as '<gene name>.png', where
        the gene name is the text before the first '_' in the first exon
        label. The folder is created if missing. With the default (None)
        charts are only displayed.

    Returns
    -------
    None
    """
    out_folder = None
    if output_dir is not None:
        out_folder = Path(output_dir)
        out_folder.mkdir(parents = True, exist_ok = True) #Saving must not fail on a missing folder

    #Iterates through each provided dataframe and generates a std. error vs. genomic coordinate plot for each gene
    for gene in list_df:

        if gene.empty: #Nothing to plot, and no exon label to name the plot after
            continue

        exon_labels = gene['exon'].astype(str)

        #Exon number = digits following 'X' in the exon label; labels without a match get 0 (sorted first)
        exon_numbers = pd.to_numeric(
            exon_labels.str.extract(r'X(\d+)', expand = False)
        ).fillna(0).astype(int)

        #assign() returns a copy, so the caller's dataframe does not gain an 'exon_sort' column
        plot_data = gene.assign(exon_sort = exon_numbers)

        #Builds std. error vs. genomic coordinate plot faceted by exon number
        plot = alt.Chart(plot_data).mark_circle().encode(
            x = alt.X('pos:O', axis = alt.Axis(labels = False, title = 'Genomic Coordinate')),
            y = alt.Y('standard_error:Q', axis = alt.Axis(title = 'Standard Error')),
            color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
            tooltip = [alt.Tooltip('pos', title = 'Coordinate: '),
                       alt.Tooltip('standard_error', title = 'Standard Error: '),
                       alt.Tooltip('score', title = 'SGE Score: '),
                       alt.Tooltip('simplified_consequence', title = 'Consequence: ')
                      ]
        ).properties(
            height = 400,
            width = 600
        ).facet(facet = alt.Facet('exon', title = 'Std. Error vs. Genomic Coordinate', sort = {'field': 'exon_sort', 'op': 'min', 'order': 'ascending'}),
                columns = 3
        ).resolve_scale(
            x = 'independent' #Each exon facet gets its own x axis
        )

        if out_folder is not None:
            #.iloc[0] takes the first row by position; a label lookup ([0]) fails once row 0 has been dropped
            gene_name = exon_labels.iloc[0].split('_')[0] #Gets gene name for final plot saving
            plot.save(str(out_folder / (gene_name + '.png'))) #Saves plot

        plot.display()

In [None]:
def main():
    """Load the dataframes from ``data_path`` and display their std. error plots."""
    std_error_vs_coord(read_data(data_path)) #Read step feeds straight into the plotting step

In [None]:
#Runs the workflow defined in main(): reads the data and displays the plots
main()