In [None]:
import pandas as pd
import altair as alt
import numpy as np
from pathlib import Path
import re
#Turns off Altair's row limit on chart datasets so large dataframes can be plotted
alt.data_transformers.disable_max_rows()

In [None]:
#Folder holding the count tables that read_data() loads; this is a relative path,
#so it is resolved against the notebook's current working directory
data_folder = '../Data/QC_dev_data/counts' #Path to data with counts

In [None]:
def read_data(data): #Reads the data
    """Load every tab-separated file in a folder into its own dataframe.

    Parameters
    ----------
    data : str or path-like
        Folder that contains the ``.tsv`` files.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per ``.tsv`` file, ordered by file path. The list is
        empty when the folder contains no ``.tsv`` files.
    """
    data_path = Path(data) #Creates path object

    #'*.tsv' (with the dot) only matches the .tsv extension, and sorting makes the
    #order reproducible because glob yields files in filesystem-dependent order
    tsv_files = sorted(data_path.glob('*.tsv'))

    #Reads each .tsv into a dataframe
    return [pd.read_csv(tsv_file, sep = '\t') for tsv_file in tsv_files]

In [None]:
def process_dfs(dfs): #Processes dataframes - gets minimum read counts
    """Summarise read counts for each dataframe and keep the analysis columns.

    For every input dataframe this adds the minimum and median read count for
    the day 5 columns, for the day 13 columns, and across both timepoints, plus
    a 'Gene' column taken from the text before the first '_' in the first
    'exon' value. The raw per-replicate count columns are dropped from the
    result. The input dataframes are not modified.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Dataframes that contain the twelve D05_*/D13_* count columns and the
        annotation columns listed in ``keep_cols`` below.

    Returns
    -------
    list of pandas.DataFrame
        One processed dataframe per input, in the same order.

    Raises
    ------
    KeyError
        If an input dataframe is missing one of the required columns.
    """
    #Column names of columns with counts, per timepoint
    d5_cols = ['D05_R1_lib1', 'D05_R1_lib2', 'D05_R2_lib1', 'D05_R2_lib2', 'D05_R3_lib1', 'D05_R3_lib2']
    d13_cols = ['D13_R1_lib1', 'D13_R1_lib2', 'D13_R2_lib1', 'D13_R2_lib2', 'D13_R3_lib1', 'D13_R3_lib2']
    all_count_cols = d5_cols + d13_cols #All count columns

    #Useful columns to keep; 'Gene' and the global counts are included because
    #analysis_plots() filters/facets on 'Gene' and reads 'min_count'
    keep_cols = ['chrom', 'pos', 'ref', 'allele', 'exon', 'simplified_consequence', 'score',
                 'functional_consequence', 'standard_error',
                 'D5_min_count', 'D13_min_count', 'D5_median_count', 'D13_median_count',
                 'min_count', 'median_count', 'Gene']

    clean_dfs = [] #Empty list to hold processed dataframes

    #Iterates through provided list of dataframes and does processing
    for df in dfs:
        #.iloc[0] is positional, so this works whatever the index labels are;
        #an empty dataframe has no exon to read the gene name from
        gene = df['exon'].iloc[0].split('_')[0] if len(df) else None

        #assign() returns a new dataframe, leaving the caller's dataframe untouched
        summarised = df.assign(
            D5_min_count = df[d5_cols].min(axis = 1),
            D13_min_count = df[d13_cols].min(axis = 1),
            D5_median_count = df[d5_cols].median(axis = 1),
            D13_median_count = df[d13_cols].median(axis = 1),
            min_count = df[all_count_cols].min(axis = 1), #Global minimum across both timepoints
            median_count = df[all_count_cols].median(axis = 1), #Global median across both timepoints
            Gene = gene,
        )

        clean_dfs.append(summarised[keep_cols]) #Keeps useful columns only

    return clean_dfs

In [None]:
def _exon_number(exon): #Extracts exon number for facet sorting
    """Return the integer following the first 'X' + digits in ``exon``, or 0 if absent."""
    match = re.search(r'X(\d+)', exon)
    return int(match.group(1)) if match else 0


def _count_tooltips(count_field, count_title): #Builds the shared tooltip list
    """Return the tooltip encodings shared by every plot, ending with one count field."""
    return [alt.Tooltip('exon', title = 'Exon: '),
            alt.Tooltip('score', title = 'SGE Score: '),
            alt.Tooltip('functional_consequence', title = 'Func. Consequence: '),
            alt.Tooltip('standard_error', title = 'Std. Error: '),
            alt.Tooltip(count_field, title = count_title)
           ]


def analysis_plots(dfs, genes = ('BARD1',)): #Creates analysis plots
    """Build and display the standard-error QC scatter plots.

    Displays, in order: std. error vs. min./median D5 count (stacked), std.
    error vs. min. D5 count faceted by variant consequence, the same faceted
    by functional consequence, std. error vs. SGE score, and std. error vs.
    min. D5 count faceted by exon.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Processed dataframes containing 'Gene', 'exon', 'score',
        'standard_error', 'simplified_consequence', 'functional_consequence',
        'D5_min_count' and 'D5_median_count'.
    genes : str, iterable of str, or None, optional
        Gene name(s) to plot. Defaults to BARD1 only. Pass None to plot
        every gene present in ``dfs``.

    Returns
    -------
    None
        The charts are rendered with ``.display()``.
    """
    concat_df = pd.concat(dfs) #Concatenates all dataframes

    #Extracts one or a select few genes only
    if genes is not None:
        if isinstance(genes, str):
            genes = [genes] #A bare string would otherwise be split into characters
        concat_df = concat_df[concat_df['Gene'].isin(list(genes))]

    #Copy so the new column is added to an independent dataframe rather than a
    #filtered slice (avoids pandas' SettingWithCopyWarning)
    concat_df = concat_df.copy()
    concat_df['exon_sort'] = concat_df['exon'].apply(_exon_number) #Creates exon number column for facet sorting

    #Creates std. error vs. minimum D5 read count plot
    min_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('D5_min_count', title = 'Min. D5 Count'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _count_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet('Gene'
    ).resolve_scale(
        x = 'independent'
    ).interactive()

    #Creates std. error vs. median D5 read count plot
    med_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('D5_median_count', title = 'Med. D5 Count'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _count_tooltips('D5_median_count', 'Med. D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet('Gene'
    ).resolve_scale(
        x = 'independent'
    ).interactive()

    #Concatenates and displays min. and median plots
    plots = alt.vconcat(min_plot, med_plot)
    plots.display()

    #Creates std. error vs. min. D5 read count faceted by variant consequence
    consequences = alt.Chart(concat_df).mark_point().encode(
        x = 'D5_min_count',
        y = 'standard_error',
        color = 'simplified_consequence',
        #Tooltip shows the plotted D5 minimum so it matches its 'Min D5 Count' label
        tooltip = _count_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet(column = 'simplified_consequence',
            row = 'Gene'
           )

    consequences.display()

    #Creates std. error vs. min. D5 read count faceted by functional consequence
    function = alt.Chart(concat_df).mark_point().encode(
        x = 'D5_min_count',
        y = 'standard_error',
        color = 'simplified_consequence',
        tooltip = _count_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet(row = 'Gene',
            column = 'functional_consequence'
           )
    function.display()

    #Creates std. error vs. SGE score
    sge_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('score', title = 'SGE Score'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _count_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet('Gene'
    ).resolve_scale(
        x = 'independent'
    ).interactive()

    sge_plot.display()

    #Creates std. error vs. min. D5 read count faceted by exon
    byexon_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('D5_min_count', title = 'D5 Min. Count'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _count_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet(facet = alt.Facet('exon', sort = {'field': 'exon_sort', 'op': 'min', 'order': 'ascending'})
    ).resolve_scale(
        x = 'independent'
    ).interactive()

    byexon_plot.display()


In [None]:
def main():
    """Run the notebook pipeline: read the count tables, process them, and plot."""
    raw_frames = read_data(data_folder) #Loads the .tsv files from the configured folder
    analysis_plots(process_dfs(raw_frames)) #Processes the dataframes and displays the plots

In [None]:
#Runs the full pipeline: reads the data, processes it, and displays the plots
main()