In [None]:
import pandas as pd
import altair as alt
import numpy as np
from pathlib import Path
import re
alt.data_transformers.disable_max_rows()

In [None]:
# Folder holding the count tables; it is passed to read_data(), which loads
# every file in it matching '*tsv'. The path is relative, so it resolves
# against the notebook's current working directory.
data_folder = '../Data/QC_dev_data/counts'

In [None]:
def read_data(data):
    """Load every tab-separated table found directly inside a folder.

    Parameters
    ----------
    data : str or path-like
        Folder to search. Only entries whose name matches the glob ``*tsv``
        are read; sub-folders are not searched.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching file, ordered by path. The list is empty when
        nothing matches.
    """
    data_path = Path(data)

    # glob() yields entries in filesystem order, which differs between
    # machines and runs; sorting keeps the load order (and therefore the row
    # order after concatenation downstream) reproducible.
    tsv_files = sorted(data_path.glob('*tsv'))

    return [pd.read_csv(tsv_file, sep = '\t') for tsv_file in tsv_files]

In [None]:
def process_dfs(dfs):
    """Summarise replicate counts per row and trim each table for plotting.

    For every input frame this adds the row-wise minimum and median of the
    six D05 count columns, of the six D13 count columns, and of all twelve
    together, plus a constant ``Gene`` column, then keeps only the annotation
    and summary columns.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables that contain the twelve ``D05_*`` / ``D13_*`` count columns and
        the annotation columns listed in ``keep_cols`` below.

    Returns
    -------
    list of pandas.DataFrame
        New trimmed frames, one per non-empty input frame. The input frames
        are left unmodified.

    Raises
    ------
    KeyError
        If a frame lacks one of the required columns.
    """
    d5_cols = ['D05_R1_lib1', 'D05_R1_lib2', 'D05_R2_lib1', 'D05_R2_lib2', 'D05_R3_lib1', 'D05_R3_lib2']
    d13_cols = ['D13_R1_lib1', 'D13_R1_lib2', 'D13_R2_lib1', 'D13_R2_lib2', 'D13_R3_lib1', 'D13_R3_lib2']
    all_count_cols = d5_cols + d13_cols

    # 'min_count', 'median_count' and 'Gene' must survive the trim: the
    # plotting step filters and facets on them.
    keep_cols = ['chrom', 'pos', 'ref', 'allele', 'exon', 'simplified_consequence',
                 'score', 'functional_consequence', 'standard_error',
                 'D5_min_count', 'D13_min_count', 'D5_median_count', 'D13_median_count',
                 'min_count', 'median_count', 'Gene']

    clean_dfs = []
    for df in dfs:
        # A table with a header but no rows has no exon value to take the
        # gene name from and contributes nothing to the plots.
        if df.empty:
            continue

        # The gene name is the text before the first underscore of the first
        # row's exon value. .iloc is positional, so this also works when the
        # index does not start at 0.
        gene = df['exon'].iloc[0].split('_')[0]

        # assign() builds a new frame, so the caller's frame is not mutated.
        summarised = df.assign(
            D5_min_count = df[d5_cols].min(axis = 1),
            D13_min_count = df[d13_cols].min(axis = 1),
            D5_median_count = df[d5_cols].median(axis = 1),
            D13_median_count = df[d13_cols].median(axis = 1),
            min_count = df[all_count_cols].min(axis = 1),
            median_count = df[all_count_cols].median(axis = 1),
            Gene = gene,
        )
        clean_dfs.append(summarised[keep_cols])

    return clean_dfs

In [None]:
def _exon_number(exon):
    """Return the integer that follows 'X' in an exon label, or 0 if there is none."""
    match = re.search(r'X(\d+)', exon)
    return int(match.group(1)) if match else 0


def _variant_tooltips(count_field, count_title):
    """Build the tooltip list shared by every chart, ending with one count field."""
    return [alt.Tooltip('exon', title = 'Exon: '),
            alt.Tooltip('score', title = 'SGE Score: '),
            alt.Tooltip('functional_consequence', title = 'Func. Consequence: '),
            alt.Tooltip('standard_error', title = 'Std. Error: '),
            alt.Tooltip(count_field, title = count_title)
           ]


def analysis_plots(dfs, genes = ('BARD1',)):
    """Render scatter plots of standard error against counts and scores.

    The frames are concatenated, restricted to the requested genes, and then
    displayed as, in order:

    1. std. error vs. D5 minimum count stacked over std. error vs. D5 median
       count, faceted by gene;
    2. std. error vs. overall minimum count, faceted by gene (rows) and
       simplified consequence (columns);
    3. the same, with functional consequence as the columns;
    4. std. error vs. score, faceted by gene;
    5. std. error vs. D5 minimum count, faceted by exon in numeric exon order.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames that provide the columns 'Gene', 'exon', 'score',
        'standard_error', 'simplified_consequence', 'functional_consequence',
        'D5_min_count', 'D5_median_count' and 'min_count'.
    genes : str, iterable of str, or None, optional
        Gene name(s) to keep. Defaults to ('BARD1',); None keeps every gene.

    Returns
    -------
    None
        Charts are shown through their ``display()`` method.
    """
    concat_df = pd.concat(dfs, ignore_index = True)

    if genes is not None:
        # isin() rejects a bare string, so wrap a single gene name in a list.
        wanted = [genes] if isinstance(genes, str) else list(genes)
        concat_df = concat_df[concat_df['Gene'].isin(wanted)]

    # assign() returns a new frame instead of writing a column onto a filtered
    # slice, which would trigger pandas' SettingWithCopyWarning.
    concat_df = concat_df.assign(exon_sort = concat_df['exon'].map(_exon_number))

    min_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('D5_min_count',title = 'Min. D5 Count'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _variant_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet('Gene'
        ).resolve_scale(
            x = 'independent'
    ).interactive()

    med_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('D5_median_count',title = 'Med. D5 Count'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _variant_tooltips('D5_median_count', 'Med. D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet('Gene'
        ).resolve_scale(
            x = 'independent'
    ).interactive()

    plots = alt.vconcat(min_plot, med_plot)
    plots.display()

    consequences = alt.Chart(concat_df).mark_point().encode(
        x = 'min_count',
        y = 'standard_error',
        color = 'simplified_consequence',
        tooltip = _variant_tooltips('min_count', 'Min D5/D13 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet(column = 'simplified_consequence',
            row = 'Gene'
           )

    consequences.display()

    # NOTE(review): the x axis here is 'min_count' but the tooltip reports
    # 'D5_min_count'; kept as originally written -- confirm which is intended.
    function = alt.Chart(concat_df).mark_point().encode(
        x = 'min_count',
        y = 'standard_error',
        color = 'simplified_consequence',
        tooltip = _variant_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet(row = 'Gene',
            column = 'functional_consequence'
           )
    function.display()

    sge_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('score',title = 'SGE Score'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _variant_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet('Gene'
        ).resolve_scale(
            x = 'independent'
    ).interactive()

    sge_plot.display()

    # Facets are ordered by the numeric exon index rather than alphabetically
    # by exon label.
    byexon_plot = alt.Chart(concat_df).mark_point().encode(
        x = alt.X('D5_min_count',title = 'D5 Min. Count'),
        y = alt.Y('standard_error', title = 'Std. Error'),
        color = alt.Color('simplified_consequence', legend = alt.Legend(title = 'Consequence')),
        tooltip = _variant_tooltips('D5_min_count', 'Min D5 Count: ')
    ).properties(
        height = 400,
        width = 600
    ).facet(facet = alt.Facet('exon', sort = {'field': 'exon_sort', 'op': 'min', 'order': 'ascending'})
        ).resolve_scale(
            x = 'independent'
    ).interactive()

    byexon_plot.display()


In [None]:
def main():
    """Run the notebook end to end: read the tables, process them, plot them."""
    # Each stage feeds the next: read_data -> process_dfs -> analysis_plots.
    analysis_plots(process_dfs(read_data(data_folder)))

In [None]:
# Entry point: runs main(), which reads from data_folder and displays the plots.
main()