In [3]:
import pandas as pd
import altair as alt
import sys


In [4]:
def create_df(highest_rank, compare, successTable):
    """Build the cumulative rank table used for the rank-vs-percent plots.

    For every run type in ``compare`` the rows of ``successTable`` whose
    ``'Variant_Level_noMOI_<runtype>'`` column equals
    ``'Variant_Present_noMOI'`` are selected, and for each rank cutoff the
    number of those rows with ``'Variant_Level_noMOI_rank_<runtype>'`` at or
    below the cutoff is counted.

    Parameters
    ----------
    highest_rank : int
        Exclusive upper bound of the rank cutoffs; cutoffs run over
        ``range(1, highest_rank)``, i.e. ``1 .. highest_rank - 1``.
    compare : iterable
        Run-type identifiers; each is converted with ``str()`` and appended to
        the two column-name prefixes above.
    successTable : pandas.DataFrame
        Table holding the status and rank columns for every run type.

    Returns
    -------
    df : pandas.DataFrame
        One row per (rank cutoff, run type) with columns ``'Rank'``,
        ``'NumPatients'`` (count of selected rows within the cutoff),
        ``'Percent_Variants'`` (that count as a percentage of all rows in
        ``successTable``) and ``'Run_Type'``.
    denom : int
        ``len(successTable)``, the denominator of every percentage.
    max_data : list
        ``[runtype, count, percent]`` per run type, taken at the largest
        cutoff evaluated (``highest_rank - 1``); count and percent are 0 when
        no cutoff is evaluated.
    """
    denom = len(successTable)
    ranks = range(1, highest_rank)

    def _percent(count):
        # An empty table has no meaningful percentage; report 0 instead of
        # raising ZeroDivisionError.
        return (count / denom) * 100 if denom else 0.0

    data = []
    max_data = []
    for runtype in compare:
        status_col = 'Variant_Level_noMOI_' + str(runtype)
        rank_col = 'Variant_Level_noMOI_rank_' + str(runtype)

        # The original filter OR-ed this exact condition with itself, which is
        # equivalent to the single test below.
        # NOTE(review): if a second status label was meant to be accepted,
        # add it here.
        success = successTable.loc[successTable[status_col] == 'Variant_Present_noMOI']
        found_ranks = success[rank_col]

        # Start from 0 so an empty cutoff range cannot leave the count unbound
        # or carry over the value from the previous run type.
        num = 0
        for i in ranks:
            num = int((found_ranks <= i).sum())
            data.append([i, num, _percent(num), runtype])

        # Guard the empty selection: max() of nothing would raise ValueError.
        highest = found_ranks.max() if not found_ranks.empty else float('nan')
        print(runtype, 'highest rank:', highest, 'max%:', _percent(num), num)
        max_data.append([runtype, num, _percent(num)])

    df = pd.DataFrame(data, columns=['Rank', 'NumPatients', 'Percent_Variants', 'Run_Type'])
    return df, denom, max_data

In [5]:
def create_plot(denom, source, compare, color_scheme, domain, zoom_max_rank=30, include_full_range=False):
    """Build the rank-vs-percent line chart for a set of run types.

    Parameters
    ----------
    denom : int
        Number shown at the start of the chart title.
    source : pandas.DataFrame
        Table with ``'Rank'``, ``'Percent_Variants'`` and ``'Run_Type'``
        columns (the frame returned by ``create_df``).
    compare : list
        Legend sort order for the full-range chart; only used when
        ``include_full_range`` is true.
    color_scheme : str
        Name of the colour scheme passed to ``alt.Scale(scheme=...)``.
    domain : list
        Run types in colour-scale order; also the legend sort order of the
        zoomed chart.
    zoom_max_rank : int, optional
        Largest rank kept in the zoomed chart (default 30).
    include_full_range : bool, optional
        When true, an interactive chart over every rank in ``source`` is
        stacked above the zoomed chart. Defaults to False, which returns only
        the zoomed chart.

    Returns
    -------
    alt.VConcatChart
        ``alt.vconcat`` of the zoomed chart, optionally preceded by the
        full-range chart. No ``configure_*`` calls are applied, so the result
        can be concatenated with other charts and configured once at the top.
    """
    # A space separates the count from the text (the original title rendered
    # as e.g. "296Diagnostic Variants ...").
    title = str(denom) + ' Diagnostic Variants (Variant Level - no MOI requirement)'

    zoom_source = source.loc[source['Rank'] <= zoom_max_rank]
    zoomChart = alt.Chart(zoom_source, title=title).mark_line(point=alt.OverlayMarkDef(size=50)).encode(
        x=alt.X('Rank', title='Exomiser Rank of Causal Variant'),
        y=alt.Y('Percent_Variants', title='Percent of Causal Variants within Exomiser Rank', scale=alt.Scale(domain=[0,100])),
        color=alt.Color('Run_Type:N', sort=domain, scale=alt.Scale(domain=domain,scheme=color_scheme)),
        tooltip=['Rank', 'Percent_Variants', 'Run_Type']
    ).properties(
        width=400,
        height=300)

    if not include_full_range:
        return alt.vconcat(zoomChart)

    # Full-range view. The original always built this chart and then threw it
    # away by overwriting the result; it is now built only on request.
    bigChart = alt.Chart(source, title=title).mark_line().encode(
        x=alt.X('Rank', title='Exomiser Rank of Causal Variant'),
        y=alt.Y('Percent_Variants', title='Percent of Causal Variants within Exomiser Rank', scale=alt.Scale(domain=[0,100])),
        color=alt.Color('Run_Type:N',sort=compare,scale=alt.Scale(domain=domain,scheme=color_scheme)),
        tooltip=['Rank', 'Percent_Variants', 'Run_Type']
    ).properties(
        width=500,
        height=400).interactive()
    return alt.vconcat(bigChart, zoomChart)


In [7]:
# Load the per-variant results table and report its size.
success_table = pd.read_csv('../../02_exomiser_manuscript/supp_fig_20/MAF_input.tsv', sep='\t')
# NOTE(review): the label says 'genes' but the count is of unique 'ID' values - confirm the wording.
print(len(set(success_table['ID'])),'genes')
print(len(success_table), 'variants')
mapping = pd.read_csv('../../02_exomiser_manuscript/GS_ID_mapping.csv')

# Attach the 'Dumb_ID' from the mapping file to every row via a vectorised
# lookup instead of scanning the whole mapping table once per row.
# As before, every ID must match exactly one mapping row.
relevant_mapping = mapping[mapping['ID'].isin(success_table['ID'])]
duplicated_ids = relevant_mapping.loc[relevant_mapping['ID'].duplicated(), 'ID'].unique().tolist()
if duplicated_ids:
    raise ValueError('IDs with more than one mapping row: ' + str(duplicated_ids))
unmapped_ids = success_table.loc[~success_table['ID'].isin(relevant_mapping['ID']), 'ID'].unique().tolist()
if unmapped_ids:
    raise ValueError('IDs missing from the mapping file: ' + str(unmapped_ids))

# `mapped` is kept as a plain list in case other cells still refer to it.
mapped = success_table['ID'].map(relevant_mapping.set_index('ID')['Dumb_ID']).tolist()
success_table['Dumb_ID'] = mapped

239 genes
296 variants


## Population frequency sources

In [10]:
# Run types to tabulate, in the order their summary lines are printed.
run_types = [
    'filtered_default_no_UK10K',
    'filtered_default_founder_pops',
    'filtered_default_uk10k_1000G_topmed_esp',
    'filtered_default_no_gnomAD',
    'filtered_default_1000G_topmed_esp',
    'noN_filtered_15_85_exomiser_default',
]
# The same run types in colour-scale / legend order.
domain = [
    'noN_filtered_15_85_exomiser_default',
    'filtered_default_founder_pops',
    'filtered_default_uk10k_1000G_topmed_esp',
    'filtered_default_no_UK10K',
    'filtered_default_no_gnomAD',
    'filtered_default_1000G_topmed_esp',
]

gs_exomiser_df, denom, max_data = create_df(300, run_types, success_table)

# Kept unconfigured so it can be concatenated with other charts later.
freq_exomiser = create_plot(denom, gs_exomiser_df, domain, 'category10', domain)

# Display-only styling: the configured copy is shown as the cell output and
# is not assigned back to `freq_exomiser`.
axis_style = dict(
    labelPadding=5,
    labelLimit=0,
    labelFontSize=15,
    titleFontSize=15,
    labelFont='arial',
    tickSize=8,
)
legend_style = dict(labelLimit=0, labelFontSize=15, titleFontSize=15, labelFont='arial')
freq_exomiser.configure_axis(**axis_style).configure_legend(**legend_style)


filtered_default_no_UK10K highest rank: 223.0 max%: 96.95945945945947 287
filtered_default_founder_pops highest rank: 209.0 max%: 95.94594594594594 284
filtered_default_uk10k_1000G_topmed_esp highest rank: 736.0 max%: 94.25675675675676 279
filtered_default_no_gnomAD highest rank: 736.0 max%: 94.25675675675676 279
filtered_default_1000G_topmed_esp highest rank: 3577.0 max%: 87.5 259
noN_filtered_15_85_exomiser_default highest rank: 218.0 max%: 96.95945945945947 287


## Minor allele frequency (MAF) thresholds

In [14]:
# Run types to tabulate, in the order their summary lines are printed.
run_types = [
    'filtered_default_all_freq_sources_MAF1',
    'filtered_default_all_freq_sources_MAF1.5',
    'filtered_default_all_freq_sources_MAF3',
    'filtered_default_all_freq_sources',
]
# The same run types in colour-scale / legend order.
domain = [
    'filtered_default_all_freq_sources',
    'filtered_default_all_freq_sources_MAF1',
    'filtered_default_all_freq_sources_MAF1.5',
    'filtered_default_all_freq_sources_MAF3',
]

gs_exomiser_df, denom, max_data = create_df(300, run_types, success_table)
# Show the per-run-type summary as the cell output.
max_data


filtered_default_all_freq_sources_MAF1 highest rank: 212.0 max%: 96.28378378378379 285
filtered_default_all_freq_sources_MAF1.5 highest rank: 216.0 max%: 96.95945945945947 287
filtered_default_all_freq_sources_MAF3 highest rank: 218.0 max%: 96.95945945945947 287
filtered_default_all_freq_sources highest rank: 218.0 max%: 96.95945945945947 287


[['filtered_default_all_freq_sources_MAF1', 285, 96.28378378378379],
 ['filtered_default_all_freq_sources_MAF1.5', 287, 96.95945945945947],
 ['filtered_default_all_freq_sources_MAF3', 287, 96.95945945945947],
 ['filtered_default_all_freq_sources', 287, 96.95945945945947]]

In [15]:

# Chart for the MAF comparison, built from the table computed in the previous cell.
maf_plot = create_plot(
    denom=denom,
    source=gs_exomiser_df,
    compare=domain,
    color_scheme='category10',
    domain=domain,
)


In [16]:
# Stack the two comparison charts; each keeps its own colour scale and legend.
combined = alt.vconcat(freq_exomiser, maf_plot).resolve_scale(color='independent')

# Styling is applied once, on the top-level chart, and the configured chart is
# the cell output.
axis_style = dict(
    labelPadding=5,
    labelLimit=0,
    labelFontSize=15,
    titleFontSize=15,
    labelFont='arial',
    tickSize=8,
)
legend_style = dict(labelLimit=0, labelFontSize=15, titleFontSize=15, labelFont='arial')
combined.configure_axis(**axis_style).configure_legend(**legend_style)
