In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [3]:

# Load the tab-separated WGS variant table and report cohort-level counts.
variant_table = pd.read_csv('wgs_input.txt', sep='\t')

num_indiv = len(set(variant_table['UDN_ID']))
print(num_indiv, 'UDN individuals')

# Per-row key of the form "<UDN_ID>_<gene_name>".
variant_table['ID'] = (
    np.array(variant_table['UDN_ID'])
    + '_'
    + np.array(variant_table['gene_name'])
)

# Distinct (individual, gene) pairs are what the printout calls "diagnostic genes".
individual_gene_pairs = variant_table[['UDN_ID', 'gene_name']].drop_duplicates()
print(len(individual_gene_pairs), 'diagnostic genes')
print(len(variant_table), 'variants')

404 UDN individuals
430 diagnostic genes
541 variants


In [8]:
# Category order passed to make_graph for the family panels.
family_order = [
    'singleton',
    'duo',
    'trio',
    'quad',
    '>quad',
]

# Category order passed to make_graph for the gene panels.
narratives = [
    'single heterozygous',
    'compound heterozygous',
    'homozygous',
    'suspected compound heterozygous',
]

In [4]:
# Rows whose 'Exclude?' value is 'GENOMISER'.
is_genomiser = variant_table['Exclude?'] == 'GENOMISER'
genomiser_variants = variant_table[is_genomiser]

# Distinct combinations of the gene-level columns among those rows.
genomiser_gene_columns = [
    'UDN_ID', 'ID', 'Exclude?', 'Exclude_Reason', 'Fig_Color',
    'narrative', 'MOI', 'Family_Type', 'for_figure',
]
genomiser_subset = genomiser_variants[genomiser_gene_columns].drop_duplicates()
for_genomiser_ids = list(genomiser_subset['ID'])


In [5]:
def make_graph(data, num_indiv, color, denom_type, color_order, y_order='-x',height=150, width=300):
    """Build a horizontal count bar chart with the count printed next to each bar.

    Parameters
    ----------
    data : table handed to ``alt.Chart``; ``len(data)`` appears in the title.
    num_indiv : number of individuals quoted in the title.
    color : Altair field shorthand (e.g. ``'for_figure:N'``) used for both the
        y axis categories and the bar colors.
    denom_type : label for what is being counted (e.g. ``'Genes'``); used in
        the chart title and the x axis title.
    color_order : category list used as the color sort order and scale domain.
    y_order : sort specification for the y axis; ``'-x'`` sorts by descending count.
    height, width : chart size passed to ``properties``.

    Returns
    -------
    A layered Altair chart (bars + count labels) with an independent color scale.
    """
    denom_label = str(denom_type)
    chart_title = str(len(data)) + ' ' + denom_label + ' from ' + str(num_indiv) + ' UDN Individuals'
    base = alt.Chart(data, title=chart_title)

    def category_axis():
        # Bars and labels each get their own, identically configured, y encoding.
        return alt.Y(color, sort=y_order, title=None).axis(offset=5, domainOpacity=0)

    bar_colors = alt.Color(
        color,
        sort=color_order,
        legend=None,
        scale=alt.Scale(scheme='tableau20', domain=color_order),
    )
    bars = base.mark_bar().encode(
        y=category_axis(),
        x=alt.X('count()', title='Number of Diagnostic ' + denom_label),
        color=bar_colors,
        tooltip=['count()', color],
    )

    # Count labels, left-aligned and nudged 3px past the end of each bar.
    count_labels = base.mark_text(
        align='left',
        baseline='middle',
        dx=3,
        color='black',
        size=12,
    ).encode(
        y=category_axis(),
        x=alt.X('count()'),
        text='count():Q',
    )

    layered = alt.layer(bars, count_labels).resolve_scale(color='independent')
    return layered.properties(height=height, width=width)


## WGS Exomiser

In [6]:
# Rows marked 'Yes' or 'YES_NEW' in 'Exclude?' are the excluded set.
is_excluded = variant_table['Exclude?'].isin(['Yes', 'YES_NEW'])
excluded = variant_table[is_excluded][['ID', 'Exclude?', 'Exclude_Reason', 'narrative']].drop_duplicates()
print(len(excluded), 'excluded genes')

# Rows marked 'No' or 'NO_NEW' are kept; collapse them to distinct gene-level rows.
is_included = variant_table['Exclude?'].isin(['No', 'NO_NEW'])
included_gene_columns = [
    'ID', 'gateway_dx', 'Exclude?', 'Exclude_Reason',
    'narrative', 'for_figure', 'UDN_ID', 'MOI',
]
included_genes = variant_table[is_included][included_gene_columns].drop_duplicates()
print(len(included_genes), 'included genes')

# num_indiv is rebound here to the number of individuals in the included set.
num_indiv = len(set(included_genes['UDN_ID']))
print(num_indiv, 'UDN individuals')


147 excluded genes
239 included genes
231 UDN individuals


In [13]:
# Variant-level rows kept for the Exomiser figure ('Exclude?' is 'No' or 'NO_NEW').
included_variants = variant_table[variant_table['Exclude?'].isin(['No', 'NO_NEW'])]

# One row per distinct (individual, family category) pair.
included_families = included_variants.loc[:, ['UDN_ID', 'family_forfig']].drop_duplicates()

# NOTE(review): `order` is not used by any cell visible here; confirm it is still needed.
order = [
    'NONE',
    'ALPHA_MISSENSE',
    'MVP',
    'REVEL',
    'SPLICE_AI',
    'REMM',
]

In [14]:
# Sorted union of the variant types seen in the included and Genomiser tables,
# so the variant panels can share one category order.
variant_order = sorted(
    set(included_variants['variant_type']) | set(genomiser_variants['variant_type'])
)

In [15]:
# WGS Exomiser panels: genes and variants sorted by descending count,
# families in the fixed family_order.
wgs_exomiser_genes = make_graph(
    included_genes, num_indiv, 'for_figure:N', 'Genes',
    color_order=narratives, y_order='-x', height=150,
)
wgs_exomiser_variants = make_graph(
    included_variants, num_indiv, 'variant_type:N', 'Variants',
    color_order=variant_order, y_order='-x', height=200,
)
wgs_exomiser_families = make_graph(
    included_families, num_indiv, 'family_forfig:N', 'Families',
    color_order=family_order, y_order=family_order, height=150,
)

In [18]:
# Stack the gene, variant and family panels vertically.
wgs_exomiser = alt.vconcat(
    wgs_exomiser_genes,
    wgs_exomiser_variants,
    wgs_exomiser_families,
)


In [19]:
# Show the stacked WGS Exomiser chart as this cell's output.
wgs_exomiser

## WGS Genomiser

In [20]:
# Rows whose 'Exclude?' value is 'GENOMISER', then their distinct gene-level rows.
genomiser_rows = variant_table[variant_table['Exclude?'] == 'GENOMISER']
genomiser_subset = genomiser_rows[
    ['UDN_ID', 'ID', 'Exclude?', 'Exclude_Reason', 'Fig_Color',
     'narrative', 'MOI', 'Family_Type', 'for_figure']
].drop_duplicates()
for_genomiser_ids = list(genomiser_subset['ID'])

print(len(for_genomiser_ids), 'genes reserved for Genomiser analysis')
# The variant count is the number of flagged rows before de-duplication.
print(len(genomiser_rows), 'variants')

# num_indiv is rebound here to the number of individuals in the Genomiser set.
num_indiv = len(set(genomiser_subset['UDN_ID']))
print(num_indiv, 'patients')


39 genes reserved for Genomiser analysis
60 variants
39 patients


In [22]:
# Variant-level rows flagged 'GENOMISER'.
genomiser_variants = variant_table.loc[variant_table['Exclude?'] == 'GENOMISER']
print(len(genomiser_variants))

# One row per distinct (individual, family category) pair.
genomiser_families = genomiser_variants.loc[:, ['UDN_ID', 'family_forfig']].drop_duplicates()


60


In [23]:
# WGS Genomiser panels: genes follow the fixed narratives order, variants are
# sorted by descending count, families follow family_order (default height).
wgs_genomiser_genes = make_graph(
    genomiser_subset, num_indiv, 'for_figure:N', 'Genes',
    color_order=narratives, y_order=narratives, height=150,
)
wgs_genomiser_variants = make_graph(
    genomiser_variants, num_indiv, 'variant_type:N', 'Variants',
    color_order=variant_order, y_order='-x', height=200,
)
wgs_genomiser_families = make_graph(
    genomiser_families, num_indiv, 'family_forfig:N', 'Families',
    color_order=family_order, y_order=family_order,
)


In [24]:
# Stack the gene, variant and family panels vertically.
wgs_genomiser = alt.vconcat(
    wgs_genomiser_genes,
    wgs_genomiser_variants,
    wgs_genomiser_families,
)

In [25]:
# Show the stacked WGS Genomiser chart as this cell's output.
wgs_genomiser

## WES Exomiser

In [27]:
# Load the tab-separated WES table; the file is decoded with 'unicode_escape'.
wes_data = pd.read_csv('wes_input.tsv', sep='\t', encoding='unicode_escape')

# NOTE(review): only 'No' is kept here, whereas the WGS cells keep 'No' and
# 'NO_NEW' - confirm the WES table never uses 'NO_NEW'.
wes_included = wes_data.loc[wes_data['Exclude?'] == 'No']
num_indiv_wes = len(set(wes_included['UDN_ID']))

# Distinct gene-level rows and distinct (individual, family category) pairs.
wes_included_genes = wes_included.loc[:, ['ID', 'for_figure', 'family_forfig']].drop_duplicates()
wes_included_families = wes_included.loc[:, ['UDN_ID', 'family_forfig']].drop_duplicates()


In [28]:
# WES panels, mirroring the WGS layout.
wes_genes = make_graph(
    wes_included_genes, num_indiv_wes, 'for_figure:N', 'Genes',
    color_order=narratives, y_order='-x',
)
# NOTE(review): variant_order is built from the WGS tables only; a variant type
# present only in the WES data would fall outside the color scale domain.
wes_variants = make_graph(
    wes_included, num_indiv_wes, 'variant_type:N', 'Variants',
    color_order=variant_order, y_order='-x', height=200,
)
wes_families = make_graph(
    wes_included_families, num_indiv_wes, 'family_forfig:N', 'Families',
    color_order=family_order, y_order=family_order,
)


In [29]:
# Stack the gene, variant and family panels vertically.
wes = alt.vconcat(
    wes_genes,
    wes_variants,
    wes_families,
)


In [30]:
# Final figure: the three stacked columns side by side, with shared axis and
# legend styling applied once at the top level.
(
    alt.hconcat(wes, wgs_exomiser, wgs_genomiser)
    .configure_axis(
        grid=False,
        labelPadding=5,
        labelLimit=0,
        labelFontSize=13,
        titleFontSize=15,
        labelFont='arial',
        tickSize=8,
    )
    .configure_legend(
        labelLimit=0,
        labelFontSize=12,
        titleFontSize=15,
        labelFont='arial',
    )
)