In [1]:
import pandas as pd
import altair as alt
from pyhpo import Ontology
# Instantiate the pyhpo Ontology once; the returned object is discarded by binding it to `_`.
# Presumably this is what loads the HPO data used by the later Ontology.get_hpo_object calls — confirm against pyhpo docs.
_= Ontology()

In [5]:

## WGS
# Rows flagged 'No' or 'NO_NEW' in the 'Exclude?' column are kept as included genes.
wgs_included_columns = ['ID', 'Exclude?', 'Exclude_Reason', 'narrative', 'for_figure', 'UDN_ID']
variant_table = pd.read_csv('../../02_exomiser_manuscript/supp_fig_02/input.txt', sep='\t')
wgs_included_mask = variant_table['Exclude?'].isin(['No', 'NO_NEW'])
included_genes = variant_table.loc[wgs_included_mask, wgs_included_columns].drop_duplicates()
# UDN_ID is recomputed as the part of 'ID' before the first underscore.
included_genes['UDN_ID'] = [identifier.split('_')[0] for identifier in included_genes['ID']]
included_ids = list(set(included_genes['UDN_ID']))
print(len(included_ids))

# Rows flagged 'GENOMISER' are tracked separately.
genomiser_columns = ['UDN_ID', 'ID', 'Exclude?', 'Exclude_Reason', 'Fig_Color', 'narrative', 'MOI', 'Family_Type', 'for_figure']
genomiser_mask = variant_table['Exclude?'] == 'GENOMISER'
genomiser_subset = variant_table.loc[genomiser_mask, genomiser_columns].drop_duplicates()
genomiser_subset['UDN_ID'] = [identifier.split('_')[0] for identifier in genomiser_subset['ID']]

# Not de-duplicated, unlike the other ID lists.
for_genomiser_ids = list(genomiser_subset['UDN_ID'])
print(len(for_genomiser_ids))


### WES
es_included_columns = ['ID', 'Exclude?', 'for_figure', 'UDN_ID']
es_variant_table = pd.read_csv(
    '../../UDN_2024/Exomiser_WES/229_variants_phenotips_data_10.01.24_HGVS.tsv',
    sep='\t',
    encoding='unicode_escape',
)
es_included_mask = es_variant_table['Exclude?'] == 'No'
es_included_genes = es_variant_table.loc[es_included_mask, es_included_columns].drop_duplicates()
print(len(es_included_genes), 'ES')
es_included_ids = list(set(es_included_genes['UDN_ID']))
print(len(es_included_ids))

# Union of all three ID lists, de-duplicated.
for_figure_ids = list(set(included_ids + for_genomiser_ids + es_included_ids))
print(len(for_figure_ids), 'total individuals')


# Plain concatenation (duplicates kept) for comparison with the de-duplicated count.
all_ids = included_ids + es_included_ids + for_genomiser_ids
# Evaluated but not displayed: only the cell's final expression is shown.
len(all_ids)
len(set(all_ids))

231
39
130 ES
125
386 total individuals


386

## A/B

In [16]:
def make_graph(data, num_indiv, color, denom_type, x_label, color_order, y_order='-x',height=150, width=300,title=''):
    """Build a horizontal bar chart of row counts per category, with the count printed next to each bar.

    Parameters
    ----------
    data : table passed straight to ``alt.Chart``.
    num_indiv : accepted for call-site compatibility; not used in the body.
    color : Altair field shorthand (e.g. ``'figure:N'``) used for the y axis,
        the bar colour and the tooltip.
    denom_type : accepted for call-site compatibility; not used in the body.
    x_label : title of the x (count) axis of the bar layer.
    color_order : list used both as the colour sort order and as the colour
        scale domain.
    y_order : sort specification for the y axis (default ``'-x'``).
    height, width : size of the layered chart.
    title : chart title.

    Returns
    -------
    A layered Altair chart (bars + count labels).
    """
    def category_axis():
        # Both layers use the same y encoding; build a fresh channel object for each.
        return alt.Y(color, sort=y_order, title=None).axis(offset=5, domainOpacity=0)

    chart = alt.Chart(data, title=title)

    bar_layer = chart.mark_bar().encode(
        y=category_axis(),
        x=alt.X('count()', title=x_label),
        color=alt.Color(
            color,
            sort=color_order,
            legend=None,
            scale=alt.Scale(scheme='tableau20', domain=color_order),
        ),
        tooltip=['count()', color],
    )

    # Count labels sit just right of the bar end (dx=3, left-aligned).
    label_layer = chart.mark_text(
        align='left', baseline='middle', dx=3, color='black', size=12
    ).encode(
        y=category_axis(),
        x=alt.X('count()'),
        text='count():Q',
    )

    layered = alt.layer(bar_layer, label_layer).resolve_scale(color='independent')
    return layered.properties(height=height, width=width)


In [20]:
# Panel: counts of ES excluded genes per 'figure' category.
es_excluded = pd.read_csv('../../02_exomiser_manuscript/supp_fig_03/ES_excluded_table.csv')
# Use first-appearance order rather than list(set(...)): set iteration order for
# strings can change between kernel sessions, which would reshuffle the bar colours.
es_color_order = es_excluded['figure'].unique().tolist()
es_excluded_graph = make_graph(
    es_excluded,
    len(es_excluded),
    'figure:N',
    'Excluded Genes',
    'Number of ES Excluded Genes',
    es_color_order,
    '-x',
    300,
    300,
)
es_excluded_graph


In [21]:
# Panel: counts of GS excluded genes per 'figure' category.
gs_excluded = pd.read_csv('../../02_exomiser_manuscript/supp_fig_03/excluded_table.tsv', sep='\t')
order = ['Structural_Variant','Tentative_Diagnosis', 'Low_Certainty_Diagnosis', 'Complex_Diagnosis',  'Other','Noncoding_Diagnosis' ]
# The colour domain comes from the GS table itself (previously it was taken from the
# ES table, leaving `order` unused). Categories present in the data but missing from
# `order` are appended so that every bar still receives a colour.
gs_color_order = order + [
    category for category in gs_excluded['figure'].unique().tolist() if category not in order
]
gs_excluded_graph = make_graph(
    gs_excluded,
    len(gs_excluded),
    'figure:N',
    'Excluded Genes',
    'Number of GS Excluded Genes',
    gs_color_order,
    '-x',
    300,
    300,
)
gs_excluded_graph

## C

In [6]:
# Phenotype table, restricted to rows whose 'ID' is one of the individuals kept for the figures.
phenotype_data = pd.read_csv(
    '../../02_exomiser_manuscript/supp_fig_03/supp_fig1_input.csv'
).loc[lambda frame: frame['ID'].isin(for_figure_ids)]


In [7]:
# Collect, for every individual, the list of Gateway HPO terms and the primary
# symptom category, in a single pass over the phenotype table.
#
# The code below expects exactly one row per ID. The earlier per-patient lookup
# failed with an opaque ValueError from `.item()` when an ID was repeated; the
# same exception type is raised here, with an explicit message.
if phenotype_data['ID'].duplicated().any():
    raise ValueError("phenotype_data has duplicate 'ID' values; expected one row per individual")

patient_ids = list(phenotype_data['ID'])

term_dict = {}   # patient ID -> list of HPO terms
data = []        # one [patient, number of terms, category] record per individual
all_terms = []   # every assigned term, concatenated across individuals
for patient, raw_terms, phenotype_category in zip(
    phenotype_data['ID'],
    phenotype_data['Terms_Gateway'],
    phenotype_data['Primary_Symptom_Category'],
):
    # Terms are stored as a single '; '-separated string.
    terms = raw_terms.split('; ')
    term_dict[patient] = terms
    data.append([patient, len(terms), phenotype_category])
    all_terms += terms

df = pd.DataFrame(data, columns=['Patient', 'Number_Gateway_Terms', 'Primary_Symptom_Category'])


In [8]:
# Histogram of the number of Gateway terms per individual, with a dashed rule and
# a text label at the median.
median_terms = 'median(Number_Gateway_Terms):Q'

base = alt.Chart(df, title=f"{len(patient_ids)} UDN Individuals Assigned HPO Terms")

bar = base.mark_bar().encode(
    x=alt.X(
        'Number_Gateway_Terms:Q',
        title='Number of Terms Assigned in Gateway',
        bin=alt.BinParams(maxbins=50),
    ),
    y=alt.Y('count()', title='Number of UDN Individuals'),
)

rule = base.mark_rule(color='black', strokeWidth=2, strokeDash=[2, 2]).encode(
    x=median_terms,
    tooltip=[median_terms],
)

# The label is pinned at a fixed pixel offset (y = -10) rather than a data value.
text = base.mark_text(
    align='left', baseline='middle', dx=3, color='black', size=17
).encode(
    y=alt.value(-10),
    x=alt.X(median_terms),
    text=median_terms,
)

term_distrib = alt.layer(bar, text, rule).properties(height=300, width=400)
term_distrib

## D

In [11]:
# Tally how often each HPO term occurs in `all_terms` in one pass (the previous
# per-term `list.count` rescanned the full list for every unique term).
# NOTE(review): this counts occurrences; it equals the number of patients only if
# no individual lists the same term twice — confirm against the input data.
term_counts = pd.Series(all_terms, dtype=object).value_counts()

data2 = []
for term, n_assigned in term_counts.items():
    n_assigned = int(n_assigned)
    try:
        term_name = Ontology.get_hpo_object(term).name
    except Exception:
        # Terms the ontology cannot resolve are kept under a placeholder name.
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        term_name = 'Unknown_Term'
    # Same precision for resolved and unresolved terms.
    percent_patients = round((n_assigned / len(patient_ids)) * 100, 1)
    data2.append([term, n_assigned, percent_patients, term_name])

df2 = pd.DataFrame(data2, columns=['Term', 'numPatients' ,'Percent_Patients','TermName'])

print(len(all_terms), 'HPO terms used')
print(len(set(all_terms)), 'unique HPO terms used')

# Top 20 terms by frequency; ties are broken by term ID so the selection is reproducible.
source = (
    df2.sort_values(by=['numPatients', 'Term'], ascending=[False, True])
    .reset_index(drop=True)
    .head(20)
)

base = alt.Chart(source, title=str(len(patient_ids)) + ' UDN Individuals ~ Frequent HPO Terms')

bars=base.mark_bar().encode(
    y=alt.Y('TermName:N', sort='-x', title=None).axis(offset=5, domainOpacity=0),
    x=alt.X('numPatients', title='Number of Benchmarking Patients Assigned HPO Term'),
    # Tooltip fields must be columns of `source`.
    tooltip = ['TermName:N', 'numPatients:Q', 'Percent_Patients:Q'],
)

text = base.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    color='black',
    size=12
).encode(
    y=alt.Y('TermName:N', sort='-x', title=None).axis(offset=5, domainOpacity=0),
    x=alt.X('numPatients', title='Number of Benchmarking Patients Assigned HPO Term'),
    text='numPatients:Q'
)
common_terms = alt.layer(bars, text).resolve_scale(color='independent').properties(height=300, width=400)
common_terms

9451 HPO terms used
2464 unique HPO terms used


## E

In [10]:

# Horizontal bar chart: number of individuals per primary symptom category,
# with the count printed at the end of each bar.
category_field = 'Primary_Symptom_Category'

base = alt.Chart(df, title=f"{len(patient_ids)} UDN Individuals Primary Symptom Category")

bars = base.mark_bar().encode(
    y=alt.Y(category_field, sort='-x', title=None).axis(offset=5, domainOpacity=0),
    x=alt.X('count()'),
    color=alt.Color(
        category_field + ':N',
        sort='-x',
        legend=None,
        scale=alt.Scale(scheme='tableau20'),
    ),
    tooltip=['count()'],
)

text = base.mark_text(
    align='left', baseline='middle', dx=3, color='black', size=12
).encode(
    y=alt.Y(category_field + ':N', sort='-x', title=None).axis(offset=5, domainOpacity=0),
    x=alt.X('count()', title='Number of UDN Individuals'),
    text='count():Q',
)

layered_categories = alt.layer(bars, text).resolve_scale(color='independent')
cat_distrib = layered_categories.properties(height=300, width=400)
cat_distrib

In [22]:
# Assemble the composite figure.
# Top row: GS excluded genes, ES excluded genes, term-count distribution.
# `gs_excluded_graph` replaces the previously referenced `excluded_graph`, which is
# never defined in this notebook and raised NameError on a fresh kernel.
top_panel = alt.hconcat(gs_excluded_graph, es_excluded_graph, term_distrib)
# Bottom row: most frequent HPO terms, primary symptom categories.
bottom_panel = alt.hconcat(common_terms, cat_distrib)

# Axis/view/title styling is applied once, at the top level of the concatenated chart.
alt.vconcat(top_panel, bottom_panel).configure_axis(
    grid=False,
    labelPadding=5,
    labelLimit=0,
    labelFontSize=15,
    titleFontSize=17,
    labelFont='arial',
    tickSize=8,
).configure_view(strokeOpacity=0).configure_title(fontSize=15)