In [None]:
import pandas as pd 
import altair as alt
import sys
sys.path.append('figure_scripts/')

import plot_scripts as ps

In [None]:
# Load the benchmarking results table (tab-separated) and derive the patient
# identifier, i.e. the part of 'ID' that precedes the first underscore.
success_table = pd.read_csv('supp_fig_10_input.tsv', sep='\t')
success_table['UDN_ID'] = success_table['ID'].map(lambda record_id: record_id.split('_')[0])

# Size of the benchmarking set: unique 'ID' values, table rows, unique patients.
print(f"{len(set(success_table['ID']))} genes")
print(f"{len(success_table)} variants")
print(f"{len(set(success_table['UDN_ID']))} patients")

In [None]:
# Attach the 'Dumb_ID' label from the mapping file to every row of
# success_table, keyed on 'ID'.
mapping = pd.read_csv('GS_ID_mapping.csv')

# Only mapping rows whose ID actually occurs in success_table are relevant.
relevant = mapping[mapping['ID'].isin(success_table['ID'])]

# Every ID in success_table must resolve to exactly one mapping row. Checking
# this up front reports *which* IDs are wrong instead of failing on the first
# bad row with an uninformative `.item()` error.
missing_ids = success_table.loc[~success_table['ID'].isin(mapping['ID']), 'ID'].unique()
duplicated_ids = relevant.loc[relevant['ID'].duplicated(), 'ID'].unique()
if len(missing_ids) or len(duplicated_ids):
    raise ValueError(
        'GS_ID_mapping.csv must contain exactly one row per ID used in success_table; '
        f'missing: {list(missing_ids)}, duplicated: {list(duplicated_ids)}'
    )

# Single vectorised lookup instead of re-filtering `mapping` once per row.
mapped = success_table['ID'].map(relevant.set_index('ID')['Dumb_ID']).tolist()
success_table['Dumb_ID'] = mapped

In [None]:
# Run-type identifiers passed to ps.create_plot for panel A.
# Every identifier shares the same core string; only an optional prefix and
# the trailing suffix vary, so the list is assembled from (prefix, suffix) pairs.
# Note that the "add5/add10/add20" suffixes have no underscore before "noWL".
domain = [
    prefix + 'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI' + suffix
    for prefix, suffix in (
        ('', '_noinheritance_noWL'),
        ('', '_singleton_noWL'),
        ('', '_noterms_noWL'),
        ('', '_random_fromHPO_noWL'),
        ('', '_noWL'),
        ('', '_singleton_noinheritance_noWL'),
        ('curated_', '_noWL'),
        ('', '_random_fromUDN_add5noWL'),
        ('', '_random_fromUDN_add10noWL'),
        ('', '_random_fromUDN_add20noWL'),
    )
]


## A: Effect of randomly sampled phenotypes or no phenotypes

In [None]:
# Run types compared in panel A: the baseline run, random HPO terms, no terms,
# and the three "random_fromUDN_add*" runs.
runtypes = [
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_random_fromHPO_noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noterms_noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_random_fromUDN_add5noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_random_fromUDN_add10noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_random_fromUDN_add20noWL',
]

# create_df / create_plot come from figure_scripts/plot_scripts.py.
# NOTE(review): the meaning of the literal 300 is defined by ps.create_df — confirm there.
df, denom = ps.create_df(300, runtypes, success_table)
plot_A = ps.create_plot(
    denom, df, domain, 'category20', False, domain
).properties(width=400, height=350)

# The configured chart below is only displayed, not stored: plot_A itself is
# left without a config so it can be reused in the combined figure later on.
plot_A.configure_axis(
    labelFont='arial',
    titleFont='arial',
    labelFontSize=15,
    titleFontSize=15,
    labelPadding=5,
    tickSize=8,
    labelLimit=0,
).configure_legend(
    labelFont='arial',
    titleFont='arial',
    labelFontSize=13,
    titleFontSize=15,
    labelLimit=0,
)


## Pruning

In [None]:
# Per-patient phenotype term lists (file produced by "prune_term_lists.py").
phenotype_data = pd.read_csv('udn_phenotype_data.csv')

# Patients whose curated term list is flagged as 'Changed'.
is_changed = phenotype_data['Curated_Changed'] == 'Changed'
changed = phenotype_data.loc[is_changed, 'ID'].tolist()
print(len(changed))
# Hard-coded exclusion of one patient; list.remove raises ValueError if the ID is absent.
changed.remove('UDN195171')
print(len(changed))

# Restrict the benchmarking table to those patients and report its size.
success_table['UDN_ID'] = success_table['ID'].map(lambda record_id: record_id.split('_')[0])
in_changed_cohort = success_table['UDN_ID'].isin(changed)
trimmed_success_table = success_table.loc[in_changed_cohort]
print(f"{len(set(trimmed_success_table['ID']))} genes")
print(f"{len(trimmed_success_table)} variants")
print(f"{len(set(trimmed_success_table['UDN_ID']))} patients")

# Unique patient IDs that remain after trimming.
ids = list(set(trimmed_success_table['UDN_ID']))

In [None]:

# Colour scale shared by the remaining panels: each class label is paired with
# its colour, then the pairs are split into the parallel `domain` / `_range` lists.
# NOTE: this rebinds `domain`, which held the run-type list used for panel A.
domain, _range = map(list, zip(
    ('Comprehensive', '#28993C'),
    ('Curated', 'lightgrey'),
    ('Variant', '#e38cbb'),
))

In [None]:
# Build a per-patient summary of how the curated HPO term list differs from
# the original ("comprehensive") one. Two rows are emitted per patient, one
# per class, so the frame can be plotted with 'Class' as a categorical axis.
data = []
removed = []  # every removed term, repeated once per patient it was removed from
added = []    # every added term, repeated once per patient it was added to

# sorted() makes the row order reproducible between kernel restarts (plain set
# iteration order over strings is not); `udn_id` avoids shadowing builtin `id`.
for udn_id in sorted(set(ids)):
    # Look the patient up once; .item() requires exactly one matching row.
    patient_record = phenotype_data[phenotype_data['ID'] == udn_id]
    old_terms = set(patient_record['Terms'].item().split('; '))
    curated_terms = set(patient_record['Curated_Terms'].item().split('; '))

    # Net change in list length (positive when curation shortened the list).
    difference = len(old_terms) - len(curated_terms)

    removed_terms = old_terms - curated_terms
    added_terms = curated_terms - old_terms
    added.extend(sorted(added_terms))
    removed.extend(sorted(removed_terms))

    data.append([udn_id, len(old_terms), 'Comprehensive', difference, len(removed_terms), len(added_terms)])
    data.append([udn_id, len(curated_terms), 'Curated', difference, len(removed_terms), len(added_terms)])

df_summary = pd.DataFrame(data, columns=['ID', 'Num_Terms', 'Class', 'Diff', 'Removed', 'Added'])
df_summary

In [None]:
# pyhpo supplies the HPO ontology objects used below and in panel B.
from pyhpo import Ontology, HPOSet
# Instantiate the ontology before any term lookup; the return value is not
# needed, so it is bound to `_`.
# NOTE(review): presumably this call is what loads the ontology data — confirm against the pyhpo docs.
_=Ontology()
# Print the HPOSet built from the list of removed terms collected above.
print(HPOSet.from_queries(removed))

## B

In [None]:
# Count, for every removed HPO term, how many patients had it removed.
# `removed` holds each term at most once per patient, so the number of
# occurrences of a term equals the number of patients.
# value_counts() tallies everything in one pass instead of calling
# list.count() once per unique term.
term_counts = pd.Series(removed, dtype='object').value_counts()

data2 = []
for term, n_patients in term_counts.items():
    n_patients = int(n_patients)
    try:
        term_name = Ontology.get_hpo_object(term).name
    except Exception:
        # Term could not be resolved in the loaded ontology; keep the row with
        # a placeholder name. (Exception rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed.)
        term_name = 'Unknown_Term'
    # Percentages are rounded to one decimal for every row, resolved or not.
    percent_patients = round((n_patients / len(ids)) * 100, 1)
    data2.append([term, n_patients, percent_patients, term_name])

df2 = pd.DataFrame(data2, columns=['Term', 'numPatients' ,'Percent_Patients','TermName'])

print(len(removed), 'HPO terms used')
print(len(set(removed)), 'unique HPO terms used')

In [None]:
# Panel B: horizontal bar chart of how many patients had each HPO term removed.
# One descending sort is enough (the earlier ascending sort was immediately overridden).
source = df2.sort_values(by='Percent_Patients', ascending=False).reset_index(drop=True)

base = alt.Chart(source, title=str(len(ids)) + ' UDN Individuals ~ Removed HPO Terms')

bars = base.mark_bar().encode(
    # sort='-x' orders the bars by descending patient count.
    y=alt.Y('TermName:N', sort='-x', title=None).axis(offset=5, domainOpacity=0),
    x=alt.X('numPatients', title='Number of Benchmarking Patients with HPO Term Removed'),
    # Only fields that exist in df2 are listed; the previous tooltip pointed
    # at 'PhenoCat_IC', which is not a column of this frame.
    tooltip=['TermName:N', 'numPatients:Q', 'Percent_Patients:Q'],
)

# Count labels for the bars. Defined for convenience but currently NOT added
# to the layered chart below.
text = base.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    color='black',
    size=14
).encode(
    y=alt.Y('TermName:N', sort='-x', title=None).axis(offset=5, domainOpacity=0),
    x=alt.X('numPatients'),
    text='numPatients:Q'
)

removed_terms_plot = alt.layer(bars).resolve_scale(color='independent').properties(height=350, width=250)

removed_terms_plot

## C

In [None]:
# Panel C: distribution of the number of HPO terms per proband, by term-list class.
base = alt.Chart(df_summary)

box = base.mark_boxplot().encode(
    x=alt.X('Class:N', title=None),
    y=alt.Y(
        'Num_Terms:Q',
        title='Number of HPO terms assigned to proband',
        scale=alt.Scale(zero=False),
    ),
    color=alt.Color('Class:N', scale=alt.Scale(domain=domain, range=_range)),
)

# Dashed rule at the overall median; defined but not included in the layer below.
rule = base.mark_rule(
    color='black', strokeWidth=2, strokeDash=[2, 2]
).encode(
    x='median(Num_Terms):Q',
    tooltip=['median(Num_Terms):Q'],
)

# Numeric label of the per-class median, nudged up and to the left of the median position.
text = base.mark_text(
    color='black',
    size=10,
    align='left',
    baseline='middle',
    dx=-5,
    dy=-12,
).encode(
    x='Class:N',
    y=alt.Y('median(Num_Terms):Q'),
    text='median(Num_Terms):Q',
)

# No chart-level config here: axis/legend styling is applied once on the
# combined figure at the end of the notebook.
term_boxplot = alt.layer(box, text).properties(width=100, height=300)

term_boxplot

## D

In [None]:
# Compare, per causal variant, the Exomiser rank obtained with the
# comprehensive term list against the rank obtained with the curated list,
# and sort the variants into the groups plotted in panels D and E.
data = []
to_plot = []        # rank improved, comprehensive rank <= 20
to_plot_large = []  # rank improved, comprehensive rank > 20 (drawn on a separate y-range)
to_plot_worse = []  # every other changed rank (i.e. the rank did not improve)

unfiltered_run_type = 'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL'
filtered_run_type = 'curated_noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL'

# Keep only variants that were present in the comprehensive run's output.
# The column name is derived from unfiltered_run_type so the two cannot drift apart.
successes = trimmed_success_table[
    trimmed_success_table['Variant_Level_noMOI_' + unfiltered_run_type] == 'Variant_Present_noMOI'
]

for row_label, patient_row in successes.iterrows():
    ID = patient_row['Dumb_ID']
    unfiltered_rank = patient_row['Variant_Level_noMOI_rank_' + unfiltered_run_type]
    filtered_rank = patient_row['Variant_Level_noMOI_rank_' + filtered_run_type]

    # A missing curated rank is replaced by the sentinel 0 and reported.
    # pd.read_csv turns the text "N/A" into NaN by default, so the string
    # comparison alone never matched and the NaN leaked into the arithmetic
    # and comparisons below (ending up in `to_plot_worse` with a NaN rank).
    if pd.isna(filtered_rank) or filtered_rank == 'N/A':
        filtered_rank = 0
        print(ID, unfiltered_rank)

    difference = unfiltered_rank - filtered_rank
    data.append([ID, unfiltered_rank, difference, 'Comprehensive'])
    data.append([ID, filtered_rank, difference, 'Curated'])

    # Unchanged ranks are not plotted at all.
    if unfiltered_rank != filtered_rank:
        if unfiltered_rank > filtered_rank:
            if unfiltered_rank > 20:
                to_plot_large.append(ID)
            else:
                to_plot.append(ID)
        else:
            to_plot_worse.append(ID)

df = pd.DataFrame(data, columns=['ID', 'Rank', 'Diff', 'RunType'])
print(len(to_plot))



In [None]:


# Panel D: variants whose rank improved with the curated term list. The chart
# is drawn as two stacked panels: a short upper panel for the variants with a
# comprehensive rank above 20 and the main lower panel for everything else.

# --- lower panel: improvements with comprehensive rank <= 20 ---
source = df[df['ID'].isin(to_plot)].sort_values(by='Rank', ascending=False)
# x-axis order follows descending rank; one hard-coded ID is appended so that
# it gets a slot on the axis as well.
order = list(source['ID']) + ['UDN197_DENND5B']
print(len(source)/2)  # two rows (Comprehensive / Curated) per variant
rank_change1 = alt.Chart(source).mark_circle(size=100, filled=True, stroke='black',strokeWidth=1,opacity=1).encode(
    x=alt.X('ID', axis=alt.Axis(labelAngle=-60)).scale(domain=order),
    y=alt.Y('Rank', title='Exomiser Rank of Causative Variant'),
    # The sort labels now name actual RunType values, matching the other rank
    # charts (the previous 'Unfiltered'/'Filtered' labels matched nothing).
    color=alt.Color('RunType', sort=['Curated', 'Comprehensive'], scale=alt.Scale(domain = domain, range=_range), legend=alt.Legend(
        orient='top',
        direction='horizontal',
        titleAnchor='middle')),
    tooltip = ['RunType', 'ID', 'Rank', 'Diff']
).properties(
    height=200,
    width=400)

# --- upper panel: improvements with comprehensive rank > 20 ---
source = df[df['ID'].isin(to_plot_large)].sort_values(by='Rank', ascending=False)
print(order)  # still the lower panel's x-axis order at this point
order = list(source['ID'])
print(len(source)/2)
rank_change2 = alt.Chart(source).mark_circle(size=100, filled=True, stroke='black', strokeWidth=1, opacity=1).encode(
    x=alt.X('ID',  axis=None).scale(domain=order),
    # Hard-coded y-range for the upper panel.
    y=alt.Y('Rank', title=None, scale=alt.Scale(domain=[70,73])),
    color=alt.Color('RunType', sort=['Curated', 'Comprehensive'], scale=alt.Scale(domain = domain, range=_range), legend=alt.Legend(
        orient='top',
        direction='horizontal',
        titleAnchor='middle')),
    tooltip = ['RunType', 'ID', 'Rank', 'Diff']
).properties(
    height=40, width=400)

rank_improvements = alt.vconcat(rank_change2, rank_change1, spacing=5).resolve_scale(x="shared")
rank_improvements

## E

In [None]:
# Panel E: variants whose rank did not improve with the curated term list.
# 'UDN57_SPEN' is drawn in its own short upper panel with a separate y-range;
# the remaining variants go in the main lower panel.
source = df[df['ID'].isin(to_plot_worse)].sort_values(by='Rank', ascending=False)
order = list(source['ID'])
print(len(source)/2)

# Pieces shared by both panels.
worse_axis_domain = ['UDN26_PIGN','UDN105_SNUPN', 'UDN57_SPEN']
is_upper_panel = source['ID'] == 'UDN57_SPEN'
circle_style = dict(size=100, filled=True, stroke='black', strokeWidth=1, opacity=1)
rank_tooltip = ['RunType', 'ID', 'Rank', 'Diff']


def _run_type_color():
    """Return the RunType colour encoding used by both panel E charts."""
    return alt.Color(
        'RunType',
        sort=['Curated', 'Comprehensive'],
        scale=alt.Scale(domain=domain, range=_range),
        legend=alt.Legend(orient='top', direction='horizontal', titleAnchor='middle'),
    )


# Lower panel: everything except UDN57_SPEN.
rank_worsen1 = alt.Chart(source[~is_upper_panel]).mark_circle(**circle_style).encode(
    x=alt.X('ID', axis=alt.Axis(labelAngle=-60), scale=alt.Scale(domain=worse_axis_domain)),
    y=alt.Y('Rank', title='Exomiser Rank of Causative Variant'),
    color=_run_type_color(),
    tooltip=rank_tooltip,
).properties(width=100, height=200)

# Upper panel: UDN57_SPEN only, on a hard-coded y-range.
rank_worsen2 = alt.Chart(source[is_upper_panel]).mark_circle(**circle_style).encode(
    x=alt.X('ID', axis=None, scale=alt.Scale(domain=worse_axis_domain)),
    y=alt.Y('Rank', title=None, scale=alt.Scale(domain=[65,69])),
    color=_run_type_color(),
    tooltip=rank_tooltip,
).properties(width=100, height=40)

rank_worsen = alt.vconcat(rank_worsen2, rank_worsen1, spacing=5).resolve_scale(x="shared")
rank_worsen

## F

In [None]:
# Collect, for every variant whose rank changed, the phenotype score under
# both term lists, the variant score, the ranks, and the term-list sizes.
# Three rows are emitted per table row: 'Curated', 'Comprehensive', 'Variant'.
scores = []
trimmed_success_table = trimmed_success_table[
    trimmed_success_table['Dumb_ID'].isin(to_plot + to_plot_large + to_plot_worse)
]

curated_run = 'curated_noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL'
comprehensive_run = 'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL'

for row_label, patient_row in trimmed_success_table.iterrows():
    ID = patient_row['Dumb_ID']

    # Values are read from the FIRST table row that carries this Dumb_ID (same
    # as before), but the rows are now selected once instead of five times.
    same_id_rows = trimmed_success_table[trimmed_success_table['Dumb_ID'] == ID]
    trimmed_score = same_id_rows['Variant_Level_noMOI_pheno_' + curated_run].iloc[0]
    gateway_score = same_id_rows['Variant_Level_noMOI_pheno_' + comprehensive_run].iloc[0]
    variant_score = same_id_rows['Variant_Level_noMOI_variant_' + curated_run].iloc[0]
    trimmed_rank = same_id_rows['Variant_Level_noMOI_rank_' + curated_run].iloc[0]
    gateway_rank = same_id_rows['Variant_Level_noMOI_rank_' + comprehensive_run].iloc[0]

    # Term-list sizes come from df_summary, keyed on the patient part of 'ID';
    # .item() requires exactly one row per (patient, class).
    patient_summary = df_summary[df_summary['ID'] == patient_row['ID'].split('_')[0]]
    old_terms = patient_summary.loc[patient_summary['Class'] == 'Comprehensive', 'Num_Terms'].item()
    new_terms = patient_summary.loc[patient_summary['Class'] == 'Curated', 'Num_Terms'].item()

    # Change in phenotype score caused by curation (curated minus comprehensive).
    diff = trimmed_score - gateway_score
    scores.append([ID, trimmed_score,'Curated', diff,trimmed_rank, new_terms])
    scores.append([ID, gateway_score,'Comprehensive', diff,gateway_rank, old_terms])
    # The variant row has no term count, hence the empty string.
    scores.append([ID, variant_score,'Variant', diff, trimmed_rank,''])

scores_df = pd.DataFrame(scores, columns = ['ID', 'Score', 'Class', 'Diff', 'Rank', 'Terms'])

In [None]:
# Panel F: phenotype score (comprehensive vs curated) and variant score per variant.
# NOTE: `sort=order` uses whatever `order` was last assigned in an earlier cell.
base = alt.Chart(scores_df)

box = base.mark_circle(
    size=80, filled=True, stroke='black', strokeWidth=1, opacity=1
).encode(
    x=alt.X('ID', title='', sort=order, axis=alt.Axis(labelAngle=-60)),
    y=alt.Y('Score:Q', title='Gene Phenotype or Variant Score', scale=alt.Scale(zero=False)),
    color=alt.Color('Class:N', scale=alt.Scale(domain=domain, range=_range)),
    tooltip=['Class', 'Rank', 'Score'],
)

# Rank labels at each point; defined but not included in the layer below.
text = base.mark_text(
    color='black',
    size=10,
    align='left',
    baseline='middle',
).encode(
    x='ID:N',
    y=alt.Y('Score:Q'),
    text='Rank:Q',
)

# Axis/legend styling is applied once on the combined figure at the end.
phenotype_score_boxplot = alt.layer(box).properties(width=350, height=250)
phenotype_score_boxplot

## G

In [None]:
# Panel G only needs the two term-list classes, so the 'Variant' rows are dropped.
g_df = scores_df.loc[scores_df['Class'].ne('Variant')]

In [None]:
# Panel G: number of HPO terms per variant's patient under each term list.
# NOTE: `sort=order` uses whatever `order` was last assigned in an earlier cell.
base = alt.Chart(g_df)

box = base.mark_circle(
    size=80, filled=True, stroke='black', strokeWidth=1, opacity=1
).encode(
    x=alt.X('ID', title='', sort=order, axis=alt.Axis(labelAngle=-60)),
    y=alt.Y('Terms:Q', title='Number of HPO Terms', scale=alt.Scale(zero=False)),
    color=alt.Color('Class:N', scale=alt.Scale(domain=domain, range=_range)),
    tooltip=['Class', 'Rank', 'Score'],
)

# Term-count label drawn just to the right of each point.
text = base.mark_text(
    color='black',
    size=10,
    align='left',
    baseline='middle',
    dx=8,
).encode(
    x=alt.X('ID', title='', sort=order, axis=alt.Axis(labelAngle=-60)),
    y=alt.Y('Terms:Q'),
    text='Terms:Q',
)

# Axis/legend styling is applied once on the combined figure at the end.
phenotype_term_counts = alt.layer(box, text).properties(width=500, height=300)
phenotype_term_counts

In [52]:
# Assemble the full figure: panels A-C on the top row, D-F on the second row,
# and panel G on its own row underneath. Colour scales are kept independent
# within each row because the panels use different colour domains.
top = alt.hconcat(
    plot_A, removed_terms_plot, term_boxplot
).resolve_scale(color='independent')
bottom = alt.hconcat(
    rank_improvements, rank_worsen, phenotype_score_boxplot
).resolve_scale(color='independent')

# Axis and legend styling is applied once, here, on the top-level chart.
alt.vconcat(top, bottom, phenotype_term_counts).configure_axis(
    labelFont='arial',
    titleFont='arial',
    labelFontSize=15,
    titleFontSize=15,
    labelPadding=5,
    tickSize=8,
    labelLimit=0,
).configure_legend(
    labelFont='arial',
    titleFont='arial',
    labelFontSize=13,
    titleFontSize=15,
    labelLimit=0,
)