In [None]:
import pandas as pd
import altair as alt
import sys
sys.path.append('figure_scripts/')

import plot_scripts as ps

In [22]:
def _print_summary(subset):
    """Print three count lines for ``subset``.

    Prints the number of distinct ``UDN_ID`` values (labelled 'families'),
    the number of distinct ``ID`` values (labelled 'genes') and the number
    of rows (labelled 'variants').
    """
    print(len(set(subset['UDN_ID'])), 'families')
    print(len(set(subset['ID'])), 'genes')
    print(len(subset), 'variants')


variant_table = pd.read_csv('fig_06_variant_table.txt', sep='\t')

# Every row that carries a (non-null) 'Structure' annotation.
pedigree_excluded = variant_table[variant_table['Structure'].notna()]
_print_summary(pedigree_excluded)

# Rows whose Structure is one of the two "present in unaffected ..." values.
present_unaffected_parent_subset = variant_table[
    variant_table['Structure'].isin(['Present_unaffected_parent', 'Present_unaffected_relative'])
]
print('Exomiser: Present_unaffected_indiv')
_print_summary(present_unaffected_parent_subset)

print()
# Rows whose Structure is one of the two "absent in affected ..." values.
absent_affected_parent_subset = variant_table[
    variant_table['Structure'].isin(['Absent_affected_parent', 'Absent_affected_relative'])
]
print('Exomiser: Absent_affected_indiv')
_print_summary(absent_affected_parent_subset)

# IDs from both subsets, concatenated (an ID found in both subsets would be
# listed twice); used by later cells to filter other tables with `.isin`.
parent_issue_ids = (
    list(set(absent_affected_parent_subset['ID']))
    + list(set(present_unaffected_parent_subset['ID']))
)
len(parent_issue_ids)

22 families
23 genes
24 variants
Exomiser: Present_unaffected_indiv
15 families
16 genes
16 variants

Exomiser: Absent_affected_indiv
7 families
7 genes
8 variants


23

In [None]:

success_table = pd.read_csv('supp_fig11_input.tsv', sep='\t')

# Keep only the rows whose ID is in `parent_issue_ids`. The explicit .copy()
# makes the filtered frame independent of the freshly-read one, so adding
# columns to it does not raise pandas' SettingWithCopyWarning.
success_table = success_table[success_table['ID'].isin(parent_issue_ids)].copy()

# UDN_ID is the part of the ID before the first underscore.
success_table['UDN_ID'] = [x.split('_')[0] for x in success_table['ID']]

print(len(set(success_table['UDN_ID'])), 'families')
print(len(set(success_table['ID'])), 'genes')
print(len(success_table), 'variants')

In [None]:
mapping = pd.read_csv('GS_ID_mapping.csv')

# Build an ID -> Dumb_ID lookup from the IDs that occur exactly once in the
# mapping file. An ID that is missing or duplicated cannot be resolved
# unambiguously, so it is printed (as before) and recorded as None.
# Previously a failed lookup silently re-used the previous row's Dumb_ID
# (or raised NameError on the first row) because the bare `except` fell
# through to the append.
unambiguous = mapping.drop_duplicates(subset='ID', keep=False)
id_to_dumb_id = dict(zip(unambiguous['ID'], unambiguous['Dumb_ID']))

mapped = []
for ID in success_table['ID']:
    if ID not in id_to_dumb_id:
        print(ID)
    mapped.append(id_to_dumb_id.get(ID))
success_table['Dumb_ID'] = mapped

## A — Diagnostic variants per pedigree structure

In [None]:
# Panel A: horizontal bar chart counting rows of `pedigree_excluded` per
# 'Structure' value, coloured by 'for_figure', with the count printed next
# to each bar.
exomiser_genes = pedigree_excluded[
    ['UDN_ID', 'gene_name', 'pedigree_exclusion', 'for_figure', 'Exclude_Reason', 'Structure']
]

structure_sort = [
    'Absent_affected_parent',
    'Absent_affected_relative',
    'Present_unaffected_parent',
    'Present_unaffected_relative',
]


def _structure_axis():
    """Return a fresh y-encoding on 'Structure', shared by the bar and text layers."""
    return alt.Y('Structure:N', sort=structure_sort, title=None).axis(offset=5, domainOpacity=0)


n_individuals = len(set(pedigree_excluded['UDN_ID']))
base = alt.Chart(
    exomiser_genes,
    title=f"{len(exomiser_genes)} Variants from {n_individuals} UDN Individuals",
)

bars = base.mark_bar().encode(
    y=_structure_axis(),
    x=alt.X('count()', title='Number of Diagnostic Variants'),
    color=alt.Color(
        'for_figure',
        scale=alt.Scale(
            scheme='tableau20',
            domain=['single heterozygous', 'compound heterozygous', 'homozygous'],
        ),
    ),
    tooltip=['count()', 'Exclude_Reason:N', 'for_figure'],
)

# Count labels drawn just to the right of each bar's end.
text = base.mark_text(
    align='left',
    baseline='middle',
    dx=3,
    color='black',
    size=14,
).encode(
    y=_structure_axis(),
    x=alt.X('count()'),
    text='count():Q',
)

# No axis/legend configuration is set on this panel itself.
data_set = (
    alt.layer(bars, text)
    .resolve_scale(color='independent')
    .properties(height=200, width=300)
)
data_set


## B — Comparison of the three run types (standard, singleton, altered pedigree)

In [None]:

# Panel B: the three run types handed to the plotting helpers, in the order
# standard / singleton / altered pedigree.
RUN_TYPES = [
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_singleton_noWL',
    'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_altered_ped_noWL',
]

# NOTE(review): the meaning of the leading 30 is defined by
# plot_scripts.create_df and is not visible here — confirm before changing it.
df, denom = ps.create_df(30, RUN_TYPES, success_table)

# RUN_TYPES is passed twice (3rd and 6th positional argument), as in the
# original call; see plot_scripts.create_plot for what each position means.
plot_4c = ps.create_plot(denom, df, RUN_TYPES, 'set2', False, RUN_TYPES)

plot_4c.properties(width=400, height=250)

## C — Rank of each diagnostic variant: singleton vs. altered-pedigree runs

In [None]:
# Build one long-format record per (variant, run type) for panel C.
normal_run_type = 'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_noWL'
singleton_run_type = 'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_singleton_noWL'
altered_ped_run_type = 'noN_filtered_15_85_human_revel_mvp_alphaM_spliceAI_altered_ped_noWL'

# Rank columns in `success_table` are named <prefix><run type>.
rank_prefix = 'Variant_Level_noMOI_rank_'

data = []
to_plot = []
large = []

for _, patient_row in success_table.iterrows():
    ID = patient_row['ID']
    dumb_ID = patient_row['Dumb_ID']
    OG_rank = patient_row[rank_prefix + normal_run_type]
    singleton_rank = patient_row[rank_prefix + singleton_run_type]
    alt_rank = patient_row[rank_prefix + altered_ped_run_type]

    # Structure of the first `variant_table` row with this ID
    # (raises IndexError if the ID is absent from `variant_table`).
    structure = variant_table.loc[variant_table['ID'] == ID, 'Structure'].iloc[0]

    # Absolute rank change between the singleton and altered-pedigree runs.
    difference = abs(singleton_rank - alt_rank)

    # The 'OG' record has no Structure; it is now spelled out as None instead
    # of relying on pandas padding a 4-element row to 5 columns.
    # NOTE(review): this record is keyed by the real ID while the other two use
    # dumb_ID, so it drops out of later filters on dumb IDs — confirm intended.
    data.append([ID, OG_rank, difference, 'OG', None])
    data.append([dumb_ID, singleton_rank, difference, 'Singleton', structure])
    data.append([dumb_ID, alt_rank, difference, 'Altered_Ped', structure])

    # IDs containing 'EVI5' are tracked separately in `large`.
    # NOTE(review): presumably because that rank is far larger than the rest — confirm.
    if 'EVI5' not in ID:
        to_plot.append(dumb_ID)
    else:
        large.append(dumb_ID)


df = pd.DataFrame(data, columns=['ID', 'Rank', 'Diff', 'RunType', 'Structure'])
len(to_plot)

In [None]:
# Replace missing numeric values with the sentinel -10. Only 'Rank' and 'Diff'
# are filled: a blanket fillna(-10) would also write the integer -10 into the
# string columns ('ID', 'Structure'), mixing types and making the
# sort_values(by='Structure') calls below fail on str/int comparison.
df = df.fillna({'Rank': -10, 'Diff': -10})

# Display/legend order of the pedigree structures (reused by the chart cell).
structures = [
    'Absent_affected_parent',
    'Absent_affected_relative',
    'Present_unaffected_parent',
    'Present_unaffected_relative',
]

# IDs of the singleton runs ordered by Structure, then by Rank.
# (A preceding Rank-only ordering was dropped: it was overwritten immediately.)
# Both `order` and `source` are reassigned again at the top of the chart cell.
order = list(df[df['RunType'] == 'Singleton'].sort_values(by=['Structure', 'Rank'])['ID'])
source = df[df['ID'].isin(to_plot)].sort_values(by='Structure', ascending=True)

In [None]:
# Panel C: one point per (ID, run type) showing the rank of the diagnostic variant.
# Rows are restricted to the IDs collected in `to_plot` and `large`.
source = df[df['ID'].isin(to_plot + large)].sort_values(by='Structure', ascending=True)

# Hand-curated x-axis order. 'UDN141_CFTR' is listed twice, exactly as in the
# original list.
order = [
    'UDN240_KCNA2', 'UDN141_CFTR', 'UDN141_CFTR', 'UDN120_PPP2R1A',
    'UDN264_RPL13', 'UDN327_TEK', 'UDN371_CDK13', 'UDN400_MVK',
    'UDN196_NHERF1', 'UDN324_WNT10A', 'UDN395_F5', 'UDN33_PSTPIP1',
    'UDN258_CDH1', 'UDN363_RTN2', 'UDN384_PDGFB', 'UDN403_CTNNA1',
    'UDN39_ATP1A1', 'UDN395_MME', 'UDN247_TNXB', 'UDN187_FLG',
    'UDN321_MCCC2', 'UDN382_DEGS1', 'UDN256_LOX', 'UDN353_EVI5',
]

# Half the number of plotted rows.
# NOTE(review): presumably the number of variants, assuming two run types per ID — confirm.
print(len(source) / 2)

all_types = alt.Chart(
    source,
    # A space was missing before 'variants', so the title read e.g. "24variants ...".
    title=f"{len(success_table)} variants run as singletons",
).mark_point(
    size=100,
    filled=True,
    stroke='black',
    strokeWidth=1,
    opacity=1,
).encode(
    x=alt.X('ID', sort=order, axis=alt.Axis(labelAngle=-60)),
    y=alt.Y('Rank', title='Rank of Diagnostic Variant'),
    color=alt.Color(
        'Structure',
        sort=structures,
        scale=alt.Scale(scheme='category10', domain=structures),
    ),
    shape=alt.Shape(
        'RunType',
        sort=['Singleton', 'Altered_Ped', 'No_Inheritance'],
        scale=alt.Scale(range=['circle', 'triangle']),
    ),
    tooltip=['RunType', 'ID', 'Rank', 'Diff'],
).properties(
    height=250,
    width=500,
)

all_types

In [None]:
# Assemble the three panels side by side; each panel keeps its own colour scale.
top = alt.hconcat(
    data_set,
    plot_4c.properties(width=400, height=250),
    all_types,
).resolve_scale(color='independent')

# Axis and legend styling is applied once, on the outermost chart.
# configure_legend used to be called twice; each call replaces the whole
# legend config, so only the last call took effect and the first was removed.
(
    alt.vconcat(top)
    .resolve_scale(color='independent')
    .configure_axis(
        labelPadding=5,
        labelLimit=0,
        labelFontSize=15,
        titleFontSize=15,
        labelFont='arial',
        tickSize=8,
    )
    .configure_legend(
        labelLimit=0,
        labelFontSize=15,
        titleFontSize=15,
        labelFont='arial',
    )
)