In [1]:
import floweaver, ipysankeywidget
%run ../__init__.ipynb

In [2]:
# Read the mapped ClinVar table, merge it with the mechanistic annotation and
# keep only the variants that have an AlphaMissense class.
# NOTE(review): nrows=None presumably means "no row limit" - confirm in read_clinvar.
df_clinvar_ = read_clinvar(nrows=None)
df_ = (
    merge_missense(
        df_clinvar_,
        variant_col='variant_id',
        pdockq=0.23,       # NOTE(review): presumably the interface cut-off - confirm in merge_missense
        pocket_score=800,  # NOTE(review): presumably the pocket cut-off - confirm in merge_missense
    )
    # A self-comparison is False for missing values, so this drops rows
    # whose am_class is NaN.
    .query('am_class == am_class')
)
printlen(df_, 'ClinVar variants merged with mechanistic annotation')

read_clinvar: 1,857,854 rows from /cluster/work/beltrao/jjaenes/23.06.02_clinvar/24.04.22_protvar_out/clinvar_mapped.tsv
read_clinvar: 1,761,408 after removing conflicting/other variants
merge_missense: 1,761,408 raw records


In [None]:
# Count variants for every (ClinVar label, AlphaMissense class, mechanism)
# combination. The result uses the source/type/target/value column names
# that the floweaver partitions in the next cell refer to.
cols_ = ['clinvar_label', 'am_class', 'mechanistic_label']
flows_ = (
    df_[cols_]
    .groupby(cols_)
    .size()
    # Name the count column directly rather than renaming the implicit `0` label.
    .reset_index(name='value')
    .rename(columns={
        'clinvar_label': 'source',
        'am_class': 'type',
        'mechanistic_label': 'target',
    })
)
flows_

Unnamed: 0,source,type,target,value
0,Benign,ambiguous,Interface,308
1,Benign,ambiguous,Pockets,689
2,Benign,ambiguous,Stability,809
3,Benign,ambiguous,Unassigned,2874
4,Benign,benign,Interface,3154
5,Benign,benign,Pockets,9135
6,Benign,benign,Stability,2949
7,Benign,benign,Unassigned,55348
8,Benign,pathogenic,Interface,442
9,Benign,pathogenic,Pockets,868


In [None]:
# One partition per diagram column: each names a column of flows_ and lists
# the labels of that column in the order they are given here.
clinvar_part_ = floweaver.Partition.Simple(
    'source',
    ['Benign', 'VUS', 'Pathogenic'],
)
pathogenicity_part_ = floweaver.Partition.Simple(
    'type',
    ['benign', 'ambiguous', 'pathogenic'],
)
mechanistic_part_ = floweaver.Partition.Simple(
    'target',
    ['Unassigned', 'Stability', 'Interface', 'Pockets'],
)

# Diagram nodes: ClinVar labels and mechanisms are process groups selected
# from the values present in flows_; the AlphaMissense class is a waypoint
# in between.
nodes_ = dict(
    clinvar_col=floweaver.ProcessGroup(
        selection=flows_['source'].unique().tolist(),
        title='Known annotations',
        partition=clinvar_part_,
    ),
    pathogenicity_col=floweaver.Waypoint(
        partition=pathogenicity_part_,
        title='AlphaMissense',
    ),
    mechanistic_col=floweaver.ProcessGroup(
        selection=flows_['target'].unique().tolist(),
        title='Structural mechanisms',
        partition=mechanistic_part_,
    ),
)

# One single-node layer per column, in display order.
ordering_ = [
    [column_id]
    for column_id in ('clinvar_col', 'pathogenicity_col', 'mechanistic_col')
]

# A single bundle: ClinVar -> mechanisms, routed through the AlphaMissense waypoint.
bundles_ = [
    floweaver.Bundle(
        'clinvar_col',
        'mechanistic_col',
        waypoints=['pathogenicity_col'],
    ),
]

class AdhocScale_(floweaver.QuantitativeScale):
    """Colour scale that picks a fixed CSS colour per link.

    A link whose target is one of the four ``mechanistic_col`` nodes is
    coloured by that mechanism. Any other link is coloured by its ``type``
    ('benign' or 'pathogenic'), falling back to silver.

    Colour names: https://matplotlib.org/stable/gallery/color/named_colors.html#css-colors
    """

    # Checked first: link target node id -> CSS colour name.
    _COLOR_NAME_BY_TARGET = {
        'mechanistic_col^Unassigned': 'lightgrey',
        'mechanistic_col^Stability': 'red',
        'mechanistic_col^Interface': 'orange',
        'mechanistic_col^Pockets': 'green',
    }
    # Checked only when the target is not listed above: link type -> CSS colour name.
    _COLOR_NAME_BY_TYPE = {
        'benign': 'lightblue',
        'pathogenic': 'lightcoral',
    }

    def get_color(self, link, value):
        """Return the colour for ``link``; ``value`` is accepted but not used."""
        color_name = self._COLOR_NAME_BY_TARGET.get(link.target)
        if color_name is None:
            color_name = self._COLOR_NAME_BY_TYPE.get(link.type, 'silver')
        return matplotlib.colors.CSS4_COLORS[color_name]

# Assemble the Sankey definition; flows are additionally split by AlphaMissense class.
sdd_ = floweaver.SankeyDefinition(
    nodes_,
    bundles_,
    ordering_,
    flow_partition=pathogenicity_part_,
)

# Weave the flows into the diagram and render it as a widget.
# The chain is kept as the final expression so the cell displays its result.
(
    floweaver.weave(
        sankey_definition=sdd_,
        dataset=flows_,
        link_color=AdhocScale_('value'),
    )
    .to_widget(width=600, height=400)
    .auto_save_svg('clinvar_mechanisms_.svg')
)

SankeyWidget(groups=[{'id': 'clinvar_col', 'type': 'process', 'title': 'Known annotations', 'nodes': ['clinvar…

In [None]:
# Number of ClinVar variants annotated/predicted as benign/pathogenic
printlen(df_, 'ClinVar variants mapped')

# (query on df_, description) pairs, reported in this order.
for query_, description_ in [
    ('clinvar_label == "Pathogenic"', 'annotated as pathogenic'),
    ('clinvar_label == "Benign"', 'annotated as benign'),
    ('am_class == "pathogenic"', 'predicted as pathogenic'),
    ('am_class == "benign"', 'predicted as benign'),
]:
    printlenq(df_, query_, description_)

987,413 ClinVar variants mapped
40,938 of 987,413 (4.15%) annotated as pathogenic
81,005 of 987,413 (8.20%) annotated as benign
243,961 of 987,413 (24.71%) predicted as pathogenic
645,532 of 987,413 (65.38%) predicted as benign


In [None]:
# Percentage of each mechanism among the ClinVar variants that AlphaMissense
# predicts as pathogenic.
(
    df_
    .query('am_class == "pathogenic"')['mechanistic_label']
    .value_counts(normalize=True)  # fractions that sum to 1
    .mul(100)                      # convert to percent
)

Stability     39.515332
Unassigned    36.418936
Pockets       14.843766
Interface      9.221966
Name: mechanistic_label, dtype: float64