In [None]:
%load_ext ipy_pdcache

In [None]:
import os
import time

import numpy as np
import pandas as pd

import networkx as nx

import onto2nx
import pybiomart

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from gene_map import GeneMapper

# Parameters

In [None]:
# Input files. `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake when the notebook is run as a workflow step.
disgenet_fname = snakemake.input.disgenet_fname
gwascatalog_fname = snakemake.input.gwascatalog_fname
efo_fname = snakemake.input.efo_fname
so_fname = snakemake.input.so_fname

# Output files: the final table, and the file handed to the %%pdcache magic
# for the raw annotation frame.
db_out_fname = snakemake.output.db_fname
raw_veps_fname = snakemake.output.raw_veps

# Workflow configuration:
#  - gwas_gene_source: 'reported' or 'mapped' (any other value raises below)
#  - annotation_sources: genome assembly -> local annotation file or BioMart host
#  - snp_filters: filter name -> query template (None means "keep every row")
gwas_gene_source = snakemake.config['parameters']['associated_gene_source']
annotation_sources = snakemake.config['annotation_sources']
snp_filters = snakemake.config['snp_filters']

# Load DisGeNET

In [None]:
# Read the SNP/disease columns of the DisGeNET table and tag every row with
# its origin and the kind of disease identifier it carries.
df_disgenet = pd.read_table(
    disgenet_fname,
    usecols=['snpId', 'diseaseId', 'diseaseName', 'source'],
).assign(
    snp_source='disgenet',
    diseaseIdType='UMLS_CUI',
)

In [None]:
df_disgenet.head()

# Load GWAS catalog

## Parse input

In [None]:
# Load the GWAS catalog, keep only records that have a current SNP id and a
# mapped trait, and switch to the column names used in the rest of the notebook.
df_gwascat = (
    pd.read_table(gwascatalog_fname, low_memory=False)
    .dropna(subset=['SNP_ID_CURRENT', 'MAPPED_TRAIT_URI', 'MAPPED_TRAIT'])
    .rename(columns={
        'SNP_ID_CURRENT': 'snpId', 'MAPPED_TRAIT_URI': 'diseaseId', 'MAPPED_TRAIT': 'diseaseName'
    })
)

# Build rs-identifiers. If pandas parsed the id column as floats (e.g. 123.0),
# drop the fractional part first; otherwise the ids would read 'rs123.0'.
df_gwascat['snpId'] = df_gwascat['snpId'].apply(
    lambda x: f'rs{int(x)}' if isinstance(x, float) and x.is_integer() else f'rs{x}')
df_gwascat['snp_source'] = 'gwas_catalog'

# One row per trait: a record can list several comma-separated trait URIs.
df_gwascat['diseaseId'] = df_gwascat['diseaseId'].str.split(',')
df_gwascat = df_gwascat.explode('diseaseId')

# The ontology id is the last path component of the URI (whitespace left over
# from the comma split is removed); its prefix before '_' names the id type.
df_gwascat['diseaseId'] = df_gwascat['diseaseId'].apply(lambda x: x.strip().split('/')[-1])
df_gwascat['diseaseIdType'] = df_gwascat['diseaseId'].apply(lambda x: x.split('_')[0])

# Convert BETA to odds ratio. Heuristic: values below 1 are exponentiated,
# values of 1 or more are kept unchanged.
df_gwascat['odds_ratio'] = df_gwascat['OR or BETA'].apply(lambda x: np.exp(x) if x < 1 else x)

df_gwascat.head(1)

## Infer associated gene(s)

Possible columns:
* REPORTED GENE(S): gene(s) reported by the author
* MAPPED_GENE: gene(s) mapped to the strongest SNP (if the SNP is intergenic, the upstream and downstream genes are used)
* SNP_GENE_IDS: Entrez Gene ID(s)

In [None]:
df_gwascat[['REPORTED GENE(S)', 'MAPPED_GENE', 'SNP_GENE_IDS']].head()

In [None]:
# Determine the Entrez gene id(s) for every GWAS catalog record.
# A record without gene information shows up as NaN (a float) after the
# string split and is represented by None in `entrez_genes`.
if gwas_gene_source not in ('reported', 'mapped'):
    raise RuntimeError(f'Invalid gene source: "{gwas_gene_source}"')

if gwas_gene_source == 'reported':
    # author-reported gene names: they still have to be mapped to Entrez ids
    raw_genes = df_gwascat['REPORTED GENE(S)'].str.split(', ').tolist()

    # placeholder entries that do not name a gene
    gene_blacklist = {'intergenic', 'NR'}
    cur_genes = []
    for gs in raw_genes:
        if isinstance(gs, float):  # NaN: nothing reported for this record
            continue
        cur_genes.extend(g for g in gs if g not in gene_blacklist)

    gm = GeneMapper()
    df_map = gm.query(id_list=cur_genes, source_id_type='Gene_Name', target_id_type='GeneID')
    name2id = df_map.set_index('ID_from')['ID_to'].to_dict()

    # names without a mapping are silently left out of the per-record list
    entrez_genes = []
    for gs in raw_genes:
        if isinstance(gs, float):
            entrez_genes.append(None)
        else:
            entrez_genes.append([name2id[g] for g in gs if g in name2id])
else:
    # 'mapped': the column already holds Entrez ids
    raw_genes = df_gwascat['SNP_GENE_IDS'].str.split(', ').tolist()
    entrez_genes = [gs if not isinstance(gs, float) else None for gs in raw_genes]

In [None]:
# Store each gene list as one comma-separated string (None when no genes are known).
associated_genes = []
for gs in entrez_genes:
    associated_genes.append(','.join(gs) if gs is not None else None)
df_gwascat['associated_genes'] = associated_genes

df_gwascat[['REPORTED GENE(S)', 'MAPPED_GENE', 'SNP_GENE_IDS', 'associated_genes']].head()

## Select relevant columns

In [None]:
df_gwascat.columns

In [None]:
# Keep only the columns that are needed from here on.
gwascat_columns = ['diseaseId', 'snpId', 'snp_source', 'diseaseIdType', 'odds_ratio', 'associated_genes']
df_gwascat = df_gwascat.loc[:, gwascat_columns]
df_gwascat.head()

# Combine sources

In [None]:
# Sources that enter the combined table; df_disgenet is currently left out.
snp_sources = [df_gwascat]
df = pd.concat(snp_sources)
df.head()

# Load required ontologies

In [None]:
%%time

# Parse the EFO ontology file into a graph and give it a short name.
efo_graph = onto2nx.parse_owl(efo_fname)
efo_graph.name = 'efo'
# nx.info() was removed in networkx 3.0, so the summary is printed explicitly.
print(f'{type(efo_graph).__name__} "{efo_graph.name}": '
      f'{efo_graph.number_of_nodes()} nodes, {efo_graph.number_of_edges()} edges')

In [None]:
%%time

# Parse the sequence ontology (SO) file into a graph and give it a short name.
so_graph = onto2nx.parse_owl(so_fname)
so_graph.name = 'so'
# nx.info() was removed in networkx 3.0, so the summary is printed explicitly.
print(f'{type(so_graph).__name__} "{so_graph.name}": '
      f'{so_graph.number_of_nodes()} nodes, {so_graph.number_of_edges()} edges')

# Label diseases

In [None]:
# Map every EFO node id to its label. A node that carries no 'label'
# attribute maps to None instead of aborting the notebook with a KeyError.
efo_label_map = {idx: data.get('label') for idx, data in efo_graph.nodes(data=True)}

In [None]:
# Look up the EFO label of every disease id (ids unknown to EFO become NaN).
disease_labels = df['diseaseId'].map(efo_label_map)
df['diseaseLabel'] = disease_labels

In [None]:
df.head()

# Cancer classification

In [None]:
DISEASE_ROOT = 'EFO_0000408'  # root of the disease subtree (as opposed to traits, ...)
CANCER_ROOT = 'EFO_0000311'  # root of the cancer subtree

nodes_all = list(efo_graph.nodes())

# each subtree consists of the ancestors of its root in the graph plus the root itself
nodes_disease = [*nx.ancestors(efo_graph, DISEASE_ROOT), DISEASE_ROOT]
nodes_cancer = [*nx.ancestors(efo_graph, CANCER_ROOT), CANCER_ROOT]

# TODO: check that the cancer nodes are contained in the disease nodes,
# and the disease nodes in the set of all nodes
node_counts = '/'.join(str(len(nodes)) for nodes in (nodes_cancer, nodes_disease, nodes_all))
print(f'#cancer/#disease/#all: {node_counts}')

In [None]:
# Hash-based lookups: testing membership on the node lists directly would
# scan the whole list once per disease.
disease_ids = set(nodes_disease)
cancer_ids = set(nodes_cancer)

# One record per distinct disease id: True/False for ids inside the disease
# subtree, NaN for everything else (e.g. traits).
iscancer_records = [
    {
        'diseaseId': disease,
        'is_cancer': (disease in cancer_ids) if disease in disease_ids else np.nan,
    }
    for disease in tqdm(df['diseaseId'].unique())
]

# Naming the columns keeps the frame usable for the later merge on
# 'diseaseId' even when there are no records at all.
df_iscancer = pd.DataFrame(iscancer_records, columns=['diseaseId', 'is_cancer'])
df_iscancer.head(5)

# SNP annotations

## Retrieve VEP annotations

Variant consequence ontology: http://www.sequenceontology.org/browser/current_release

Description of variant types: https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html

Raw data: ftp://ftp.ensembl.org/pub/release-98/variation/vep/

In [None]:
# Distinct SNP ids for which annotations are requested.
snps = pd.unique(df['snpId']).tolist()
n_snps = len(snps)
print(f'Retrieving annotations for {n_snps} SNPs')

In [None]:
%%time
%%pdcache df_anno_raw $raw_veps_fname

# Retry policy for the remote query: fail loudly after a bounded number of
# attempts instead of looping forever on a permanent error.
max_attempts = 30  # consecutive failures tolerated before giving up
retry_delay = 10  # seconds to wait between two attempts

# Collect one annotation frame per genome assembly. Each source is either a
# local CSV file or a BioMart host that is queried for all SNPs.
df_list = []
for genome_assembly, annotation_url in annotation_sources.items():
    if os.path.isfile(annotation_url):
        print('Using local SNP annotations')
        tmp = pd.read_csv(annotation_url, index_col=0)
    else:
        print('Retrieving SNP annotations from Ensembl')

        tmp = None
        for attempt in range(1, max_attempts + 1):
            try:
                server = pybiomart.Server(host=annotation_url)
                dataset = server.marts['ENSEMBL_MART_SNP'].datasets['hsapiens_snp']

                tmp = dataset.query(
                    attributes=['refsnp_id', 'chr_name', 'chrom_start', 'consequence_type_tv', 'ensembl_transcript_stable_id'],
                    filters={'snp_filter': snps},
                    use_attr_names=True)
                break
            except requests.RequestException:
                # RequestException also covers connection errors and timeouts,
                # not only HTTP error responses
                if attempt == max_attempts:
                    raise
                print('Next try...')
                time.sleep(retry_delay)

    tmp['genome_assembly'] = genome_assembly
    df_list.append(tmp)

df_anno_raw = pd.concat(df_list, ignore_index=True)

In [None]:
df_anno_raw.head()

## Convert annotations to usable format

In [None]:
# Work on a copy of the raw annotations.
df_anno = df_anno_raw.copy()

# chromosome names are compared as strings below
df_anno['chr_name'] = df_anno['chr_name'].astype(str)

# remove haplotypes (e.g. CHR_HSCHR6_MHC_COX_CTG1): their names contain '_'
on_haplotype = df_anno['chr_name'].str.contains('_')
df_anno = df_anno[~on_haplotype]

# an empty consequence is marked as 'intergenic'
# (NaN in the dataframe shows up as intergenic in the VEP web-interface)
missing_consequence = df_anno['consequence_type_tv'].isna()
df_anno.loc[missing_consequence, 'consequence_type_tv'] = 'intergenic_variant'

# Per SNP and genome assembly, keep the first row that carries the most
# frequent consequence (TODO: handle multiple maxima)
representatives = []
for _, group in tqdm(df_anno.groupby(['refsnp_id', 'genome_assembly'])):
    consequences = group['consequence_type_tv']
    top_vep = consequences.value_counts().idxmax()
    representatives.append(group[consequences == top_vep].iloc[0])
df_anno = pd.DataFrame(representatives)

# drop the transcript id and switch to the column names used downstream
df_anno = (
    df_anno
    .drop('ensembl_transcript_stable_id', axis=1)
    .rename(columns={
        'refsnp_id': 'snpId', 'consequence_type_tv': 'variant_type',
        'chr_name': 'chromosome', 'chrom_start': 'position'
    })
)

In [None]:
df_anno.head()

## Group variant types

### Read sequence ontology (SO)

In [None]:
# For each of the three root terms, collect the root's ancestors in the SO
# graph plus the root itself (roots in order: exon, intron, intergenic).
exon_subgraph, intron_subgraph, intergenic_subgraph = (
    [*nx.ancestors(so_graph, root), root]
    for root in ('SO_0001791', 'SO_0001627', 'SO_0001628')
)

### Find ontology labels

In [None]:
# Map every SO label to its node id. Nodes without a 'label' attribute are
# skipped instead of aborting the notebook with a KeyError.
so_label_map = {data['label']: idx for idx, data in so_graph.nodes(data=True) if 'label' in data}

### Classify variants

In [None]:
def classify_vep(vep):
    """Assign a VEP consequence term to a coarse variant group.

    Parameters
    ----------
    vep : str
        Consequence label; it is looked up in the module-level `so_label_map`.

    Returns
    -------
    str
        'exonic', 'intronic' or 'intergenic' when the term's ontology id is
        contained in the matching subgraph. Otherwise the group listed for the
        term in `special_cases`, or 'ambiguous' when it is not listed there.
        Terms that are missing from `so_label_map` take this fallback path as
        well instead of raising a KeyError.

    Raises
    ------
    AssertionError
        If the term is contained in more than one subgraph.
    """
    special_cases = {
        'NMD_transcript_variant': 'exonic',
        'mature_miRNA_variant': 'exonic',
        'splice_region_variant': 'exonic',  # can be either exon or intron
        'non_coding_transcript_variant': 'intronic'
    }

    # None for labels the ontology does not know; None is in no subgraph
    vep_id = so_label_map.get(vep)

    subgraphs = (
        ('exonic', exon_subgraph),
        ('intronic', intron_subgraph),
        ('intergenic', intergenic_subgraph),
    )
    matches = [group for group, members in subgraphs if vep_id in members]

    # the three groups are expected to be mutually exclusive
    assert len(matches) <= 1, vep

    if matches:
        return matches[0]
    return special_cases.get(vep, 'ambiguous')

In [None]:
# Derive the coarse group of every variant type and show the group sizes.
variant_groups = df_anno['variant_type'].map(classify_vep)
df_anno['variant_group'] = variant_groups
df_anno['variant_group'].value_counts()

In [None]:
df_anno.head()

## Sanity checks

In [None]:
# TODO: make this rigorous, i.e. require that every requested SNP was annotated:
#assert set(df_anno['snpId'].tolist()) == set(snps), set(snps) - set(df_anno['snpId'].tolist())
# Until then, only require that annotations exist at all.
assert df_anno is not None
assert len(df_anno) > 0

In [None]:
# Every variant type must have received a group; on failure the assertion
# reports the variant types that are still ungrouped.
group_missing = df_anno['variant_group'].isna()
ungrouped_types = df_anno.loc[group_missing, 'variant_type'].drop_duplicates().tolist()
assert not group_missing.any(), ungrouped_types

In [None]:
# assert that variant type groups are reasonable
#assert set(df_anno['variant_group']) <= {'exonic', 'intronic', 'intergenic', 'ambiguous'}, df_anno['variant_group'].unique().tolist()

In [None]:
# Overlap between the SNPs in the database and the SNPs that were annotated.
db_snps = set(df['snpId'].tolist())
annotated_snps = set(df_anno['snpId'].tolist())

print('#SNPs in database:', df['snpId'].nunique(), f'({len(snps)})')
print('#annotated SNPs:', df_anno['snpId'].nunique())
print('#intersection:', len(db_snps & annotated_snps))

## Transform dataset

In [None]:
def dummy_agg(x):
    """Aggregation function for groups that must hold at most one value.

    Parameters
    ----------
    x : pandas.Series
        Values of one group.

    Returns
    -------
    scalar
        The single value of the group, or NaN for an empty group. A scalar is
        returned (rather than the length-1 Series itself) so that the result
        does not depend on pandas unwrapping it.

    Raises
    ------
    AssertionError
        If the group holds more than one value.
    """
    assert len(x) <= 1
    return x.iloc[0] if len(x) == 1 else np.nan

# Wide format: one row per SNP, one column per (annotation, genome assembly) pair.
anno_values = ['chromosome', 'position', 'variant_type', 'variant_group']
df_anno_trans = df_anno.pivot_table(
    values=anno_values,
    index=['snpId'],
    columns=['genome_assembly'],
    aggfunc=dummy_agg,
)
df_anno_trans = df_anno_trans.reset_index()

# Flatten the two-level column names to '<annotation>_<assembly>'; the index
# column has an empty second level, hence the trailing '_' is stripped.
flat_columns = []
for levels in df_anno_trans.columns.values:
    flat_columns.append('_'.join(levels).rstrip('_'))
df_anno_trans.columns = flat_columns

In [None]:
df_anno_trans.head()

# Merge data sources

In [None]:
# initial aggregation: start the final table from a copy of the combined
# SNP-disease associations
df_final = df.copy()
df_final.shape

In [None]:
# cancer-classification: attach the is_cancer flag to every association
# (inner join on the disease id)
df_final = pd.merge(df_final, df_iscancer, on='diseaseId')
df_final.shape

In [None]:
# SNP annotation: the left join keeps associations whose SNP has no annotation.
# The join key is named explicitly so that the join cannot silently widen to
# every column name the two frames happen to share.
df_final = df_final.merge(df_anno_trans, how='left', on='snpId')
df_final.shape

In [None]:
df_final.head()

# Apply filters

## General filters

In [None]:
# only keep diseases (and not e.g. traits): rows without a value in
# is_cancer are removed in place
not_a_disease = df_final.index[df_final['is_cancer'].isna()]
df_final.drop(index=not_a_disease, inplace=True)

In [None]:
df_final.shape

## Variant type filters (only add marker)

In [None]:
# Add one boolean marker column per (filter, genome assembly) pair.
# Rows are only flagged here, never removed.
for filter_name, filter_query in snp_filters.items():
    for genome_assembly in annotation_sources:
        idx = f'filter_{filter_name}_{genome_assembly}'

        # without a query every row passes; otherwise start from "no row passes"
        df_final[idx] = filter_query is None
        if filter_query is not None:
            assembly_query = filter_query.format(genome_assembly=genome_assembly)
            match = df_final.query(assembly_query).index
            df_final.loc[match, idx] = True

# Save result

In [None]:
df_final.head()

In [None]:
# Write the final table as CSV to the configured output path, without the index column.
df_final.to_csv(db_out_fname, index=False)