In [None]:
import os

import numpy as np
import pandas as pd

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt

import pyensembl
from gene_map import GeneMapper

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm_notebook as tqdm

from bioinf_common.data_reader import load_goterms, load_stringdb, load_biogrid
from bioinf_common.algorithms import SetEnrichmentComputer, compute_network_coherence

from utils import load_config

In [None]:
# Load the project configuration. Later cells read config['output_dirs']
# (results, cache, images) and config['parameters'] (ppi_source) from it.
config = load_config()

In [None]:
# Notebook-wide display defaults: show up to 99 DataFrame columns and use
# seaborn's 'talk' plotting context.
pd.options.display.max_columns = 99
sns.set_context(context='talk')

# Load data

## SNP-disease associations

In [None]:
# Read the tab-separated SNP-disease association table from the results
# directory and preview its first rows.
results_dir = config['output_dirs']['results']
snpdb_path = f'{results_dir}/snpdb_enhanced.tsv'
df_snpdb = pd.read_csv(snpdb_path, sep='\t')
df_snpdb.head()

## TAD-border enrichments

In [None]:
# Load the TAD-border enrichment results and annotate each row with
# per-disease information taken from the SNP table.
results_dir = config['output_dirs']['results']
df_enr = pd.read_csv(f'{results_dir}/TAD_enrichment.csv')

# Lookup tables keyed by diseaseId. When a diseaseId occurs on several rows
# of df_snpdb, the value from the last such row is the one kept.
disease_annotations = df_snpdb.set_index('diseaseId')

# mark cancer diseases
iscancer_map = disease_annotations['is_cancer'].to_dict()
df_enr['is_cancer'] = df_enr['disease'].map(iscancer_map)

# add disease name
disname_map = disease_annotations['diseaseName'].to_dict()
df_enr['disease_name'] = df_enr['disease'].map(disname_map)

# consider only one border type
df_enr = df_enr[df_enr['TAD_type'] == '20in']

# Preview a few rows. The sample size is capped so the cell does not raise a
# ValueError when fewer than five rows remain, and the seed is fixed so the
# preview is identical on every run.
df_enr.sample(min(5, len(df_enr)), random_state=0)

## PPI networks

In [None]:
# Name of the PPI network to use, taken from the configuration. The loading
# cell below accepts 'stringdb' or 'biogrid' and rejects anything else.
ppi_source = config['parameters']['ppi_source']
ppi_source

In [None]:
# Map each supported source name to its loader, then call the selected one.
# Each loader returns a pair; only the first element (the graph) is used.
ppi_loaders = {
    'stringdb': load_stringdb,
    'biogrid': load_biogrid,
}

if ppi_source not in ppi_loaders:
    raise RuntimeError(f'Invalid PPI source: "{ppi_source}"')

ppi_graph, _ = ppi_loaders[ppi_source]()

In [None]:
# Summarize the loaded PPI graph. nx.info() is deprecated and no longer
# exists in NetworkX 3.0, so the node and edge counts are reported through
# the Graph methods, which are available in every NetworkX version.
print(f'{ppi_graph.number_of_nodes()} nodes, {ppi_graph.number_of_edges()} edges')

## GO-terms

In [None]:
# Gene groupings returned by load_goterms(); passed to SetEnrichmentComputer
# further down as the sets to test for enrichment.
pathway_groupings = load_goterms()

In [None]:
# Collect the Ensembl IDs of all protein-coding genes in Ensembl release 93.
ensembl = pyensembl.EnsemblRelease(release=93)
all_genes = ensembl.genes()
protein_coding_genes_ensembl = {
    gene.gene_id
    for gene in all_genes
    if gene.is_protein_coding
}

In [None]:
# Translate the protein-coding Ensembl IDs to Entrez (GeneID) identifiers.
gm = GeneMapper()
df_map = gm.query(
    id_list=protein_coding_genes_ensembl,
    source_id_type='Ensembl',
    target_id_type='GeneID',
)
ensembl2entrez_map = df_map.set_index('ID_from')['ID_to'].to_dict()

# Ensembl IDs without an entry in the mapping are silently left out.
protein_coding_genes = {
    ensembl2entrez_map[ensembl_id]
    for ensembl_id in protein_coding_genes_ensembl
    if ensembl_id in ensembl2entrez_map
}

In [None]:
# Report how many genes survive each step: all genes -> protein-coding
# Ensembl IDs -> distinct mapped Entrez IDs.
n_all = len(all_genes)
n_coding = len(protein_coding_genes_ensembl)
n_entrez = len(protein_coding_genes)
print(f'{n_all} -protein_coding-> {n_coding} -entrez_id-> {n_entrez}')

In [None]:
# Enrichment helper built from the loaded gene groupings and the Entrez
# protein-coding gene set. NOTE(review): the second argument presumably acts
# as the background gene universe -- confirm against SetEnrichmentComputer.
sec = SetEnrichmentComputer(pathway_groupings, protein_coding_genes)

# Computations

In [None]:
# Cutoff applied to adjusted enrichment p-values in the per-disease loop;
# also drawn (as -log10) as the reference line in the NC-vs-boundary plot.
pvalue_threshold = 0.05

In [None]:
# Compute, for every disease, the network coherence of its associated genes
# and the summed -log10 of its significant adjusted enrichment p-values.
# The result is cached as CSV because the loop is expensive.
cache_dir = config['output_dirs']['cache']
fname = os.path.join(cache_dir, 'disease_computations.csv')

# NOTE(review): the cache is keyed by file name only. Changing ppi_source or
# pvalue_threshold does not invalidate it; delete the file to recompute.
if os.path.exists(fname):
    print('Cached', fname)
    df_data = pd.read_csv(fname)
else:
    # Create the cache directory *before* the long loop, so the final
    # to_csv() cannot fail on a missing directory and discard the results.
    os.makedirs(os.path.dirname(fname) or '.', exist_ok=True)

    ppi_nodes = set(ppi_graph.nodes())

    data = []
    for disease, group in tqdm(df_snpdb.groupby('diseaseId')):
        genes = set(group['associated_genes'].dropna().tolist())

        # network coherence, restricted to genes present in the PPI graph
        nc = compute_network_coherence(ppi_graph, genes & ppi_nodes)

        # enrichment: keep significant terms and sum their -log10(p_adj)
        terms = sec.get_terms(genes, throw_on_assert=False)
        significant = terms[terms['p_value_adj'] < pvalue_threshold]
        pval_sum = (-np.log10(significant['p_value_adj'].astype(float))).sum()

        data.append((disease, nc, pval_sum))

    df_data = pd.DataFrame(data, columns=['disease', 'network_coherence', 'pval_sum'])
    df_data.to_csv(fname, index=False)

## Basic computed data statistics

In [None]:
# Preview the per-disease table (when freshly computed, its columns are
# disease, network_coherence and pval_sum).
df_data.head()

In [None]:
# Count the diseases for which no network coherence could be computed and
# report that count against the number of distinct diseases.
nc_missing = df_data['network_coherence'].isnull()
nan_count = nc_missing.sum()
n_diseases = len(df_data['disease'].unique())
print(f'{nan_count}/{n_diseases} diseases have NaN network coherence')

# Merge data sources

In [None]:
# Lookup from diseaseId to its is_cancer flag, used to colour the plots below.
disease2cancer_map = df_snpdb.set_index('diseaseId')['is_cancer'].to_dict()

In [None]:
# Join the computed per-disease measures with the TAD-boundary p-values.
# validate='one_to_one' makes the merge fail loudly if a disease appears
# more than once on either side.
boundary_pvals = df_enr[['disease', 'pval_boundary']]
df = (
    df_data
    .merge(boundary_pvals, on='disease', validate='one_to_one')
    .set_index('disease')
)

df.head()

# Save result

In [None]:
# Write the merged per-disease table (indexed by disease) to the results
# directory.
results_dir = config['output_dirs']['results']
classification_path = f'{results_dir}/disease_classification.csv'
df.to_csv(classification_path)

# Plots

In [None]:
# Directory into which the figure PDFs produced below are saved.
images_dir = config['output_dirs']['images']

In [None]:
# Replace the raw TAD-boundary p-value by its -log10 transform.
# The guard makes the cell safe to re-run: without it, a second execution
# raises a KeyError because 'pval_boundary' has already been dropped.
if 'pval_boundary' in df.columns:
    df = (
        df
        .assign(neglog_pval_boundary=-np.log10(df['pval_boundary']))
        .drop(columns=['pval_boundary'])
    )

## Compare distributions

In [None]:
# Pairwise correlations between the per-disease measures, computed only on
# rows without missing values.
df.dropna().corr()

In [None]:
# Pairwise overview of the per-disease measures: histograms on the diagonal,
# scatter plots everywhere else.
g = sns.PairGrid(df.dropna(), height=4)

# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot is its replacement for plain histograms. Fall back to distplot
# only on seaborn versions that predate histplot.
if hasattr(sns, 'histplot'):
    g.map_diag(sns.histplot)
else:
    g.map_diag(sns.distplot, kde=False)
g.map_offdiag(sns.scatterplot)

g.savefig(f'{images_dir}/classification_distributions.pdf')

### Focus on NC versus TAD-boundary enrichment

In [None]:
# Complete rows of df, with an is_cancer column looked up from the disease
# index.
tmp = df.dropna().assign(
    is_cancer=lambda frame: frame.index.map(disease2cancer_map)
)

In [None]:
# Scatter of network coherence against -log10 TAD-boundary p-value,
# coloured by cancer status.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=tmp,
    x='network_coherence',
    y='neglog_pval_boundary',
    hue='is_cancer',
    alpha=1,
    ax=ax,
)

# dashed reference line at -log10 of the p-value threshold
ax.axhline(y=-np.log10(pvalue_threshold), color='grey', linestyle='dashed')

fig.tight_layout()
fig.savefig(f'{images_dir}/nc_vs_boundary.pdf')

## Clustering

In [None]:
# Standardize every column of the complete rows before dimensionality
# reduction. df_scaled is the array returned by fit_transform; the disease
# labels remain available as df_nona.index.
df_nona = df.dropna()
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_nona)

### PCA

In [None]:
# Fit a two-component PCA on the standardized features and keep the projected
# coordinates; `pca` itself is reused below for its explained variance ratios.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df_scaled)

In [None]:
# Wrap the PCA coordinates in a frame indexed by disease and attach the
# cancer flag for colouring.
df_pca = pd.DataFrame(
    X_pca, index=df_nona.index, columns=['PCA_0', 'PCA_1']
).assign(is_cancer=lambda frame: frame.index.map(disease2cancer_map))
df_pca.head()

In [None]:
# Scatter of the diseases in the plane of the first two principal components,
# coloured by cancer status.
plt.figure(figsize=(10,7))
sns.scatterplot(x='PCA_0', y='PCA_1', hue='is_cancer', data=df_pca)

# Label each axis with its component name and explained variance as a
# percentage. Previously the labels were bare fractions (e.g. 0.53) with no
# axis name, which contradicted the "% of variance" title.
explained = pca.explained_variance_ratio_
plt.xlabel(f'PCA_0 ({explained[0]:.0%})')
plt.ylabel(f'PCA_1 ({explained[1]:.0%})')

plt.title('PCA (% of variance)')

plt.tight_layout()
plt.savefig(f'{images_dir}/disease_pca.pdf')

### tSNE

In [None]:
# Two-dimensional t-SNE embedding of the standardized features.
# t-SNE is stochastic; fixing random_state makes the embedding (and the saved
# figure) reproducible across Restart & Run All.
tsne = TSNE(n_components=2, verbose=1, random_state=0)
X_tsne = tsne.fit_transform(df_scaled)

In [None]:
# Wrap the t-SNE coordinates in a frame indexed by disease and attach the
# cancer flag for colouring.
df_tsne = pd.DataFrame(
    X_tsne, index=df_nona.index, columns=['tSNE_0', 'tSNE_1']
).assign(is_cancer=lambda frame: frame.index.map(disease2cancer_map))
df_tsne.head()

In [None]:
# Scatter of the diseases in the t-SNE embedding, coloured by cancer status.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=df_tsne,
    x='tSNE_0',
    y='tSNE_1',
    hue='is_cancer',
    ax=ax,
)
ax.set_title('tSNE')

fig.tight_layout()
fig.savefig(f'{images_dir}/disease_tsne.pdf')