In [None]:
# render matplotlib figures inline in the notebook output
%matplotlib inline

# autoreload mode 2: re-import modules before running a cell
%load_ext autoreload
%autoreload 2
# ipycache provides the %%cache cell magic used further down in this notebook
%load_ext ipycache

In [None]:
import json
import os
import random

import numpy as np
import pandas as pd
import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt

import mygene

from tqdm import tqdm_notebook as tqdm

from data_reader import *
from measures import *

In [None]:
# never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

# fix lock.acquire() stalling of tqdm_notebook:
# empty the `locks` list held by tqdm's shared lock object
from tqdm import tqdm as tqdm_orig
tqdm_lock = tqdm_orig.get_lock()
tqdm_lock.locks = []

## Load data

In [None]:
# GWAS table as returned by the project loader
df_gwas = load_gwas()

# Load every PPI network exactly once and reuse the graph objects below.
# The loaders return a sequence whose first element is the graph.
biogrid_graph = load_biogrid()[0]
stringdb_graph = load_stringdb()[0]

# (display name, graph) pairs; every network analysis below iterates over this list
ppi_graph_list = [
    ('BioGRID', biogrid_graph),
    ('StringDB', stringdb_graph),
    # nx.compose builds a new graph, so the two source graphs stay unmodified
    ('BioGRID+StringDB', nx.compose(biogrid_graph, stringdb_graph))
]

df_snps = load_snp_data()

In [None]:
# convert GWAS reported genes to ENTREZ IDs
# The symbol -> ENTREZ mapping is cached as JSON; mygene is only queried
# when the cache file does not exist yet.
fname = 'cache/gene_id_conversions.json'
if os.path.exists(fname):
    with open(fname) as fd:
        gene_conv = json.load(fd)
else:
    # assemble set of all gene symbols
    all_genes = set()
    for row in tqdm(df_gwas.itertuples(), total=df_gwas.shape[0]):
        # positional field 14 of the row tuple (field 0 is the index);
        # presumably the reported-genes column -- TODO confirm against load_gwas()
        reported_genes = row._14
        if not isinstance(reported_genes, str):  # NaN (or any non-text cell): nothing to split
            continue
        # entries are comma separated; strip padding so 'A, B' yields 'A' and 'B'
        # and drop empty fragments left by stray commas
        for symbol in reported_genes.split(','):
            symbol = symbol.strip()
            if symbol:
                all_genes.add(symbol)
    print(f'Found {len(all_genes)} unique gene symbols')

    # convert symbols to ENTREZ IDs (sorted list keeps the request order stable)
    mg = mygene.MyGeneInfo()
    res = mg.querymany(sorted(all_genes), scopes='symbol', species=9606)

    gene_conv = {}
    for entry in tqdm(res):
        # skip symbols without a hit and hits that carry no ENTREZ id
        if entry.get('notfound') or 'entrezgene' not in entry:
            continue

        sym = entry['symbol']
        entrez = entry['entrezgene']

        # a symbol can show up in several hits; the last one wins
        gene_conv[sym] = entrez
    print(f'Found mapping for {len(gene_conv)} symbols')

    # cache result (create the cache directory on first use)
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    with open(fname, 'w') as fd:
        json.dump(gene_conv, fd)

In [None]:
# get SNP gene ENTREZ id map
# `sub` holds one row per GWAS entry: the rs-prefixed SNP id, the raw gene-id
# string and the first id of that comma separated string.
sub = df_gwas[['SNP_ID_CURRENT', 'SNP_GENE_IDS']].assign(
    SNP_ID_CURRENT=lambda frame: 'rs' + frame['SNP_ID_CURRENT'],
    GENE_ENTREZ=lambda frame: frame['SNP_GENE_IDS'].str.split(',').str[0],
)

# rows with a missing value are dropped before the lookup dict is built
sub_complete = sub.set_index('SNP_ID_CURRENT').dropna()
snp_gene_map = sub_complete['GENE_ENTREZ'].to_dict()

# annotate every SNP with its gene; ids that are missing or not numeric become NaN
entrez = df_snps['SNP_name'].map(snp_gene_map)
df_snps['Gene_entrez'] = pd.to_numeric(entrez, errors='coerce')

In [None]:
# get disease-name mapping (EFO term -> disease name)
name_by_term = df_snps.set_index('EFO_term')['disease_name']
efo_name_map = name_by_term.to_dict()

## Check SNP data

In [None]:
# preview the first rows of the annotated SNP table
df_snps.head(n=5)

In [None]:
# preview the first rows of the GWAS table
df_gwas.head(n=5)

## Sanity-check network coherence on a toy graph

In [None]:
# toy path graph 1 - 2 - 3 - 4
graph = nx.Graph([(1, 2), (2, 3), (3, 4)])

In [None]:
# node sets of the toy graph to evaluate
toy_node_sets = [(1, 2), (1, 3), (1, 2, 4)]

for ns in toy_node_sets:
    nonisolated_fraction = get_fraction_of_nonisolated_nodes(ns, graph)
    coherence = compute_network_coherence(graph, ns)
    print(ns, nonisolated_fraction, coherence)

## Compute per-disease network coherences

In [None]:
# SNPs without a resolvable gene carry NaN in Gene_entrez; drop those so the
# NaN placeholder is not counted as a gene
snp_genes = set(df_snps['Gene_entrez'].dropna().unique())
snps = set(df_snps['SNP_name'].unique())

print('SNP genes:', len(snp_genes))
print('SNPs:', len(snps))
print()

# per network: number of nodes and how many of them are SNP genes
for name, ppi_graph in ppi_graph_list:
    ppi_genes = set(ppi_graph.nodes())

    print(name)
    print(' > PPI genes:', len(ppi_genes))
    print(' > PPI/SNP gene overlap:', len(ppi_genes & snp_genes))

In [None]:
def compute_nc(sub_df, min_gene_num=4):
    """Compute the network coherence of every disease in `sub_df` on each PPI network.

    Reads the notebook globals `ppi_graph_list` ((name, graph) pairs) and
    `df_snps` (source of the background gene set handed to
    `compute_network_coherence` as `random_nodes`).

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Rows to analyse; needs the columns `is_cancer`, `is_tad`, `EFO_term`
        and `Gene_entrez`, and must contain a single `is_cancer` value.
    min_gene_num : int, default 4
        Minimum number of a disease's genes that have to be nodes of the PPI
        graph; with fewer genes the coherence is recorded as NaN.

    Returns
    -------
    pandas.DataFrame
        One row per (network, EFO term) with the columns `EFO_term`,
        `network_coherence`, `network_name`, `ppi_gene_num`, `ppi_genes`,
        `total_gene_num`, `total_genes`, `is_cancer` and `tad_relation`
        ('inside', 'outside' or 'mixed').

    Raises
    ------
    AssertionError
        If `sub_df` is empty or mixes cancer and non-cancer rows.
    """
    nc_data = []

    # must be (non)cancer only
    cancer_flags = sub_df.is_cancer.unique()
    assert len(cancer_flags) == 1, (
        f'sub_df must contain exactly one is_cancer value, found {list(cancer_flags)}')
    is_cancer = sub_df.is_cancer.iloc[0]

    # determine TAD-Border relation
    if len(sub_df.is_tad.unique()) == 1: # either only inside or outside
        tad_relation = 'inside' if sub_df.is_tad.iloc[0] else 'outside'
    else:
        tad_relation = 'mixed'

    # Background genes: every SNP gene of the full data set. This does not
    # depend on the network, so it is built once; NaN placeholders are dropped
    # because they can never be graph nodes.
    snp_genes = set(df_snps['Gene_entrez'].dropna().unique())

    # compute network coherences
    for ppi_name, ppi_graph in tqdm(ppi_graph_list):
        ppi_nodes = ppi_graph.nodes()

        for term, group in tqdm(sub_df.groupby('EFO_term')):
            genes = set(group['Gene_entrez'].dropna().unique())
            genes_in_ppi = genes & ppi_nodes

            if len(genes_in_ppi) >= min_gene_num:
                nc = compute_network_coherence(
                    ppi_graph, genes_in_ppi,
                    # fresh set per call, so the callee never shares state between diseases
                    random_nodes=(snp_genes & ppi_nodes)
                )
            else:
                # too few genes in the network for a meaningful value
                nc = np.nan

            nc_data.append((term, nc, ppi_name, len(genes_in_ppi), genes_in_ppi, len(genes), genes))

    df_nc = pd.DataFrame(
        nc_data,
        columns=[
            'EFO_term', 'network_coherence', 'network_name',
            'ppi_gene_num', 'ppi_genes', 'total_gene_num', 'total_genes'])
    df_nc['is_cancer'] = is_cancer
    df_nc['tad_relation'] = tad_relation
    return df_nc

In [None]:
# boolean row masks shared by the six subsets below
cancer_rows = df_snps.is_cancer
tad_rows = df_snps.is_tad

# cancer diseases: all rows, rows with is_tad set, rows with is_tad unset
df_nc_cancer_all = compute_nc(df_snps[cancer_rows])
df_nc_cancer_tad = compute_nc(df_snps[cancer_rows & tad_rows])
df_nc_cancer_notad = compute_nc(df_snps[cancer_rows & ~tad_rows])

# non-cancer diseases: same three subsets
df_nc_nocancer_all = compute_nc(df_snps[~cancer_rows])
df_nc_nocancer_tad = compute_nc(df_snps[~cancer_rows & tad_rows])
df_nc_nocancer_notad = compute_nc(df_snps[~cancer_rows & ~tad_rows])

# concat result
nc_frames = [
    df_nc_cancer_all, df_nc_cancer_tad, df_nc_cancer_notad,
    df_nc_nocancer_all, df_nc_nocancer_tad, df_nc_nocancer_notad,
]
df_nc = pd.concat(nc_frames, axis=0)
df_nc['tad_relation'] = df_nc['tad_relation'].astype('category')

In [None]:
# further annotations
# Look-up table built once: EFO term -> `disease_tad_enriched` value of the
# term's first row in df_snps (avoids filtering df_snps for every df_nc row).
tad_enrichment_by_term = (
    df_snps
    .drop_duplicates(subset='EFO_term')
    .set_index('EFO_term')['disease_tad_enriched']
    .to_dict()
)


def disease_has_tad_enrichment(term):
    """Return the `disease_tad_enriched` flag of the first df_snps row for `term`.

    Raises KeyError if `term` does not occur in df_snps.
    """
    return tad_enrichment_by_term[term]


df_nc['disease_has_tad_enrichment'] = df_nc['EFO_term'].apply(disease_has_tad_enrichment)

# human-readable disease name; terms missing from the map keep their EFO id
df_nc['disease_name'] = df_nc['EFO_term'].replace(efo_name_map)

In [None]:
# create the output directory on demand so the export also works on a fresh checkout
os.makedirs('results', exist_ok=True)
df_nc.to_csv('results/nc_data.tsv', sep='\t', index=False)
df_nc.head()

### Compare NC-values between used networks

In [None]:
# One column of coherence values per network. The columns are aligned purely
# by row order, which relies on every network contributing the same diseases
# in the same order (compute_nc iterates the same groups for each network).
df_nc_trans = pd.DataFrame({
    f'NC_{ppi_name}': df_nc.loc[df_nc['network_name'] == ppi_name, 'network_coherence'].tolist()
    for ppi_name, _ in ppi_graph_list
})

# pairwise scatter plots between networks, KDE of each network on the diagonal
sns.pairplot(df_nc_trans, diag_kind='kde')

# create the figure directory on demand so saving works on a fresh checkout
os.makedirs('images', exist_ok=True)
plt.savefig('images/nc_network_comparison.pdf')

### Plot result

In [None]:
# Network coherence by cancer status and TAD relation, split into diseases
# with (left) and without (right) TAD enrichment.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = [
    (df_nc[df_nc.disease_has_tad_enrichment], 'Disease with TAD enrichment'),
    (df_nc[~df_nc.disease_has_tad_enrichment], 'Disease without TAD enrichment'),
]
for ax, (panel_data, panel_title) in zip(axes, panels):
    sns.boxplot(x='is_cancer', y='network_coherence', data=panel_data, hue='tad_relation', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()

# create the figure directory on demand so saving works on a fresh checkout
os.makedirs('images', exist_ok=True)
fig.savefig('images/nc_cancer_tad.pdf')

In [None]:
# Number of genes per disease by cancer status and TAD relation, split into
# diseases with (left) and without (right) TAD enrichment.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = [
    (df_nc[df_nc.disease_has_tad_enrichment], 'Disease with TAD enrichment'),
    (df_nc[~df_nc.disease_has_tad_enrichment], 'Disease without TAD enrichment'),
]
for ax, (panel_data, panel_title) in zip(axes, panels):
    sns.boxplot(x='is_cancer', y='total_gene_num', data=panel_data, hue='tad_relation', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()

# create the figure directory on demand so saving works on a fresh checkout
os.makedirs('images', exist_ok=True)
fig.savefig('images/genenum_cancer_tad.pdf')

### Another representation: network coherence per cancer disease across TAD relations

In [None]:
# one line per cancer disease: network coherence across the TAD relations
ax = sns.pointplot(x='tad_relation', y='network_coherence', data=df_nc[df_nc.is_cancer], hue='disease_name')
# legend goes to the right of the axes so it does not cover the lines
lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

# create the figure directory on demand so saving works on a fresh checkout
os.makedirs('images', exist_ok=True)
# the legend lives outside the axes; pass it explicitly so the tight bounding box includes it
ax.figure.savefig('images/line_nc.pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')

## Small experiment: network coherence of random node sets

In [None]:
%%cache random_nc.pkl df_random --cachedir cache

# Experiment parameters
RANDOM_SEED = 42                  # fixed seed: the same node sets are drawn on every fresh run
RANDOM_SET_SIZES = range(2, 100)  # node-set sizes to probe
SETS_PER_SIZE = 10                # random node sets drawn per size

random.seed(RANDOM_SEED)
# NOTE(review): only relevant if compute_network_coherence draws from numpy's
# global RNG -- confirm in measures.py
np.random.seed(RANDOM_SEED)

ppi_graph = ppi_graph_list[0][1] # BioGRID graph

ppi_nodes = list(ppi_graph.nodes())
# for every size k: SETS_PER_SIZE samples of k distinct nodes
random_note_sets = [
    [random.sample(ppi_nodes, k) for _ in range(SETS_PER_SIZE)]
    for k in RANDOM_SET_SIZES
]

# one (coherence, set size) record per sampled node set
random_data = []
for ns_list in tqdm(random_note_sets):
    for ns in ns_list:
        nc = compute_network_coherence(ppi_graph, ns)
        random_data.append((nc, len(ns)))
df_random = pd.DataFrame(random_data, columns=['network_coherence', 'node_num'])

In [None]:
# coherence of the random node sets against their size
ax = df_random.plot.scatter(x='node_num', y='network_coherence')

# create the figure directory on demand so saving works on a fresh checkout
os.makedirs('images', exist_ok=True)
ax.figure.savefig('images/nc_random_nodes.pdf')