In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the autoreload extension and switch it to mode 2, so imported modules
# are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd
import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

from data_reader import *
from measures import *

In [None]:
# Do not truncate columns when DataFrames are displayed.
pd.set_option('display.max_columns', None)

# fix lock.acquire() stalling of tqdm_notebook
# Workaround: replaces the `locks` attribute of the lock object returned by
# tqdm.get_lock() with an empty list.
# NOTE(review): this touches tqdm internals; confirm it is still needed (and
# still valid) for the installed tqdm version.
from tqdm import tqdm as tqdm_orig
tqdm_orig.get_lock().locks = []

## Load data

In [None]:
# GWAS table; its SNP_ID_CURRENT and SNP_GENE_IDS columns are used later in this notebook.
df_gwas = load_gwas()
# PPI network graph plus a companion DataFrame
# (presumably the BioGRID interaction table -- confirm in data_reader).
ppi_graph, df_ppi = load_biogrid()

# Per-SNP table; columns used later in this notebook: SNP_name, EFO_term, is_cancer, is_tad.
df_snps = load_snp_data()

In [None]:
# Build a lookup from rs-prefixed SNP id to the first Entrez gene id listed for that SNP.
sub = df_gwas[['SNP_ID_CURRENT', 'SNP_GENE_IDS']].assign(
    SNP_ID_CURRENT=lambda frame: 'rs' + frame['SNP_ID_CURRENT'],
    GENE_ENTREZ=lambda frame: frame['SNP_GENE_IDS'].str.split(',').str[0],
)

# Rows without a gene annotation are left out of the map.
snp_gene_map = (
    sub.dropna(subset=['SNP_GENE_IDS', 'GENE_ENTREZ'])
       .set_index('SNP_ID_CURRENT')['GENE_ENTREZ']
       .to_dict()
)

# Attach the gene id to every SNP row; unmapped or non-numeric ids become NaN.
entrez = df_snps['SNP_name'].map(snp_gene_map)
df_snps['Gene_entrez'] = pd.to_numeric(entrez, errors='coerce')

## Check SNP data

In [None]:
# Preview the first rows of the SNP table, which now carries the Gene_entrez column.
df_snps.head()

## Sanity-check the network coherence measures on a toy graph

In [None]:
# Toy path graph 1-2-3-4 used to try out the measures by hand.
graph = nx.Graph([(1, 2), (2, 3), (3, 4)])

In [None]:
# Print both measures for a few hand-picked node sets of the toy graph.
for ns in ((1, 2), (1, 3), (1, 2, 4)):
    print(
        ns,
        get_fraction_of_nonisolated_nodes(ns, graph),
        compute_network_coherence(graph, ns),
    )

## Compute per-disease network coherences

In [None]:
# Summary of how many SNP-associated genes are present in the PPI network.
ppi_genes = set(ppi_graph.nodes())
# SNPs without a mapped gene have NaN in Gene_entrez, and Series.unique() keeps
# NaN; drop it first so it is not counted as a gene.
snp_genes = set(df_snps['Gene_entrez'].dropna().unique())
snps = set(df_snps['SNP_name'].unique())

print('PPI genes:', len(ppi_genes))
print('SNP genes:', len(snp_genes))
print('SNPs:', len(snps))
print('PPI/SNP gene overlap:', len(ppi_genes & snp_genes))

In [None]:
def compute_nc(sub_df):
    """Compute the network coherence of every EFO term in a homogeneous SNP subset.

    Uses the notebook-level ``ppi_graph`` as the network and delegates the
    measure itself to ``compute_network_coherence``.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        SNP rows with the columns ``EFO_term``, ``Gene_entrez``, ``is_cancer``
        and ``is_tad``. All rows must share one ``is_cancer`` value and one
        ``is_tad`` value.

    Returns
    -------
    pandas.DataFrame
        One row per EFO term with the columns ``EFO_term``,
        ``network_coherence`` (NaN when none of the term's genes is a node of
        ``ppi_graph``), ``ppi_gene_num``, ``total_gene_num``, ``is_cancer``
        and ``is_tad``.

    Raises
    ------
    AssertionError
        If ``sub_df`` is empty or mixes several ``is_cancer`` / ``is_tad`` values.
    """
    nc_data = []

    # must be (non)cancer/(non)tad only
    assert len(sub_df.is_cancer.unique()) == 1, 'sub_df must contain exactly one is_cancer value'
    is_cancer = sub_df.is_cancer.iloc[0]
    assert len(sub_df.is_tad.unique()) == 1, 'sub_df must contain exactly one is_tad value'
    is_tad = sub_df.is_tad.iloc[0]

    # compute network coherences
    ppi_nodes = ppi_graph.nodes()
    for term, group in tqdm(sub_df.groupby('EFO_term')):
        # SNPs without a mapped gene carry NaN in Gene_entrez; Series.unique()
        # keeps NaN, so drop it first or it would be counted in total_gene_num.
        genes = set(group['Gene_entrez'].dropna().unique())
        genes_in_ppi = genes & ppi_nodes

        if len(genes_in_ppi) > 0:
            nc = compute_network_coherence(ppi_graph, genes_in_ppi)
        else:
            # no gene of this term is in the network: coherence is undefined
            nc = np.nan

        nc_data.append((term, nc, len(genes_in_ppi), len(genes)))

    df_nc = pd.DataFrame(nc_data, columns=['EFO_term', 'network_coherence', 'ppi_gene_num', 'total_gene_num'])
    df_nc['is_cancer'] = is_cancer
    df_nc['is_tad'] = is_tad
    return df_nc

In [None]:
# compute_nc asserts that its input has a single is_cancer and a single is_tad
# value, so the SNP table is split into the four flag combinations first.
df_nc_cancer_tad = compute_nc(df_snps[df_snps.is_cancer & df_snps.is_tad])
df_nc_cancer_notad = compute_nc(df_snps[df_snps.is_cancer & ~df_snps.is_tad])
df_nc_nocancer_tad = compute_nc(df_snps[~df_snps.is_cancer & df_snps.is_tad])
df_nc_nocancer_notad = compute_nc(df_snps[~df_snps.is_cancer & ~df_snps.is_tad])

# Each partial frame is indexed from 0, so a plain concat would repeat row
# labels across groups; ignore_index=True gives the combined frame unique labels.
df_nc = pd.concat(
    [df_nc_cancer_tad, df_nc_cancer_notad, df_nc_nocancer_tad, df_nc_nocancer_notad],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined per-term network coherence table.
df_nc.head()

## Plot result

In [None]:
# Distribution of per-term network coherence, grouped by is_cancer and coloured by is_tad.
# An explicit figure/axes pair keeps this plot independent of whatever figure
# happens to be current (e.g. when the cell is run outside the inline backend).
fig, ax = plt.subplots()
sns.boxplot(x='is_cancer', y='network_coherence', data=df_nc, hue='is_tad', ax=ax)
fig.tight_layout()

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
fig.savefig('images/nc_cancer_tad.pdf')

In [None]:
# Distribution of the number of genes per term, grouped by is_cancer and coloured by is_tad.
# An explicit figure/axes pair guarantees this plot is not drawn on top of a
# previously created figure.
fig, ax = plt.subplots()
sns.boxplot(x='is_cancer', y='total_gene_num', data=df_nc, hue='is_tad', ax=ax)
fig.tight_layout()

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
fig.savefig('images/genenum_cancer_tad.pdf')