In [31]:
%load_ext autoreload
%autoreload 2
import sys
# Put the pipeline directory on the import path (presumably where
# `network_analysis` lives - confirm). Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
pipeline_src_dir = '../gene-signature-ml-pipeline'
if pipeline_src_dir not in sys.path:
    sys.path.append(pipeline_src_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [226]:
import os
import json
import yaml
from subprocess import Popen, PIPE
from pathlib import Path
from network_analysis import *

In [33]:
# Project root = the directory one level above the current working directory
# (symlinks resolved first).
notebook_dir = Path.cwd().resolve()
project_dir = notebook_dir.parent

In [34]:
# Parse config.yml from the project root into `config`.
config_path = project_dir / 'config.yml'
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

In [35]:
# Directory named by the `data_directory` entry of the config, as a Path.
data_directory_setting = config['data_directory']
data_dir = Path(data_directory_setting)

In [36]:
# Parse data/datasets.json (relative to the project root) into `datasets`.
datasets_path = project_dir / 'data' / 'datasets.json'
with open(datasets_path, 'r') as datasets_file:
    datasets = json.load(datasets_file)

In [37]:
# Parse data/platforms.json (relative to the project root) into `platforms`.
platforms_path = project_dir / 'data' / 'platforms.json'
with open(platforms_path, 'r') as platforms_file:
    platforms = json.load(platforms_file)

In [38]:
# Parse data/comparisons.json (relative to the project root) into `comparisons`.
comparisons_path = project_dir / 'data' / 'comparisons.json'
with open(comparisons_path, 'r') as comparisons_file:
    comparisons = json.load(comparisons_file)

#### notebook 1

In [80]:
# Run the differential-expression step. The helper is not imported by name in
# this notebook; presumably it comes from `from network_analysis import *`.
# The recorded output shows a .tsv path followed by a long-format table with
# columns dataset, control, case, gene_symbol, log_fc and adj_p_val.
differential_expression_results = run_differential_gene_expression_analysis()

/efs/liam/tb-gene-signature-datasets/network-analysis/convert-to-scripts/differential_expression_results.tsv
          dataset control case  gene_symbol    log_fc  adj_p_val
0        GSE19439      hc  atb         A1BG  0.026045   0.917217
1        GSE19439      hc  atb         A1CF  0.091435   0.468395
2        GSE19439      hc  atb          A2M  0.007356   0.964635
3        GSE19439      hc  atb        A2ML1 -0.171059   0.204401
4        GSE19439      hc  atb      A3GALT2 -0.008727   0.967318
...           ...     ...  ...          ...       ...        ...
948963  GSE107994    ltbi  atb         H4C4  1.143974   0.000019
948964  GSE107994    ltbi  atb  SLC44A3-AS1  0.430325   0.223273
948965  GSE107994    ltbi  atb       SUCNR1  1.003668   0.000224
948966  GSE107994    ltbi  atb   AL161787.1  0.357804   0.156444
948967  GSE107994    ltbi  atb      RPL39P3  0.297987   0.191242

[948968 rows x 6 columns]


#### notebook 2

In [146]:
# Combine the per-dataset results using two cut-offs: pval_thresh=0.05 and
# log_fc_thresh=log2(1.5), i.e. a 1.5x fold change if log_fc is on a log2
# scale (assumption - confirm against the helper).
# NOTE(review): `np` is never imported explicitly here; it is presumably
# exposed by `from network_analysis import *`.
merged_results = merge_differential_expression_results(
    differential_expression_results, pval_thresh=0.05, log_fc_thresh=np.log2(1.5))

#### notebook 5

In [147]:
# Display the merged table. In the recorded output each row is a
# (control, case, gene_symbol) combination and each remaining column is a GSE
# dataset.
merged_results

Unnamed: 0,control,case,gene_symbol,GSE107994,GSE19439,GSE19444,GSE28623,GSE29536,GSE34608,GSE42825,...,GSE83456,GSE84076,GSE107993,GSE101705,GSE19442,GSE37250,GSE39939,GSE39940,GSE69581,GSE73408
0,hc,atb,1060P11.3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hc,atb,A2M-AS1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hc,atb,AAED1,0.0,0.0,0.0,0.0,0.0,0.862417,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hc,atb,AAMDC,0.0,0.0,0.0,0.0,0.0,0.000000,0.694117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hc,atb,AAMP,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12612,od,atb,ZNF438,0.0,0.0,0.0,0.0,0.0,0.000000,0.730323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12613,od,atb,ZNF593,0.0,0.0,0.0,0.0,0.0,0.707623,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12614,od,atb,ZNF785,0.0,0.0,0.0,0.0,0.0,0.000000,1.169437,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12615,od,atb,ZNFX1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
# Work on an independent copy restricted to the 'hc' (control) vs 'atb' (case)
# rows. Row labels from merged_results are retained as-is.
is_hc_vs_atb = (merged_results['control'] == 'hc') & (merged_results['case'] == 'atb')
df = merged_results.loc[is_hc_vs_atb].copy()

In [149]:
# Matrix of the per-dataset columns only: one row per row of df, one column
# per remaining (dataset) column.
m = df.drop(columns=['control', 'case', 'gene_symbol']).values

In [167]:
# Number of rows in m (one per gene row of df).
n_genes = len(m)

In [175]:
# Number of datasets (columns of m) in which at least one gene is non-zero.
# Testing `!= 0` per column, rather than `column sum > 0`, means a dataset
# whose entries are mostly negative (or cancel out) is still counted, and the
# result is the same whether or not m has already been reduced to signs.
n_datasets = (m != 0).any(axis=0).sum()

In [150]:
# Reduce every entry to its sign: +1 (positive), -1 (negative), 0 (zero).
# np.sign builds a new array instead of writing into the one obtained from
# `.values`, so it also works when that array is a read-only view (pandas
# Copy-on-Write), and re-running the cell gives the same result.
m = np.sign(m)

In [191]:
# Spot-check a 3x3 window of m (rows 9-11, columns 9-11).
m[9:12,9:12]

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [152]:
# Convert m to an integer array (astype returns a new array).
# NOTE(review): presumably m holds only -1/0/+1 at this point, which depends
# on the sign-conversion cell having run first - confirm the cell order.
m = m.astype(int)

In [205]:
# Row-by-row product of m with itself: entry (i, j) is the sum, over columns,
# of m[i, k] * m[j, k].
edge_weights = m @ m.T

In [206]:
# Boolean mask of the pairs whose absolute pairwise score is at least 3.
edge_weights_mask = np.greater_equal(np.abs(edge_weights), 3)

In [220]:
# (i, j) index pairs of the selected entries strictly above the diagonal.
# k=1 leaves out the diagonal, so a row is never paired with itself (which
# would otherwise become a self-loop edge). The pairs are stored in a list
# rather than a single-use zip iterator, so cells that consume
# `edge_indices` can be re-run without silently getting an empty result.
edge_indices = list(zip(*np.nonzero(np.triu(edge_weights_mask, k=1))))

In [211]:
# Gene symbols as a plain array, in the same row order as df.
# A pandas Series would be looked up by df's row labels, which were inherited
# from merged_results when df was filtered; an array is indexed by position,
# which is what row/column indices taken from the matrix require.
gene_symbols = df['gene_symbol'].to_numpy()

In [223]:
# One (gene_a, gene_b, weight) tuple per selected index pair; the weight is
# the pairwise score divided by n_datasets.
network_edge_list = [
    (gene_symbols[row], gene_symbols[col], float(edge_weights[row, col]) / n_datasets)
    for row, col in edge_indices
]

In [224]:
# Start from an empty graph.
# NOTE(review): `nx` is not imported explicitly in this notebook; presumably
# it is networkx exposed through `from network_analysis import *` - confirm.
graph = nx.Graph()

In [225]:
# Add every (gene_a, gene_b, weight) tuple from network_edge_list to the graph.
graph.add_weighted_edges_from(network_edge_list)

In [None]:
# Call construct_network once per group; the return value is deliberately
# discarded by assigning it to `_`.
# NOTE(review): `groups` and `qval_threshold` are not defined in any cell
# visible here, and this cell has no execution count - confirm where they
# come from before running.
for group in groups:
    _ = construct_network(
        data_dir=data_dir, group=group, qval_thresh=qval_threshold)

#### notebook 6

In [None]:
# Run the cross-network comparison helper on data_dir and keep its result.
# NOTE(review): presumably this reads what the network-construction step
# wrote under data_dir - confirm against the helper.
gene_lists = compare_across_networks(data_dir=data_dir)