In [1]:
%load_ext autoreload
%autoreload 2
import sys
# Add the parent of the working directory to the import search path.
# NOTE(review): presumably this is what makes the `tb_gene_signature_pipeline`
# package imported in the next cell resolvable -- confirm against the repo layout.
sys.path.append('..')

In [2]:
import os
import json
import yaml
import numpy as np
import pandas as pd
from pathlib import Path
import tb_gene_signature_pipeline.network_analysis as na

In [3]:
# Absolute path of the parent of the current working directory.
# NOTE(review): not referenced again in the visible cells; `this_dir` further
# down is computed with the same expression.
project_dir = Path.cwd().resolve().parent

### Differential Expression Analysis

Generate per-gene differential expression statistics for each dataset and for each case/control group comparison.

In [6]:
# Run the pipeline's differential expression step with its default arguments.
# The result is fed to `na.merge_differential_expression_results` below.
differential_expression_results = na.run_differential_expression_analysis()

### Merge Differential Expression Results

Given differential expression statistics across datasets and comparisons, filter the results to include
only genes with `adj_p_val <= 0.05` and effect size `log_fc >= np.log2(1.5)`. Combine filtered log fold
change effect sizes into a ``(gene * n_comparisons)-by-dataset`` dataframe.

In [7]:
# Cut-offs handed to the merge step: adjusted p-value and log2 fold change.
adj_pval_thresh = 0.05
log_fc_thresh = np.log2(1.5)

merged_results = na.merge_differential_expression_results(
    differential_expression_results,
    adj_pval_thresh=adj_pval_thresh,
    log_fc_thresh=log_fc_thresh,
)

### Construct a gene network

Use the significant log fold change effect size estimates to construct a network where nodes correspond to genes
and an edge between two nodes represents significant differential expression for each of the two genes in the
same direction (positive or negative) in **at least 3 datasets**.

A separate network is constructed for each comparison group (`hc` vs. `atb`, etc.).

Also computes degree, weighted degree, and eigenvector centrality measures for each node in the networks.

In [8]:
# Build the networks from `merged_results` (the thresholded effect sizes
# produced by the merge step above).
networks = na.construct_networks(merged_results)

### Generate gene lists

From the constructed gene networks, generate lists of genes by intersecting genes found in the top
100 (as measured by weighted degree) in each comparison network.

In [17]:
# Reduce the networks to named gene lists. The result is used below as a
# mapping (`.items()`, key lookup) from list name to a collection of genes.
gene_lists = na.combine_networks_into_lists(networks)

### Results

Inspect the generated gene lists.

In [18]:
# Show each generated list: its name, its size, then its members.
for list_name, genes in gene_lists.items():
    print(
        f'{list_name}: {len(genes)} genes',
        '\n',
        genes,
        '\n',
    )

top_genes_in_all_networks: 27 genes 
 ['GK', 'CARD17', 'TIMM10', 'GBP1', 'WARS', 'UBE2L6', 'IFI44', 'KCNJ15', 'CASP5', 'EPSTI1', 'BATF2', 'SERPING1', 'FCGR1B', 'GBP5', 'LHFPL2', 'LAP3', 'SAMD9L', 'CEACAM1', 'VAMP5', 'C1QB', 'RTP4', 'ANKRD22', 'FBXO6', 'PSTPIP2', 'AIM2', 'IFITM3', 'IFIT3'] 

top_genes_not_in_od_network: 21 genes 
 ['ANXA3', 'LY96', 'NAIP', 'CAMP', 'PLSCR1', 'ZNF438', 'BST1', 'S100A12', 'LRRK2', 'FLVCR2', 'SORT1', 'HP', 'JAK2', 'ADM', 'FAM26F', 'CARD16', 'CD274', 'CR1', 'TLR5', 'IFITM1', 'BPI'] 



Compare to Roger's original lists.

In [19]:
# Base directory for the `data/` files read and written below.
# NOTE(review): despite the name this is the *parent* of the current working
# directory, i.e. the same value as `project_dir` defined near the top.
this_dir = Path.cwd().resolve().parent

In [27]:
# Load the original "all networks" gene list, one entry per line.
# Blank or whitespace-only lines (e.g. a trailing empty line at end of file)
# are skipped; otherwise they would become '' entries and inflate the gene
# count printed below.
with (this_dir / 'data' / 'roger_top_genes_all_networks.txt').open('r') as f:
    roger_top_genes_in_all_networks = [
        gene for gene in (line.strip() for line in f) if gene
    ]

In [28]:
# Load the original "not in OD network" gene list, one entry per line.
# Blank or whitespace-only lines (e.g. a trailing empty line at end of file)
# are skipped; otherwise they would become '' entries and inflate the gene
# count printed below.
with (this_dir / 'data' / 'roger_top_genes_not_in_od_network.txt').open('r') as f:
    roger_top_genes_not_in_od_network = [
        gene for gene in (line.strip() for line in f) if gene
    ]

In [29]:
# Show the two original lists in the same "name: N genes / members" layout
# used for the newly generated lists.
for label, genes in (
    ('roger_top_genes_all_networks', roger_top_genes_in_all_networks),
    ('roger_top_genes_not_in_od_network', roger_top_genes_not_in_od_network),
):
    print(f'{label}: {len(genes)} genes', '\n', genes, '\n')

roger_top_genes_all_networks: 24 genes 
 ['GBP5', 'IFIT3', 'LAP3', 'VAMP5', 'PSTPIP2', 'C1QB', 'DUSP3', 'CEACAM1', 'WARS', 'XAF1', 'SERPING1', 'GBP4', 'BATF2', 'ANKRD22', 'TLR5', 'SMARCD3', 'ADM', 'AIM2', 'FCGR1B', 'FAM26F', 'EPSTI1', 'GBP1', 'IFITM3', 'ANXA3'] 

roger_top_genes_not_in_od_network: 23 genes 
 ['TNFSF10', 'DYSF', 'IL1B', 'GK', 'LHFPL2', 'TRIM22', 'CLEC4D', 'TNFSF13B', 'PLAUR', 'LY96', 'FBXO6', 'KCNJ15', 'SPOCK2', 'TMEM204', 'IL7R', 'GPR183', 'ID3', 'CD6', 'KLRB1', 'PIK3IP1', 'CCR7', 'NELL2', 'SKAP1'] 



Intersection of top genes in all networks, original and new lists:

In [34]:
# Genes present in both the new and the original "all networks" lists.
# The result is sorted because the iteration order of a set of strings changes
# between kernel sessions (string hash randomization), which would make the
# displayed list differ from run to run.
all_networks_intersection = sorted(
    set(gene_lists['top_genes_in_all_networks'])
    & set(roger_top_genes_in_all_networks)
)

In [35]:
# Report how many genes the two "all networks" lists share, then list them.
print(
    f'all networks intersection: {len(all_networks_intersection)} genes',
    '\n',
    all_networks_intersection,
)

all networks intersection: 16 genes 
 ['C1QB', 'SERPING1', 'ANKRD22', 'FCGR1B', 'GBP5', 'PSTPIP2', 'IFITM3', 'LAP3', 'CEACAM1', 'AIM2', 'VAMP5', 'EPSTI1', 'GBP1', 'BATF2', 'WARS', 'IFIT3']


Intersection of top genes not in OD networks, original and new lists:

In [37]:
# Genes present in both the new and the original "not in OD network" lists.
# The result is sorted because the iteration order of a set of strings changes
# between kernel sessions (string hash randomization), which would make the
# displayed list differ from run to run.
not_in_od_network_intersection = sorted(
    set(gene_lists['top_genes_not_in_od_network'])
    & set(roger_top_genes_not_in_od_network)
)

In [38]:
# Report how many genes the two "not in OD network" lists share, then list them.
print(
    f'not in OD network intersection: {len(not_in_od_network_intersection)} genes',
    '\n',
    not_in_od_network_intersection,
)

not in OD network intersection: 1 genes 
 ['LY96']


Write out gene lists.

In [40]:
# Persist the "all networks" list as plain text, one gene per line.
with (this_dir / 'data' / 'top_genes_in_all_networks.txt').open('w') as f:
    f.writelines(
        gene + '\n' for gene in gene_lists['top_genes_in_all_networks']
    )

In [41]:
# Persist the "not in OD network" list as plain text, one gene per line.
with (this_dir / 'data' / 'top_genes_not_in_od_network.txt').open('w') as f:
    f.writelines(
        gene + '\n' for gene in gene_lists['top_genes_not_in_od_network']
    )