## Simulations to verify network regularization

* Take genes from TCGA pancancer dataset with top $k$ coefficients and connect them in a network (unweighted or randomly weighted)
* Baseline: network from $k$ random genes in dataset
* Could subset data features to ($k$ top coefficients + $k$ random genes)

In [24]:
import os
import sys; sys.path.append('..')
import numpy as np
import pandas as pd
import networkx as nx

import config as cfg
np.random.seed(cfg.default_seed)

In [25]:
tcga_train_df = pd.read_csv(cfg.rnaseq_train, index_col=0, sep='\t')
tcga_train_df.head()

Unnamed: 0_level_0,1,10,100,1000,10000,10001,10002,10003,100037417,10004,...,9987,9988,9989,999,9990,9991,9992,9993,9994,9997
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-LL-A73Z-01,202.0,28.5,329.0,84.5,492.0,448.0,4.59,14.7,337.0,129.0,...,3430.0,717.0,1800.0,6360.0,299.0,2310.0,10.6,3190.0,337.0,892.0
TCGA-55-8207-01,77.5,22.5,74.5,13.1,784.0,333.0,2.54,176.0,153.0,68.3,...,6050.0,923.0,2490.0,11300.0,1150.0,4030.0,9.08,2890.0,316.0,301.0
TCGA-FF-A7CR-01,152.0,0.0,3020.0,26.6,486.0,497.0,0.0,8.47,348.0,91.6,...,4930.0,897.0,861.0,39.7,464.0,3320.0,0.0,1330.0,606.0,558.0
TCGA-BK-A13C-11,80.5,40.0,70.6,284.0,2420.0,325.0,1.2,91.4,231.0,241.0,...,3890.0,737.0,1410.0,10.9,1120.0,1990.0,5.24,3090.0,673.0,263.0
TCGA-EB-A6L9-06,319.0,0.0,422.0,184.0,423.0,392.0,0.945,2.36,585.0,143.0,...,1930.0,328.0,1340.0,7010.0,450.0,563.0,10.9,3780.0,37.3,1120.0


In [26]:
raw_results = os.path.join(cfg.results_dir, 
                           'canonical_pathways',
                           'mutation',
                           'TP53')
                           
raw_coefficients_df = pd.read_csv(os.path.join(raw_results,
                                               'TP53_raw_coefficients.tsv.gz'),
                                  sep='\t')
raw_coefficients_df = raw_coefficients_df.loc[raw_coefficients_df['signal'] == 'signal']
raw_coefficients_df.head(n=15)

Unnamed: 0,feature,weight,abs,signal,z_dim,seed,algorithm,gene
0,1643,-0.23002,0.23002,signal,8000,42,raw,TP53
1,64782,-0.22887,0.22887,signal,8000,42,raw,TP53
2,51065,-0.20935,0.20935,signal,8000,42,raw,TP53
3,7508,-0.14826,0.14826,signal,8000,42,raw,TP53
4,4193,-0.14346,0.14346,signal,8000,42,raw,TP53
5,1026,-0.14096,0.14096,signal,8000,42,raw,TP53
6,9526,-0.1309,0.1309,signal,8000,42,raw,TP53
7,581,-0.10379,0.10379,signal,8000,42,raw,TP53
8,2232,-0.10246,0.10246,signal,8000,42,raw,TP53
9,23612,-0.083962,0.083962,signal,8000,42,raw,TP53


In [27]:
top_feats = raw_coefficients_df.iloc[:10, :].feature.values
top_weights = raw_coefficients_df.iloc[:10, :].weight.values
random_feats = np.random.choice(raw_coefficients_df.feature.values,
                                size=top_feats.shape[0])

def all_same_sign(ar):
    return np.all(ar > 0) if ar[0] > 0 else np.all(ar < 0)

assert all_same_sign(top_weights)

In [28]:
# unweighted network from top features (and random features)
n1 = top_feats.shape[0]
n2 = random_feats.shape[0]
adj_uw = np.block([
            [np.ones((n1, n1)), np.zeros((n1, n2))],
            [np.zeros((n2, n1)), np.zeros((n2, n2))]
])
# weighted network with random weights in [0.5, 1)
adj_w = np.block([
            [np.random.uniform(low=0.5, size=(n1, n1)), np.zeros((n1, n2))],
            [np.zeros((n2, n1)), np.zeros((n2, n2))]
])
adj_random = np.random.uniform(size=(n1+n2, n1+n2))
nodelist = np.concatenate((top_feats, random_feats))

In [29]:
tcga_subset = tcga_train_df.loc[:, nodelist]
tcga_subset.head()

Unnamed: 0_level_0,1643,64782,51065,7508,4193,1026,9526,581,2232,23612,2935,6850,23370,6222,5834,7058,197335,162394,25803,5064
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
TCGA-LL-A73Z-01,396.0,509.0,1690.0,1280.0,1090.0,1130.0,1210.0,1030.0,260.0,1690.0,2820.0,801.0,921.0,30100.0,4810.0,3260.0,487.0,614.0,3950.0,731.0
TCGA-55-8207-01,360.0,789.0,1070.0,1360.0,1610.0,4470.0,1130.0,739.0,321.0,2050.0,2890.0,1190.0,1350.0,32100.0,7420.0,5880.0,645.0,1620.0,379.0,555.0
TCGA-FF-A7CR-01,571.0,1670.0,965.0,1100.0,2500.0,1970.0,1170.0,1840.0,347.0,76.2,4380.0,9560.0,1570.0,38400.0,1170.0,156.0,528.0,551.0,0.0,42.7
TCGA-BK-A13C-11,409.0,310.0,1060.0,1200.0,1190.0,801.0,461.0,391.0,304.0,863.0,2570.0,330.0,1420.0,19900.0,4660.0,4770.0,347.0,587.0,5.77,1840.0
TCGA-EB-A6L9-06,394.0,1200.0,614.0,722.0,244.0,676.0,1660.0,1250.0,393.0,693.0,1580.0,1570.0,2910.0,45900.0,18100.0,1650.0,110.0,899.0,0.945,131.0


In [30]:
def save_numpy_to_el(adj, nodelist, filename):
    G = nx.from_numpy_matrix(adj)
    G = nx.relabel_nodes(G, {ix: n for ix, n in enumerate(nodelist)})
    nx.write_weighted_edgelist(G, filename, delimiter='\t')
    
if not os.path.exists(cfg.networks_dir):
    os.makedirs(cfg.networks_dir)
    
tcga_subset.to_csv(os.path.join(cfg.data_dir, 'tcga_train_sim_subset.tsv'), float_format='%.4f', sep='\t')
save_numpy_to_el(adj_uw, nodelist, os.path.join(cfg.networks_dir, 'tcga_top_genes_uw.tsv'))
save_numpy_to_el(adj_w, nodelist, os.path.join(cfg.networks_dir, 'tcga_top_genes_w.tsv'))
save_numpy_to_el(adj_random, nodelist, os.path.join(cfg.networks_dir, 'tcga_top_genes_random.tsv'))
np.savetxt(os.path.join(cfg.networks_dir, 'tcga_top_genes_nodelist.tsv'), nodelist, fmt='%s', delimiter='\t')