In [1]:
from pyNBS import data_import_tools as dit
from pyNBS import network_propagation as prop
from pyNBS import pyNBS_core as core
from pyNBS import pyNBS_single
from pyNBS import consensus_clustering as cc
from pyNBS import pyNBS_plotting as plot
import time
import pandas as pd
import numpy as np

### Running Parameters

|Param|Help|
|:------|:------|
|verbose|Verbosity flag for reporting on patient similarity network construction steps.|
|sm_data_file|Path to binary mutation matrix file. May be a csv or 2-column list where each line is a sample and the gene mutated separated by a common delimiter.|
|network_path|Path to molecular network file. File must be table where each line is a gene interaction separated by a common delimiter and the first 2 columns represent interacting proteins.|
|mut_filetype|File structure of binary mutation data. 2 options: "matrix" (e.g. csv or tsv) or "list" (2-column list). Typically reading a "list" is faster.|
|mut_filedelim|Delimiter used in binary mutation file. Default is tab white space.|
|net_filedelim|Delimiter used in network file between columns. Default is tab white space.|
|degree_preserved_shuffle|Determination of whether or not to shuffle the network edges (while preserving node degree) when loading network.|
|node_label_shuffle|Determination of whether or not to shuffle the network node labels (while preserving network topology) when loading network.|
|regularize_network|Determination of whether or not to calculate influence matrix regularization network for regularized NMF step.|
|reg_net_gamma|Value of adjustment on propagation network graph laplacian to calculate influence matrix for (via Vandin 2011).|
|k_nearest_neighbors|Number of nearest neighbors to add to the regularization network during construction.|
|save_knn_glap|File path of where to save graph laplacian for k-nearest-neighbor network constructed from propagation network influence matrix. No path given as default, automatically saves pandas hdf file if file path given.|
|regularization_network_graph_laplacian_file|Path to regularization network graph laplacian matrix if previously calculated. Required if 'regularize_network' is False.|
|niter|Number of iterations to perform sub-sampling and network-regularized NMF before consensus clustering.|
|propagate_data|Determination of whether or not to propagate sub-sampled binary mutation data over given molecular network.|
|calculate_propagation_kernel|Determination of whether or not to pre-calculate network kernel for network propagation. Highly recommended if no network kernel file is given already and niter > 10.|
|propagation_kernel_file|Path to pre-calculated propagation kernel of network. This will save time in the propagation step.|
|save_H|File path of where to save decomposed patient profiles. No path given as default, automatically saves csv file if file path given.|
|consensus_cluster|Determination of whether or not to perform consensus clustering on decompositions of patient profiles.|
|assign_clusters|Determination of whether or not to assign numerical clusters to patients based on consensus clustering of patient profiles.|
|save_co_cluster_matrix|File path of where to save patient co-clustering matrix. No path given as default, automatically saves csv file if file path given.|
|save_cluster_assignments|File path of where to save patient cluster assignments. No path given as default, automatically saves csv file if file path given.|
|plot_co_cluster_map|Determination of whether or not to plot the co-clustering matrix. Requires consensus clustering and cluster assignments.|
|plot_title|Title of co-clustering matrix map if desired.|
|save_co_cluster_map|File path of where to save co-clustering matrix plot. No path given as default, automatically saves pdf file if file path given.|




In [59]:
run_pyNBS_params = {'verbose' : False,
                    ##### Path for Ovarian #####
                    'sm_data_file' : '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/TCGA_sm_data/processed/OV_sm_data_filt.txt',
                    'network_path' : '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/NBS_v0.2.0_Matlab_data/HM90.sif',
                    
                    ##### Path for Uterine #####
                    #'sm_data_file' : '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/TCGA_sm_data/processed/UCEC_sm_data.txt',
                    #'network_path' : '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/NBS_v0.2.0_Matlab_data/ST90.sif',
                    
                    'mut_filetype' : 'list',
                    'mut_filedelim' : '\t',
                    'net_filedelim' : '\t',
                    'degree_preserved_shuffle' : False,
                    'node_label_shuffle' : False,
                    'regularize_network' : True,
                    'reg_net_gamma' : 0.01,
                    'k_nearest_neighbors' : 11,
                    'save_knn_glap' : None,
                    'regularization_network_graph_laplacian_file' : None,
                    #'propagate_data' : True,
                    'niter' : 10, # 1000 default
                    'calculate_propagation_kernel' : False,
                    'propagation_kernel_file' : None,
                    'save_H' : None,
                    'consensus_cluster' : True,
                    'assign_clusters' : True,
                    'save_co_cluster_matrix' : None,
                    'save_cluster_assignments' : None,
                    'plot_co_cluster_map' : True,
                    'plot_title' : False,
                    'save_co_cluster_map' : None,
                    }

### NBS Options

|Option|Type|Default|Help|
|:------|:------|:------|:------|
|pats_subsample_p|float|0.8|Proportion of samples to sub-sample|
|gene_subsample_p|float|0.8|Proportion of mutated genes to sub-sample|
|min_muts|positive_int|10|Minimum number of mutations for a sample to contain after sub-sampling to be considered for further analysis|
|prop_data|bool|True|Determination of whether or not to propagate sub-sampled binary mutation data over given molecular network 
|prop_alpha  |restricted_float|0.7|Propagation constant to use in the propagation of mutations over molecular network. Range is 0.0-1.0 exclusive. |
|prop_symmetric_norm|bool|False|Network degree normalization method for random walk-propagation. |
|qnorm_data  |bool  |True |Determination of whether or not to qunatile normalize mutation profiles. |
|netNMF_k  |positive_int  |4 |Number of components to decompose patient mutation data into. Same as the number of clusters of patients to separate data into. |
|netNMF_gamma  |positive_int  |200 |Regularization constant to scale network regularization term in netNMF. |
|netNMF_update_gamma |bool |False |Determination of whether or not to constantly update regularization constant based on balance between reconstruction error and regularization term.|
|netNMF_gamma_factor |positive_int |1 |Scaling factor for regularization constant updates if 'netNMF_update_gamma' is True. |
|netNMF_niter  |positive_int  |250 |Maximum umber of multiplicative updates to perform within network-regularized NMF if result does not converge. |
|netNMF_eps  |float  |1e-15  |Epsilon error value to adjust 0 values during multiplicative matrix updates in netNMF |
|netNMF_err_tol  |float  |1e-4  |Minimum error tolerance for matrix reconstruction of original data for convergence. |
|netNMF_err_delta_tol  |float  |1e-4  |Minimum error tolerance for l2 norm of difference in matrix reconstructions between iterations of netNMF for convergence. |

In [60]:
NBS_options = {'pats_subsample_p' : 0.8, 
               'gene_subsample_p' : 0.8, 
               'min_muts' : 10,
               'prop_data' : True, 
               'prop_alpha' : 0.7, 
               'prop_symmetric_norm' : False, 
               'qnorm_data' : True,
               'netNMF_k' : 4, 
               'netNMF_gamma' : 200, 
               'netNMF_update_gamma' : False, 
               'netNMF_gamma_factor' : 1,
               'netNMF_niter' : 250, 
               'netNMF_eps' : 1e-15, 
               'netNMF_err_tol' : 1e-4, 
               'netNMF_err_delta_tol' : 1e-4}


### Load data

In [44]:
# Load somatic mutation data
sm_mat = dit.load_binary_mutation_data(run_pyNBS_params['sm_data_file'], filetype=run_pyNBS_params['mut_filetype'], delimiter=run_pyNBS_params['mut_filedelim'], verbose=run_pyNBS_params['verbose'])
# Load network
network = dit.load_network_file(run_pyNBS_params['network_path'], delimiter=run_pyNBS_params['net_filedelim'], degree_shuffle=run_pyNBS_params['degree_preserved_shuffle'], 
                                label_shuffle=run_pyNBS_params['node_label_shuffle'], verbose=run_pyNBS_params['verbose'])


### knnGlap

In [47]:
# Get knnGlap
if run_pyNBS_params['regularize_network']:
    knnGlap = core.network_inf_KNN_glap(network, gamma=run_pyNBS_params['reg_net_gamma'], kn=run_pyNBS_params['k_nearest_neighbors'], verbose=run_pyNBS_params['verbose'], save_path=run_pyNBS_params['save_knn_glap'])
else:
    # Load propatagion kernel
    if run_pyNBS_params['regularization_network_graph_laplacian_file'].endswith('.hdf'):
        knnGlap = pd.read_hdf(run_pyNBS_params['regularization_network_graph_laplacian_file'])
    else:
        knnGlap = pd.read_csv(run_pyNBS_params['regularization_network_graph_laplacian_file'])
    if run_pyNBS_params['verbose']:
        print 'Pre-calculated regularization network graph laplacian loaded'


### Network propagation kernel

In [50]:
# Get network propagation kernel
if run_pyNBS_params['propagation_kernel_file'] is not None:
    # Load propagation kernel
    if run_pyNBS_params['propagation_kernel_file'].endswith('.hdf'):
        kernel = pd.read_hdf(run_pyNBS_params['propagation_kernel_file'])
    else:
        kernel = pd.read_csv(run_pyNBS_params['propagation_kernel_file'])
    if run_pyNBS_params['verbose']:
        print 'Pre-calculated network kernel loaded'
else:
    if run_pyNBS_params['calculate_propagation_kernel']:
        # Calculate propagation kernel by propagating identity matrix of network
        network_nodes = network.nodes()
        network_I = pd.DataFrame(np.identity(len(network_nodes)), index=network_nodes, columns=network_nodes)
        kernel = prop.network_propagation(network, network_I, NBS_options['prop_alpha'], verbose=True)  
        if run_pyNBS_params['verbose']:
            print 'Network kernel calculated'
    else:
        kernel = None
        if run_pyNBS_params['verbose']:
            print 'No network kernel established'

### Sub-sampling and netNMF decomposition

In [62]:
Hlist = []
for i in range(run_pyNBS_params['niter']):
    netNMF_time = time.time()
    Hlist.append(pyNBS_single.NBS_single(sm_mat, NBS_options, propNet=network, propNet_kernel=kernel, regNet_glap=knnGlap, verbose=False, save_path=run_pyNBS_params['save_H']))
    if run_pyNBS_params['verbose']:
        print 'NBS iteration:', i+1, 'complete:', time.time()-netNMF_time, 'seconds'


ValueError: array must not contain infs or NaNs

### Consensus Clustering

In [64]:
if run_pyNBS_params['consensus_cluster']:
    NBS_cc_table, NBS_cc_linkage, NBS_cluster_assign = cc.consensus_hclust_hard(Hlist, NBS_options['netNMF_k'], assign_cluster=run_pyNBS_params['assign_clusters'])
    if run_pyNBS_params['verbose']:
        print 'Consensus Clustering complete'        
    if run_pyNBS_params['save_co_cluster_matrix'] is not None:
        NBS_cc_table.to_csv(run_pyNBS_params['save_co_cluster_matrix'])
        if run_pyNBS_params['verbose']:
            print 'Co-clustering matrix saved'
    if run_pyNBS_params['save_cluster_assignments'] is not None:
        NBS_cluster_assign.to_csv(run_pyNBS_params['save_cluster_assignments'])
        if run_pyNBS_params['verbose']:
            print 'Cluster assignments saved'


ValueError: The number of observations cannot be determined on an empty distance matrix.