In [1]:
from pyNBS import data_import_tools as dit
from pyNBS import network_propagation as prop
from pyNBS import pyNBS_core as core
from pyNBS import pyNBS_single
from pyNBS import consensus_clustering as cc
from pyNBS import pyNBS_plotting as plot
from sklearn.metrics.cluster import adjusted_mutual_info_score, adjusted_rand_score
import time
import networkx as nx
import pandas as pd
import numpy as np
from IPython.display import Image

# Load Data

#### Load non-default parameters
Notes about the parameter file:
 - If no parameter path is given, default parameters will be set instead (see documentation for details and default values). 
 - The parameter file is a 2-column comma-separated text file where the first column is the parameter name, and the second column is the parameter value. The delimiter for this file must be a comma. 
 - Blank lines and lines starting with "#" will be ignored.
 - The parameter file may include as many or as few of the parameters from the pyNBS overall parameter space. For examples of two parameter files see: ```./OV_run_pyNBS_Hofree_params.csv``` VS ```./run_pyNBS_default_params.csv```
 
 
An excerpt of the default parameters file is given below:  
```
################################  
#   Overall pyNBS parameters   #  
################################  
verbose,True  
outdir,./Results/  
  
###############################  
#   Data Loading Parameters   #  
###############################  
net_filedelim,"	"  
mut_filetype,matrix  
mut_filedelim,","  
degree_preserved_shuffle,False  
node_label_shuffle,False  
```

In [2]:
# Read the run parameters from the OV/Hofree parameter file. Any parameter not
# listed in that file falls back to its pyNBS default (see the notes above).
pyNBS_params = dit.load_params(
    params_file='./OV_run_pyNBS_Hofree_params.csv'
)

#### Load molecular network
The network file is a 2-column text file representing an unweighted network. Each row represents a single edge in the molecular network.    
  
Notes about the network file:  
 - The default column delimiter is a tab character '\t' but a different delimiter can be defined by the user here or in the parameter file with the "net_filedelim" parameter.
 - The network must not contain duplicate edges (e.g. TP53\tMDM2 is equivalent to MDM2\tTP53)
 - The network must not contain self-edges (e.g. TP53\tTP53)
 - Only the first two columns of a network file are read as edges for the network, all other columns will be ignored.
 - The load_network function also includes options to read in edge- or label-shuffled versions of the network, but by default, these options are turned off.
 
An excerpt of the first five rows of the PID network file is given below:  
```
A1BG	A2M
A1BG	AKT1
A1BG	GRB2
A1BG	PIK3CA
A1BG	PIK3R1
```

In [3]:
# Only the network file path is required for this step. It is taken from the
# loaded parameters here, but it can equally be assigned explicitly in this cell.
network_filepath = pyNBS_params['network_file']
network = dit.load_network_file(
    network_filepath,
    verbose=pyNBS_params['verbose']
)

Network File Loaded: ./Example_Notebook_Data/Network_Files/HM90.sif


#### Load binary somatic mutation data
The binary somatic mutation data file can be represented in two file formats:  
The ```matrix``` binary somatic mutation data format. This file format is a binary csv or tsv matrix in which rows represent samples/patients and columns represent genes.  The following table is a small excerpt of a matrix somatic mutation data file:  

||A1CF|A2BP1|A2M|
|-|-|-|-|
|TCGA-04-1638|0|0|1|
|TCGA-23-1029|1|0|0|
|TCGA-23-2647|0|1|0|
|TCGA-24-1847|0|0|1|
|TCGA-42-2589|1|0|0|

The ```list``` binary somatic mutation data format. This file format is a 2-column csv or tsv list where the 1st column is a sample/patient and the 2nd column is a gene mutated in the sample/patient. There are no headers in this file format. Loading data with the list format is typically faster than loading data from the matrix format. The following text is the list representation of the matrix above.
```
TCGA-04-1638	A2M
TCGA-23-1029	A1CF
TCGA-23-2647	A2BP1
TCGA-24-1847	A2M
TCGA-42-2589	A1CF
```

In [5]:
# Only the somatic mutation data file path is required for this step. It is
# taken from the loaded parameters here, but it can equally be assigned
# explicitly in this cell. File type and delimiter also come from the parameters.
sm_data_filepath = pyNBS_params['sm_data_file']
sm_mat = dit.load_binary_mutation_data(
    sm_data_filepath,
    filetype=pyNBS_params['mut_filetype'],
    delimiter=pyNBS_params['mut_filedelim'],
    verbose=pyNBS_params['verbose']
)

Binary Mutation Matrix Loaded: ./Example_Notebook_Data/Mutation_Files/OV_sm_mat_Hofree.csv
