# Module 1- **in silico Hi-C**

**Function:** generation of in silico Hi-C matrix using the population of structures. 

**Data Flow:** Each structure is converted to a graph representation. Each graph is then divided into communities using the Leiden algorithm. Finally, for each pair of beads, it counts in how many structures both beads are members of the same community, and these counts form the in silico Hi-C matrix.

**Input:** population of structures (filename format: cf_XXXXXX.coords.csv, where XXXXXX is the structure id zero-padded to 6 digits)

**Output:** in silico HiC matrix (npz format), list of communities and their members

**Usage:** Provide path to csv file with variables and run the notebook

<div>
<img src="module_1_dataflow_jup.png" alt="drawing"  align="center" width ="250"/>
</div>




# path to csv with parameters

In [None]:
### ENTER PATH TO CSV FILE WITH PARAMETERS ###
# The file is parsed with csv.reader further down; the second column of its
# rows 0-6 is used, in order, as: home directory, number of structures,
# number of beads per structure, cores, k, dataset name, dataset folder.
path_to_parameters = ''

# libraries

In [None]:
# import libraries

import numpy as np
import umap
import umap.umap_ as umap

import cdlib
from cdlib import algorithms
import networkx as nx
from  scipy import sparse

import warnings
warnings.filterwarnings('ignore')

import time
import seaborn as sns;
import matplotlib.pyplot as plt

import os
import ray
from scipy.sparse import csr_matrix
import csv
from networkx import Graph
import re
import pickle

# functions

In [None]:
# divides a sequence type into chunks

def chunker(seq, size):
    """
    Split a sequence into consecutive chunks.

    input: seq - anything supporting len() and slicing (eg. list, range)
    output: generator yielding slices of seq; every slice holds `size`
            items except possibly the last one, which may be shorter
    parameters: size: determines the length of the chunk
    """
    # start offsets are worked out immediately, the slices themselves lazily
    offsets = range(0, len(seq), size)

    def _pieces():
        for start in offsets:
            yield seq[start:start + size]

    return _pieces()

In [None]:
# helper decorator function for benchmarking

def my_timer(func):
    """
    Decorator that prints how long each call of `func` took.

    input: function to be benchmarked
    output: wrapped function; it returns whatever `func` returns and, after
            every call, prints '<function name> took <seconds>s'
    """
    # local import so this cell does not depend on the imports cell
    import functools

    # functools.wraps keeps __name__ / __doc__ of the benchmarked function,
    # so the decorated function still identifies itself correctly
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, unlike the wall clock from time.time()
        t_start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - t_start
        print('{} took {}s'.format(func.__name__, elapsed))

        return result

    return wrapper

In [None]:
def build_graph(XYZ,k):
    """
    Build the UMAP fuzzy graph representation of one structure.

    Input: XYZ - an array of (x,y,z) coordinates
    Output: the result of umap's fuzzy_simplicial_set, returned unchanged;
    the caller in this notebook takes element [0] of it as the weighted
    adjacency matrix of the graph
    Parameters: k is handed over as n_neighbors, the most important parameter
    of fuzzy_simplicial_set. It will determine how sparse the final graph
    will be and has to be fine-tuned.
    """
    # a RandomState with the fixed seed 42 is passed as random_state
    rng = np.random.RandomState(seed=42)

    # every remaining option of the call, collected in one place
    settings = dict(
        n_neighbors=k,
        random_state=rng,
        metric='l2',
        metric_kwds={},
        knn_indices=None,
        knn_dists=None,
        angular=False,
        set_op_mix_ratio=1.0,
        local_connectivity=2.0,
        verbose=False,
    )

    return umap.fuzzy_simplicial_set(XYZ, **settings)

In [None]:
def build_communities(adj):
    """
    Detect communities in a weighted graph with the Leiden algorithm,
    as implemented in the cdlib library.

    Input: adj - the weighted graph adjacency matrix (scipy sparse)
    Output: a list of communities, each one represented as a list of nodes
    """
    # networkx 3.0 removed from_scipy_sparse_matrix in favour of
    # from_scipy_sparse_array; use whichever this installation provides
    from_sparse = getattr(nx, 'from_scipy_sparse_matrix', None)
    if from_sparse is None:
        from_sparse = nx.from_scipy_sparse_array

    # generate a graph networkx obj; matrix entries become 'weight' attributes
    g = from_sparse(adj)
    # edge weights, in the order in which the graph reports its edges
    weights = [d['weight'] for (_, _, d) in g.edges(data=True)]
    # find communities using Leiden alg
    leiden_coms = algorithms.leiden(g, weights=weights)
    # a list of lists of nodes
    return leiden_coms.communities

In [None]:
# parallel graph aggregator

@ray.remote
def aggregate_graphs(graph1,graph2):
    """
    input: two objects that can be added with `+` (in this notebook: the
    per-structure or per-chunk matrices)
    output: graph1 + graph2
    Declared as a ray remote function, so it is invoked with .remote().
    """
    combined = graph1 + graph2
    return combined
    

In [None]:
# structure -> graph -> communities

@ray.remote
def read_and_prepare_graph_and_communities(folder,num,k,number_of_beads_per_structure):
    '''
    structure -> graph -> communities -> same-community matrix

    input:
        folder - directory with the structure files
        num - id of the structure; the file that gets read is
              cf_<num zero-padded to 6 characters>.coords.csv
    ouput: tuple (matrix, communities, num)
        matrix - sparse adjacency matrix, over nodes
                 0 .. number_of_beads_per_structure-1, of a graph in which
                 the members of every community are all connected to each other
        communities - list of communities as returned by build_communities
        num - the structure id, passed through unchanged
    parameters:
        k - is the most important parameter in the umap fuzzy_simplicial_set
            function. It will determine how sparse the final graph will be.
        number_of_beads_per_structure - number of rows/columns of the matrix
    '''
    # compose the file name from the structure id
    filename = 'cf_' + str(num).zfill(6) + '.coords.csv'
    path = os.path.join(folder,filename)

    # read csv into np.array
    coordinates = np.genfromtxt(path, delimiter= ',')
    # get columns for x,y,z coordinates
    coordinates_xyz = coordinates[:,:3]
    # build a graph from x,y,z
    graph = build_graph(coordinates_xyz,k)
    # detect communities; element [0] of build_graph's result is the adjacency matrix
    communities = build_communities(graph[0])

    # one complete graph per community, merged into a single graph; starting
    # from an empty graph removes the special case for the first community
    # and keeps community_graph defined even when no community was found
    community_graph = nx.Graph()
    for members in communities:
        community_graph.update(nx.complete_graph(members))

    # networkx 3.0 removed to_scipy_sparse_matrix in favour of
    # to_scipy_sparse_array; use whichever this installation provides
    to_sparse = getattr(nx, 'to_scipy_sparse_matrix', None)
    if to_sparse is None:
        to_sparse = nx.to_scipy_sparse_array

    # nodelist fixes the row/column order, so matrices of different
    # structures can be added element-wise
    matrix = to_sparse(community_graph,nodelist=range(number_of_beads_per_structure))

    # once done - return matrix, list of communities and id of processed structure
    return matrix , communities , num




In [None]:
# parallelizing the whole process

@my_timer
def process_csvs(folder,cores,k,number_of_beads_per_structure):
    '''
    input: folder with csvs
    ouput: tuple (results, communities dictionary)
        results - list holding a single ray object reference to the in silico
                  HiC matrix (sum of the matrices of all structures), to be
                  resolved by the caller with ray.get(results[0]); the list
                  is empty when there was no structure to process
        communities dictionary - structure id : list of communities

    parameters:

    k - is the most important parameter in the umap fuzzy_simplicial_set function.
    It will determine how sparse the final graph will be.
    cores - number of cores available; used as the number of structures
    submitted to ray at once
    number_of_beads_per_structure

    NOTE: the number of structures is not a parameter; it is read from the
    module-level variable number_of_structures, which has to be set before
    this function is called. Structures 1..number_of_structures are processed.
    '''

    # running sum of the matrices of all structures processed so far
    total = None

    #counter to follow progres
    counter = 0

    #dictionary for str:list of communities
    communities_ditc = {}

    #process multi
    for chunk in chunker(range(1,number_of_structures+1),cores):

        #initiate separate processes for each file in chunk
        ids = [read_and_prepare_graph_and_communities.remote(folder,num,k,number_of_beads_per_structure) for num in chunk]

        # ray.get blocks until every task of the chunk is done, so no
        # separate ray.wait is needed
        for matrix, communities, structure_id in ray.get(ids):
            # add entry structure_id : communities to communities dictionary
            communities_ditc[structure_id] = communities
            # add the matrix to the running sum right away, so only one
            # aggregated matrix is kept; this also works for a chunk that
            # holds a single structure, where the previous pairwise remote
            # aggregation ended up calling ray.get on a plain matrix
            total = matrix if total is None else total + matrix

        #track progress; count the structures actually processed, the last
        #chunk can be shorter than `cores`
        counter += len(chunk)
        if counter%1000 == 0:
            print(counter)

    # callers resolve the first element with ray.get, so the aggregated
    # matrix is always handed back as a ray object reference
    results = [] if total is None else [ray.put(total)]

    return results,communities_ditc
    

## loading parameters, building folders

In [None]:
# load parameters from csv file

# parse csv file with parameters: every row of the file becomes a list of
# strings. newline='' is the way the csv module expects its files to be
# opened, and the with-block closes the file, so no explicit close() is needed
with open(path_to_parameters, 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile, skipinitialspace=True)
    #list with setup parameters (one entry per row of the file)
    params = list(reader)

# the rows used to be wrapped in this one-element list; it is kept so that
# anything still referring to it keeps working
paramaters = [params]

In [None]:
#assign setup variables from params
# every value is taken from the second column (index 1) of a fixed row of the
# parameters file; values are strings unless converted with int()

home = params[0][1]  # row 0: home directory, the run folders are created below <home>/runs
number_of_structures = int(params[1][1])  # row 1: structures 1..number_of_structures get processed
number_of_beads_per_structure = int(params[2][1])  # row 2: size of the per-structure matrices
cores = int(params[3][1])  # row 3: used as the number of structures submitted to ray at once
k = int(params[4][1])  # row 4: n_neighbors for the umap graph
dataset_name =  params[5][1]  # row 5: used in the analysis name and the figure title
dataset_folder =  params[6][1]  # row 6: folder holding the cf_XXXXXX.coords.csv files


In [None]:
# analysis name = <dataset name>_inSilico_<k>; it labels the run folder and
# every output file

analysis_name = ''.join([dataset_name, '_inSilico_', str(k)])

In [None]:
# report the loaded parameters, one per line

print('\n'.join([
    'Analysis name: ' + str(analysis_name),
    'Home directory : ' + str(home),
    'Dataset name :' + dataset_name,
    'Dataset directory: ' + dataset_folder,
    'number of structures: ' + str(number_of_structures),
    'number of beads per structure: ' + str(number_of_beads_per_structure),
    'cores: ' + str(cores),
    'k for graph: ' + str(k),
]))



In [None]:
# build folders structure:
#   <home>/runs/<analysis name>/results
#   <home>/runs/<analysis name>/figures

run_folder = os.path.join(home,'runs',analysis_name)
results_folder = os.path.join(run_folder,'results')
figures_folder = os.path.join(run_folder,'figures')

print("building folders structure for the run")


folders = [run_folder,results_folder,figures_folder]

for folder in folders:
    try:
        # makedirs also creates missing parent folders (e.g. <home>/runs on
        # the very first run), where plain os.mkdir fails with FileNotFoundError
        os.makedirs(folder)
        print("Directory " , folder ,  " Created ")
    except FileExistsError:
        # the folder is already there (e.g. the notebook was re-run): reuse it
        print("Directory " , folder ,  " already exists")

print("")

## in silico HiC matrix preparation

In [None]:
# initialize ray
# ray.init is called without arguments, so ray's default settings apply;
# the @ray.remote functions defined above are run through ray

ray.init()

In [None]:
# run algorithm on chosen dataset

# process all structures: process_csvs returns the list with the aggregated
# matrix and the dictionary structure id -> communities

inSilicoHiC,communities_dict = process_csvs(dataset_folder,cores,k,number_of_beads_per_structure)

# get results: the first element of the returned list is resolved with ray.get
# NOTE(review): this presumes that element is a ray object reference -
# confirm this also holds for runs that fit into a single chunk

out = ray.get(inSilicoHiC[0])


## saving files

In [None]:
# plot the in silico HiC matrix as a heatmap

# dense version of the hic matrix, as input for the heatmap
PA_dense = out.todense()
# seaborn default theme
sns.set_theme()
# one figure with one axes; figsize in inches
fig, ax = plt.subplots(figsize=(20,15))
# draw the heatmap on the axes created above
ax = sns.heatmap(PA_dense, ax=ax)
# figure title names the dataset and the k used for the graphs
title = ''.join(["in silico HiC for ", dataset_name, " k=", str(k)])
fig.suptitle(title)
# store the figure in the figures folder of the run
fig.savefig(os.path.join(figures_folder,'hic_' + analysis_name))

In [None]:
# save communities

# the dictionary structure id -> communities is pickled into the results
# folder; the with-block closes the file even if pickle.dump raises
with open(os.path.join(results_folder,'communities_' + analysis_name),'wb') as file_to_store_communities:
    pickle.dump(communities_dict,file_to_store_communities)


In [None]:
# store the in silico HiC matrix in scipy's sparse npz format

# target file inside the results folder of the run
file_to_store__HIC = os.path.join(results_folder,'HIC_matrix_' + analysis_name)

# write the sparse matrix
sparse.save_npz(file=file_to_store__HIC, matrix=out)
