# Module 3


**Function:** 

a) Distributes clusters identified in module 2_2 into individual structures

b) Identifies clusters in individual structures.

**Flow:** Using prime products, identifies the structures in which each cluster occurs and saves
them in the distribution matrix. Then, using the Leiden algorithm, identifies clusters in individual structures

**Input:** population of structures (filename format: cf_XXXXXX.coords.csv, where XXXXXX is the structure id zero-padded to six digits), list of primes, prime products, list of clusters

**Output:** distribution matrix, list of clusters in individual structures

**Usage:** Provide path to csv file with variables and run the notebook


<img src="3.png" alt="drawing"  width="750" align="center"/>


## path to parameters

In [None]:
### ENTER PATH TO CSV FILE WITH PARAMETERS ###
# Path to the parameters CSV; it is opened and parsed with csv.reader further
# down, and each row's second column is read as a setup value.
path_to_parameters = ''

## libraries

In [None]:
import numpy as np
import umap
import umap.umap_ as umap

import cdlib
from cdlib import algorithms
import networkx as nx


import warnings
warnings.filterwarnings('ignore')

import time
import seaborn as sns;
import matplotlib.pyplot as plt


import ray
from scipy.sparse import csr_matrix
import csv

import pickle

import scipy.spatial
from scipy import sparse
import itertools
import os

## functions

In [None]:
def read_from_pickle(path):
    """
    loads and returns the object pickled in the file at `path`

    path: location of the pickle file (opened in binary read mode)
    returns: the unpickled object
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on
    # files produced by trusted earlier modules of this pipeline.
    # the context manager closes the file even if unpickling raises
    with open(path, "rb") as file_to_read:
        return pickle.load(file_to_read)

In [None]:
def get_indicies_for_cluster(cluster,product,primes):
    """
    returns positions in `product` that are divisible by the product of the
    primes assigned to the beads of `cluster`

    cluster: 1-based bead numbers
    product: array of primes products, one entry per structure
    primes: array of primes, indexed by 0-based bead index
    """
    # bead numbers are 1-based, the primes array is 0-based
    zero_based = [bead - 1 for bead in cluster]
    # a single number that encodes the whole cluster
    divisor = np.prod(primes[zero_based])
    # an entry divisible by the divisor contains every prime of the cluster
    hits = np.where(product % divisor == 0)
    return hits[0]

In [None]:
def get_distributions_for_population(clusters_list,product_full,primes):
    """
    builds the distribution matrix: entry [structure, bead] is set to 1 when
    the bead belongs to a cluster found in that structure

    clusters_list: entries where [0] is a 1-based bead number and [1] is the
                   list of clusters (lists of 1-based bead numbers) for it
    product_full: primes products, indexed [structure, bead]
    primes: array of primes passed on to get_indicies_for_cluster

    the matrix shape comes from the module-level number_of_structures and
    number_of_beads_per_structure
    """
    shape = (number_of_structures, number_of_beads_per_structure)
    membership = np.zeros(shape)
    for entry in clusters_list:
        bead_clusters = entry[1]
        if len(bead_clusters) == 0:
            continue
        # column of products belonging to the bead (1-based -> 0-based)
        bead_products = product_full[:, entry[0] - 1]
        for members in bead_clusters:
            rows = get_indicies_for_cluster(members, bead_products, primes)
            columns = [b - 1 for b in members]
            for row in rows:
                # flag every bead of the cluster in this structure
                membership[row, columns] = 1

    return membership

## load parameters, build folders

In [None]:
# load parameters from csv file

# read every row of the parameters file; the with-block closes the file
with open(path_to_parameters, 'rt') as csvfile:
    rows = list(csv.reader(csvfile, skipinitialspace=True))

# one-element list wrapping all parsed rows
paramaters = [rows]

# list with setup parameters (one entry per csv row)
params = paramaters[0]

In [None]:
# assign setup variables from params
# every value is taken from the second column of its row; row 5 is not read
# here, it is read as r_factor or k (depending on a_type) when the analysis
# name is composed

home = params[0][1]
number_of_structures = int(params[1][1])
number_of_beads_per_structure = int(params[2][1])
fraction = float(params[3][1])
structures_fraction = number_of_structures * fraction
cores = int(params[4][1])
dataset_name =  params[6][1]
dataset_folder =  params[7][1]
a_type = params[8][1]
chromosomal_borders_file = params[9][1]
primes_file = params[10][1]
# assign cutoff only: the former chained `cutoff = fraction = ...` overwrote
# fraction (row 3) with the cutoff value (row 11)
cutoff = float(params[11][1])

In [None]:
# compose analysis name: dataset name plus a suffix for the analysis type
# (row 5 of params holds the radius factor or the neighbour count)

if a_type == 'fixed':
    r_factor = float(params[5][1])
    analysis_name = f"{dataset_name}_fixed_radius_{r_factor}"
elif a_type == 'neighbours':
    k = int(params[5][1])
    analysis_name = f"{dataset_name}_neighbours_{k}"

In [None]:
# handle dataset_name depending on analysis_type:
# append the analysis-type suffix to dataset_name in place

if a_type == 'fixed':
    r_factor = float(params[5][1])
    dataset_name += '_fixed_radius_' + str(r_factor)
elif a_type == 'neighbours':
    k = int(params[5][1])
    dataset_name += '_neighbours_' + str(k)

In [None]:
# print setup variables for manual inspection
print()
print("Running cluster detection in structures")
print()

print(f"dataset name: {dataset_name}")

print("loaded setup variables")
print()
print(f"home folder: {home}")
print(f"dataset folder: {dataset_folder}")
print(f"dataset name: {dataset_name}")
print(f"number of structures: {number_of_structures}")
print(f"number of beads per structure: {number_of_beads_per_structure}")
print(f"fraction: {fraction}")

# only the parameter matching the analysis type exists
if a_type == "fixed":
    print(f"radius factor: {r_factor}")
if a_type == 'neighbours':
    print(f"k: {k}")

print(f"cores: {cores}")
print()
print(f"cutoff: {cutoff}")



In [None]:
# PATHS

helper_folder = os.path.join(home,'helper_data')

# primes array lives in the helper folder
# NOTE(review): allow_pickle=True can unpickle arbitrary objects; only load trusted files
primes_path = os.path.join(helper_folder, primes_file)
primes_array = np.load(primes_path, allow_pickle=True)


## distribute clustered beads to structures

In [None]:
# LOAD clusters (pickled list stored in the results folder of this analysis)

results_folder = os.path.join(home, 'runs', analysis_name, 'results')
clusters_path = os.path.join(results_folder, analysis_name + '_clusters_simple_filtered')
clusters_list = read_from_pickle(clusters_path)

In [None]:
# load product (primes products array from the intermediate folder)

products_folder = os.path.join(home, 'runs', analysis_name, 'intermediate', 'products')
product_full_path = os.path.join(products_folder, 'product_full.npy')
product_full = np.load(product_full_path, allow_pickle=True)

In [None]:
# identify cluster: build the distribution matrix and report elapsed seconds

start_distributing = time.time()

distro = get_distributions_for_population(
    clusters_list,
    product_full,
    primes_array,
)

elapsed_distributing = time.time() - start_distributing
print(str(elapsed_distributing))



In [None]:
# histogram of the row sums of the distribution matrix, saved to the figures folder
fig, ax = plt.subplots(figsize=(5, 5))
row_sums = distro.sum(axis=1)
ax.hist(row_sums)
figures_folder = os.path.join(home, 'runs', analysis_name, 'figures')
fig.savefig(os.path.join(figures_folder, analysis_name + '_distro_hist.png'))

In [None]:
# store the distribution matrix in the intermediate folder
intermediate_folder = os.path.join(home, 'runs', analysis_name, 'intermediate')
distro_full_path = os.path.join(intermediate_folder, analysis_name + '_distro_matrix')
np.save(distro_full_path, distro)

## clustered beads quantities

## identify clusters in structures

In [None]:
# helper functions

def chunker(seq, size):
    """
    returns a generator over consecutive slices of `seq`, each of length
    `size` (the last one may be shorter)
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [None]:
@ray.remote
def build_communities(num,clusters_array,cutoff):
    """
    finds communities among the clustered beads of a single structure

    num: structure id; the coordinates file is cf_<num zero-padded to six digits>.coords.csv
         inside the module-level dataset_folder
    clusters_array: per-bead flags, beads with value 1 are treated as clustered
    cutoff: beads whose pairwise distance is <= cutoff are connected by an edge

    returns: (num, list of arrays with the bead indices of each community);
             the list is empty when no bead is flagged
    """
    coords_path = os.path.join(dataset_folder,'cf_' + str(num).zfill(6) + '.coords.csv')
    # local name chosen so it does not shadow the imported csv module
    coords_table = np.genfromtxt(coords_path,delimiter=',')
    # only the first three columns are used as coordinates
    coordinates = coords_table[:,:3]
    clustered = np.where(clusters_array == 1)[0]
    if len(clustered) == 0:
        return num, []
    coords_for_communities = coordinates[clustered]
    distances = scipy.spatial.distance.pdist(coords_for_communities)
    distances_matrix = scipy.spatial.distance.squareform(distances)
    # adjacency: 1 where distance <= cutoff (the zero diagonal yields
    # self-loops for cutoff >= 0, exactly as the element-wise loop did)
    adj_matrix = (distances_matrix <= cutoff).astype(float)
    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the
    # replacement and builds the same weighted graph
    g = nx.from_numpy_array(adj_matrix)
    weights = [d['weight'] for (u, v, d) in g.edges(data=True)] # get list of weights from edges
    # find communities with the Leiden algorithm
    # NOTE(review): check if the algorithm is stochastic; in that case set the rnd generator
    leiden_coms = algorithms.leiden(g,weights=weights)
    # communities hold graph node numbers; map them back to bead indices
    communities_in_str = [clustered[community] for community in leiden_coms.communities]
    return num,communities_in_str

In [None]:
# load list (pickled list stored next to the other intermediate results)
intermediate_dir = os.path.join(home, 'runs', analysis_name, 'intermediate')
list_path = os.path.join(intermediate_dir, analysis_name + '_sparse_neighbour_matrices_list')
list_of_str = read_from_pickle(list_path)

In [None]:
# change list_of_str to lighter: keep only the second element of every entry
list_str = [entry[1] for entry in list_of_str]

In [None]:
# loop through matrix and structures:
# run build_communities for every structure with ray, `cores` structures
# per batch, and re-write the accumulated results to disk after each batch

# begin multiprocessing

ray.init()

start = time.time()

results = []

path_to_store = os.path.join(home,'runs',analysis_name,'intermediate',analysis_name + '_clusters_in_structures')

# process structure by structure, in chunks of `cores`

for chunk in chunker(range(number_of_structures),cores):
    communities = [build_communities.remote(list_str[i],distro[i],cutoff) for i in chunk]
    # blocks until every task of the chunk has finished
    partial_results = ray.get(communities)
    results.append(partial_results)

    # checkpoint everything collected so far; the context manager closes
    # the file even if pickling fails
    with open(path_to_store, "wb") as file_to_store:
        pickle.dump(results, file_to_store)


print(time.time() - start)




In [None]:
# collect clusters in one list: flatten the per-chunk results, then record
# the size of every community of every structure
chained_results = list(itertools.chain.from_iterable(results))
cluster_sizes = [
    len(community)
    for structure_result in chained_results
    for community in structure_result[1]
]


In [None]:
# histogram (20 bins) of the community sizes collected in cluster_sizes
plt.hist(cluster_sizes,bins=20)
