# Module 2_2


**Function:** Using the binary matrix and prime products from module 2_1, returns a list of clusters per bead across the whole population.

**Flow:** For each bead, extract all pairs that are above the threshold; for each pair check all potential triplets, and analogously quadruplets.

**Input:** Binary matrix, products of primes, primes list, chromosomal borders.

**Output:** List of clusters (of 4) across the population, per bead



**Usage:** Provide path to csv file with variables and run the notebook

<img src="2_2_R_jup.png" alt="drawing"  width="750" align="left"/>






## path to parameters

In [None]:
### ENTER PATH TO CSV FILE WITH PARAMETERS ###
# Path to the CSV file with the run parameters; it is opened and parsed with
# csv.reader in the "loading parameters" section of this notebook.
path_to_parameters = ''

## libraries

In [None]:
import numpy as np
import scipy.spatial
import ray
import os
import time
import re
import pickle
from scipy import sparse
import time
import matplotlib.pyplot as plt
import sys
import csv
import itertools
from itertools import combinations

## functions

In [None]:
# helper functions

def chunker(seq, size):
    """Split ``seq`` into consecutive slices holding at most ``size`` items.

    A generator of slices is returned; the final slice is shorter when
    ``len(seq)`` is not a multiple of ``size``.
    """
    chunk_starts = range(0, len(seq), size)
    return (seq[first:first + size] for first in chunk_starts)

In [None]:
def get_beads_of_interest(bead_number,neigbours_array,borders_array,structures_fraction,number_of_beads_per_structure,primes_array):
    """
    Return the indices of beads that lie outside the border range of the given
    bead and whose value in the bead's row of ``neigbours_array`` is at least
    ``structures_fraction``.

    ``bead_number`` is a 1-based bead id. Row ``bead_number - 1`` of
    ``borders_array`` is read as (first bead id, last bead id) of the range the
    bead belongs to. The result holds 0-based indices: first those below the
    range, then those above it.

    ``number_of_beads_per_structure`` and ``primes_array`` are accepted but not
    used here.
    """
    row = bead_number - 1

    # borders are stored as bead ids; shift them by one to get 0-based indices
    limits = borders_array[row]
    lowest_own_index = limits[0] - 1
    highest_own_index = limits[1] - 1

    # columns of this bead's row that reach the threshold
    contact_indices = np.where(neigbours_array[row] >= structures_fraction)[0]

    # keep only contacts that fall outside the bead's own range
    below_range = contact_indices[contact_indices < lowest_own_index]
    above_range = contact_indices[contact_indices > highest_own_index]

    return np.append(below_range, above_range)


In [None]:
@ray.remote
def pick_fours(bead_number,neighbours_array,borders,product_folder,structures_fraction,number_of_beads_per_structure,primes_array):
    """
    Collect, for one bead, the bead triplets that pass the threshold together
    with it (the bead itself is the implicit fourth member).

    Parameters
    ----------
    bead_number : 1-based bead id; the product file ``prod_<bead_number>.npy``
        is loaded from ``product_folder``.
    neighbours_array : square array indexed [bead index, bead index]; values
        are compared against ``structures_fraction``.
    borders : passed through to ``get_beads_of_interest``.
    product_folder : folder holding the per-bead product files.
    structures_fraction : threshold used for every count/frequency test.
    number_of_beads_per_structure : number of bead indices to scan.
    primes_array : array of primes; index ``i`` belongs to bead ``i + 1``.

    Returns
    -------
    (bead_number, bead_triplets) where ``bead_triplets`` is a list of sorted
    tuples of 1-based bead ids. Duplicates are not removed here.

    Notes
    -----
    The loaded product array is assumed to be one-dimensional with one entry
    per structure (TODO confirm against module 2_1). A bead counts as present
    in a structure when that entry is divisible by the bead's prime.
    """

    # bead numbers         [1,2,3,4,5]
    # primes_array indexes [0,1,2,3,4]
    # primes               [2,3,5,7,11]

    # load the product for the given bead (it was saved under the bead id)
    # NOTE: allow_pickle=True unpickles on load - only read files produced by this pipeline.
    prod = np.load(product_folder + '/prod_' + str(bead_number) +'.npy',allow_pickle=True)

    bead_triplets = []
    bead_index = bead_number - 1

    # indices of candidate third beads: contacts outside the bead's own border range
    bead_x_indicies = get_beads_of_interest(bead_number,neighbours_array,borders,structures_fraction,number_of_beads_per_structure,primes_array)
    primes_inter = primes_array[bead_x_indicies]

    # every other bead whose value with this bead reaches the threshold
    bead_0_indicies_all = [i for i in range(number_of_beads_per_structure) if (neighbours_array[bead_index,i] >= structures_fraction and i != bead_index)]

    # pairs of those beads that also reach the threshold with each other
    candidate_pairs = [
        (pair[0], pair[1])
        for pair in combinations(bead_0_indicies_all, 2)
        if neighbours_array[pair[0], pair[1]] >= structures_fraction
    ]

    for first, second in candidate_pairs:
        pair_product = primes_array[first] * primes_array[second]

        # entries of prod divisible by both primes -> the pair is present there
        with_pair = np.where(np.mod(prod, pair_product) == 0)[0]
        if with_pair.shape[0] < structures_fraction:
            continue

        prod_subset = prod[with_pair]

        # one divisor per candidate third bead
        triplet_products = primes_inter * pair_product

        # broadcasting gives a (candidate, structure) table of remainders
        # without materialising a full matrix of repeated divisors
        remainders = np.mod(prod_subset, triplet_products[:, np.newaxis])

        # row index of every zero remainder == position of the candidate in bead_x_indicies
        candidate_hits = np.where(remainders == 0)[0]
        candidates, counts = np.unique(candidate_hits, return_counts=True)
        passing = candidates[counts >= structures_fraction]

        # primes_inter[k] is primes_array[bead_x_indicies[k]], so the bead index
        # can be read directly instead of searching primes_array for the prime
        # (which also relied on the deprecated int() of a 1-element array)
        for third in bead_x_indicies[passing]:
            third = int(third)
            if third != first and third != second:
                # + 1 converts indices back to bead ids
                bead_triplets.append(tuple(sorted((first + 1, second + 1, third + 1))))

    if len(bead_triplets) > 0:
        print(bead_number,len(bead_triplets))

    return (bead_number, bead_triplets)

In [None]:
def filter_clusters(cluster):
    """
    Remove duplicate triplets from the result of a single bead.

    ``cluster`` is indexed as (bead number, iterable of triplets). Two triplets
    are duplicates when they hold the same members regardless of order.

    Returns ``(bead number, unique_triplets)`` where every triplet is a sorted
    list and the order of first appearance is preserved.
    """
    triplets = cluster[1]

    # a set of tuples makes the membership test O(1) instead of scanning the
    # growing output list for every triplet
    seen = set()
    filtered_clusteres = []
    for triplet in triplets:
        key = tuple(sorted(triplet))
        if key not in seen:
            seen.add(key)
            filtered_clusteres.append(list(key))
    return cluster[0],filtered_clusteres

## loading parameters, building folders

In [None]:
# load parameters from csv file

# newline='' is how the csv module expects files for csv.reader to be opened;
# the with-block closes the file, so no explicit close() is needed.
with open(path_to_parameters, 'rt', newline='') as csvfile:
    params = list(csv.reader(csvfile, skipinitialspace=True))

# kept for backward compatibility: the rows used to be exposed wrapped in a
# one-element list under this name
paramaters = [params]

In [None]:
# Assign setup variables from params.
# Every value is taken from the second column of its row.

# folders, dataset identity and analysis type
home = params[0][1]
dataset_name = params[6][1]
dataset_folder = params[7][1]
a_type = params[8][1]

# file names of the helper data
chromosomal_borders_file = params[9][1]
primes_file = params[10][1]

# sizes, threshold and parallelism
number_of_structures = int(params[1][1])
number_of_beads_per_structure = int(params[2][1])
cores = int(params[4][1])
fraction = float(params[3][1])

# threshold expressed as a number of structures
structures_fraction = number_of_structures * fraction

In [None]:
# handle dataset_name depending on analysis_type

# params[5][1] is read as the radius factor for 'fixed' runs and as k for
# 'neighbours' runs
if a_type == 'fixed':
    r_factor  = float(params[5][1])
    analysis_name = dataset_name + '_fixed_radius_' + str(r_factor)
elif a_type == 'neighbours':
    k  = int(params[5][1])
    analysis_name = dataset_name + '_neighbours_' + str(k)
else:
    # fail here with a clear message instead of a NameError on analysis_name
    # in a later cell
    raise ValueError("unknown analysis type " + repr(a_type) + "; expected 'fixed' or 'neighbours'")

In [None]:
# Echo the loaded configuration so it can be checked by eye before the run.
print("")
print("Running population-wide cluster detection")
print("")
print("analysis name: " + analysis_name)

print("loaded setup variables")
print("")

# label / value pairs, printed one per line in this order
for _label, _value in (
    ("home folder: ", home),
    ("dataset folder: ", dataset_folder),
    ("dataset name: ", dataset_name),
    ("number of structures: ", number_of_structures),
    ("number of beads per structure: ", number_of_beads_per_structure),
    ("fraction: ", fraction),
):
    print(_label + str(_value))

# the analysis-specific parameter
if a_type == "fixed":
    print(f"radius factor: {r_factor}")
elif a_type == 'neighbours':
    print(f"k: {k}")

print("cores: " + str(cores))
print("")


In [None]:
# define regex pattern for csv files
# raw string with escaped dots, so that "." in ".coords.csv" only matches a
# literal dot instead of any character
pattern = r'^(.*)cf_(.*)\.coords\.csv$'


In [None]:
# every folder of this run lives under <home>/runs/<analysis_name>
run_folder = os.path.join(home, 'runs', analysis_name)

# direct sub-folders of the run folder
intermediate_files_folder, results_folder, figures_folder = (
    os.path.join(run_folder, subfolder)
    for subfolder in ('intermediate', 'results', 'figures')
)

# products are kept inside the intermediate folder
product_folder = os.path.join(intermediate_files_folder, "products")


In [None]:
# load stored input data

# arrays saved for this analysis in the intermediate folder
binary, frequencies = (
    np.load(os.path.join(intermediate_files_folder, analysis_name + suffix))
    for suffix in ('_binary.npy', '_frequencies.npy')
)

# helper data lives in <home>/helper_data
helper_folder = os.path.join(home, 'helper_data')

# primes (loaded with pickle support enabled)
primes_file_path = os.path.join(helper_folder, primes_file)
primes_array = np.load(primes_file_path, allow_pickle=True)

# chromosomal borders array
chromosomal_borders_path = os.path.join(helper_folder, chromosomal_borders_file)
chromosomal_borders = np.load(chromosomal_borders_path)

# sorted unique values found in the chromosomal borders array
borders_array = np.unique(chromosomal_borders)


## identify clusters

In [None]:
# begin multiprocessing

# Start Ray with its default settings (no arguments are passed). The
# pick_fours tasks submitted in the next cell run on this runtime.
ray.init()

In [None]:


start = time.time()

results = []

# NOTE(review): plain string concatenation inserts no path separator between
# the intermediate folder and the analysis name, so this file is written next
# to the intermediate folder rather than inside it. The path is left unchanged
# because other code may read this exact location - confirm before switching
# to os.path.join.
path_to_store = intermediate_files_folder + analysis_name +  "_final_clusters_running"

# Put the large read-only inputs into the Ray object store once. Passing the
# references lets every task share the stored copy instead of re-serialising
# the arrays for each of the remote calls; Ray resolves the references to the
# arrays before pick_fours runs.
frequencies_ref = ray.put(frequencies)
chromosomal_borders_ref = ray.put(chromosomal_borders)
primes_array_ref = ray.put(primes_array)

# process bead by bead, `cores` beads per batch

for chunk in chunker(range(number_of_beads_per_structure),cores):

    # bead ids are 1-based, hence bi + 1
    ids = [pick_fours.remote((bi+1),frequencies_ref,chromosomal_borders_ref,product_folder,structures_fraction,number_of_beads_per_structure,primes_array_ref) for bi in chunk]
    partial_results = ray.get(ids)
    results.append(partial_results)

    # checkpoint everything collected so far; the with-block closes the file
    # even when pickling fails
    with open(path_to_store, "wb") as file_to_store:
        pickle.dump(results, file_to_store)

print("cluster identification took " + str(time.time() - start) + " s")

In [None]:
# put all the lists together

import itertools

# results holds one list per processed chunk; flatten them in a single pass
# into one list of (bead number, triplets) entries. This also yields an empty
# list when no chunk was processed, instead of failing on results[0].
clusters_list = list(itertools.chain.from_iterable(results))

# drop duplicate triplets per bead
filtered_clusters = [filter_clusters(i) for i in clusters_list]



## prepare figures, save files

In [None]:
# save

# de-duplicated clusters, one (bead number, clusters) entry per bead
filtered_clusters_list_to_save_path = os.path.join(results_folder,analysis_name + '_clusters_simple_filtered')
with open(filtered_clusters_list_to_save_path, "wb") as file_to_store:
    pickle.dump(filtered_clusters, file_to_store)

# Unfiltered counterpart: flatten ALL chunks into one list of
# (bead number, clusters) entries. The previous chain(*results[0]) only looked
# at the first chunk and unpacked its tuples into a flat list of alternating
# bead numbers and cluster lists.
clusters_list_to_save = list(itertools.chain.from_iterable(results))
clusters_list_to_save_path = os.path.join(results_folder,analysis_name + '_clusters_simple')
with open(clusters_list_to_save_path, "wb") as file_to_store:
    pickle.dump(clusters_list_to_save, file_to_store)

In [None]:
# x/y data for the plot of the clusters distribution:
# bead number and number of clusters found for that bead
beads_numbers = [bead for bead, _ in clusters_list]
cluster_numbers = [len(found) for _, found in clusters_list]

In [None]:
# x/y data for the plot of the FILTERED clusters distribution:
# bead number and number of clusters left for that bead after filtering
beads_numbers_filtered = [bead for bead, _ in filtered_clusters]
cluster_numbers_filtered = [len(kept) for _, kept in filtered_clusters]

In [None]:
# scatter plots of clusters per bead: unfiltered on top, filtered below
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 5))
unfiltered_axis, filtered_axis = ax
unfiltered_axis.scatter(beads_numbers, cluster_numbers)
filtered_axis.scatter(beads_numbers_filtered, cluster_numbers_filtered)

# write the figure into the figures folder of this run
figure_path = os.path.join(figures_folder, analysis_name + '_clusters_distribution.png')
fig.savefig(figure_path)

In [None]:
# save final clusters

# NOTE(review): plain string concatenation inserts no path separator between
# the intermediate folder and the analysis name (and no "_" before
# "final_clusters"). The path is left unchanged because other code may read
# this exact location - confirm before switching to os.path.join.
path_to_store = intermediate_files_folder + analysis_name +  "final_clusters"

# per-chunk results together with the plot data derived from them
final_clusters = (results,beads_numbers,cluster_numbers)

# the with-block closes the file even when pickling fails
with open(path_to_store, "wb") as file_to_store:
    pickle.dump(final_clusters, file_to_store)

In [None]:
# stop the Ray runtime that was started with ray.init() above
ray.shutdown()