# Module 2_1 -  nearest neighbours

**Function:** Prepares intermediate files used by module 2_2 for cluster identification.

**Flow:** For each structure and for each bead, it extracts the bead's neighbourhood, defined here as the k nearest beads.

**Input:** population of structures (filename format: cf_XXXXXX.coords.csv, where XXXXXX is the structure id left-padded with zeros to six digits), list of primes

**Output:** product matrix (full and per bead), frequencies matrix (pairwise co-occurrence), binary matrix (pairwise co-occurrence above a certain threshold)

**Usage:** Provide path to csv file with variables and run the notebook


<img src="2_1_neighbours_jup.png" width=800  align="left"/>


## path to parameters

In [None]:
### ENTER PATH TO CSV FILE WITH PARAMETERS ###
# Path of the csv file with the run parameters; it is opened and parsed by the
# parameter-loading cell further down, so it has to be set before running it.
path_to_parameters = ''

## libraries

In [None]:

import numpy as np
import scipy.spatial
from scipy import sparse
import ray
import os
import time
import re
import pickle
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import sys

## functions

In [None]:
def chunker(seq, size):
    """
    Split a sequence into consecutive chunks.

    input: seq - sequence supporting len() and slicing (eg. list)
    parameters: size - length of a chunk; the last chunk may be shorter
    output: generator producing the chunks in their original order

    """
    chunk_starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in chunk_starts)

In [None]:
def structure_file_to_distances_matrix(structure_file):

    """
    Read coordinates from a csv file and compute all pairwise distances.

    input: comma separated csv file with coordinates (only the first three columns are used)
    output: square matrix of pairwise distances, entry [i, j] is the distance between rows i and j
    """

    # parse the comma separated file into a 2D array
    table = np.genfromtxt(structure_file, delimiter=',')

    # keep the first three columns only
    xyz = table[:, :3]

    # condensed vector of all pairwise distances, expanded to the square form
    condensed = scipy.spatial.distance.pdist(xyz)
    return scipy.spatial.distance.squareform(condensed)

In [None]:
@ray.remote
def structure_to_neigbours(structure_file,neighbours,number):

    """
    Takes a csv file of a single structure and returns the k nearest beads
    of each bead as a sparse matrix, together with the csv id number.

    parameters:
        structure_file: path to the csv file with the coordinates of one structure
        neighbours: k, the number of nearest beads kept for each bead
        number: id of the structure, returned unchanged
    output:
        (sparse matrix [beads x k] holding 1-based bead ids ordered from the
        closest to the k-th closest bead, number)
    """

    # read csv and return a matrix with pairwise distances
    dist_mat = structure_file_to_distances_matrix(structure_file)

    # real distances are >= 0, so a negative diagonal guarantees that every bead
    # sorts first in its own row, even when another bead has identical coordinates;
    # the column dropped below is therefore always the bead itself
    np.fill_diagonal(dist_mat, -1.0)

    # argsort according to distances, returns indexes, + 1 to obtain beads ids
    sorted_bead_ids = np.argsort(dist_mat, axis = 1) + 1

    # trim sorted beads ids -> [all beads, first neighbour:k-th neighbour]
    nearest = sorted_bead_ids[:,1:neighbours+1]

    # ushort saves memory, but it would silently wrap around for bead ids above
    # its maximum, so a wider unsigned type is used for such structures
    if dist_mat.shape[0] <= np.iinfo(np.ushort).max:
        nearest = nearest.astype(np.ushort)
    else:
        nearest = nearest.astype(np.uint32)

    # convert to sparse form
    sparse_neighbours = sparse.csr_matrix(nearest)

    return sparse_neighbours,number

In [None]:
def prepare_file_name(number):
    """
    Build the coordinates file name of a structure.

    input: number - structure id given as a string
    output: file name 'cf_XXXXXX.coords.csv', the id is left-padded with zeros to six characters
    """
    padded_id = number.zfill(6)
    return f"cf_{padded_id}.coords.csv"

## loading parameters, building folders

In [None]:
# start time to trace execution time
# (the elapsed time since this point is printed after the neighbourhoods are extracted)

start_all = time.time()


In [None]:
# load parameters from csv file

# parse csv file with parameters: every row of the file becomes a list of strings;
# newline='' is how the csv module expects its input files to be opened, and the
# with-block closes the file on exit, so no explicit close() is needed
with open(path_to_parameters, 'rt', newline='') as csvfile:
    params = list(csv.reader(csvfile, skipinitialspace=True))

# list with setup parameters (the cells below read the value from column 1 of a row);
# the one-element wrapper list is kept under its original name for compatibility
paramaters = [params]

In [None]:
# check if correct mode
# the mode is read from row index 8, column index 1 of the parameters

mode = params[8][1]

# NOTE(review): for the 'fixed' mode only a message is printed; nothing here stops
# the following cells from running - confirm whether the run should be aborted instead.
if mode == 'fixed':
    print("incorrect mode - use either neighbours mode or 3_1_fixed")


In [None]:
# assign setup variables from params

def _param(row):
    """Return the value stored in column 1 of the given row of params."""
    return params[row][1]


home = _param(0)
number_of_structures = int(_param(1))
number_of_beads_per_structure = int(_param(2))
fraction = float(_param(3))
# number of structures corresponding to the given fraction of the population
structures_fraction = number_of_structures * fraction
cores = int(_param(4))
k = int(_param(5))
dataset_name = _param(6)

dataset_folder = _param(7)
# row 8 holds the mode and is read separately
chromosomal_borders_file = _param(9)
primes_file = _param(10)


In [None]:
# compose analysis name: <dataset name>_neighbours_<k>

analysis_name = "_".join((dataset_name, "neighbours", str(k)))

In [None]:
# print setup variables for manual inspection
# (one entry per printed line, empty strings give the blank separator lines)
setup_report = [
    "",
    f"preprocessing structures for {k} the nearrest neighbours",
    "",
    "loaded setup variables",
    "",
    f"home folder: {home}",
    f"dataset folder: {dataset_folder}",
    f"dataset name: {dataset_name}",
    f"number of structures: {number_of_structures}",
    f"number of beads per structure: {number_of_beads_per_structure}",
    f"fraction: {fraction}",
    f"k: {k}",
    f"cores: {cores}",
    "",
]

for report_line in setup_report:
    print(report_line)

In [None]:
# load and process helper data

# build paths for helper data files: both files are read from <home>/helper_data

helper_folder = os.path.join(home,'helper_data')

# primes array, indexed later as primes[bead id - 1]
# NOTE(review): allow_pickle=True unpickles objects stored in the file, which can
# execute arbitrary code - only load primes files coming from a trusted source.

primes_array = np.load(os.path.join(helper_folder,primes_file),allow_pickle=True)

# chromosomal borders array

chromosomal_borders = np.load(os.path.join(helper_folder,chromosomal_borders_file))

# transform borders matrix into 1D (sorted unique values of the borders array)
# NOTE(review): borders_array is not used anywhere else in the visible part of
# the notebook - confirm it is still needed.

borders_array = np.unique(chromosomal_borders)



In [None]:
# build folders structure: <home>/runs/<analysis name>/{intermediate/products, results, figures}

run_folder = os.path.join(home,'runs',analysis_name)
intermediate_files_folder = os.path.join(run_folder,'intermediate')
product_folder = os.path.join(intermediate_files_folder,"products")
results_folder = os.path.join(run_folder,'results')
figures_folder = os.path.join(run_folder,'figures')

print("building folders structure for the run")


# parents are listed before their children
folders = [run_folder,intermediate_files_folder,product_folder,results_folder,figures_folder]

for folder in folders:
    try:
        # makedirs also creates missing parent folders (e.g. <home>/runs on a first
        # run), where os.mkdir would fail with FileNotFoundError
        os.makedirs(folder)
        print("Directory " , folder ,  " Created ")
    except FileExistsError:
        # an already existing folder is reused as it is
        print("Directory " , folder ,  " already exists")

print("")

In [None]:
# start multiprocessing
# starts Ray so that the @ray.remote function defined above can be executed as tasks
# NOTE(review): `cores` from the parameters is not passed to ray.init(); it is only
# used as the size of the batches submitted in the extraction cell - confirm intended.

ray.init()

## extract neighbourhoods

In [None]:
# process structures in batch:
# csv -> distances -> k neighbours
# and prepare list of tuples (neighbours matrix,id of structure)
# should be ordered by ids

# NOTE(review): the listing is not used below in the visible code; it is kept
# because it fails early when dataset_folder does not exist.
files = os.listdir(dataset_folder)

# one list of (neighbours matrix, id) tuples per batch
sparse_matrices = []

# structure ids start at 1; at most `cores` tasks are submitted at a time and
# the whole batch is awaited before the next one is started
structure_numbers = list(range(1, number_of_structures + 1))

for numbers_chunk in chunker(structure_numbers, cores):
    ids = [
        structure_to_neigbours.remote(
            os.path.join(dataset_folder, prepare_file_name(str(number))), k, number
        )
        for number in numbers_chunk
    ]
    # ray.get returns the results in the order of the submitted tasks
    partial_results = ray.get(ids)
    sparse_matrices.append(partial_results)


# put all matrices in one list: flattened in a single pass, which also gives
# an empty list (instead of an IndexError) when there are no structures

final_list = list(itertools.chain.from_iterable(sparse_matrices))

# print length of the matrices list

print("Final list of neighbourhoods contains " + str(len(final_list)) + " elements\n")

# seconds elapsed since start_all
print(time.time() - start_all)

In [None]:
# save matrices -> <intermediate folder>/<analysis name>_sparse_neighbour_matrices_list
# the with-block closes the file also when pickling fails

sparse_list_path = os.path.join(intermediate_files_folder,analysis_name + '_sparse_neighbour_matrices_list')
with open(sparse_list_path, "wb") as file_to_store:
    pickle.dump(final_list, file_to_store)

## build one matrix containing all neighbourhoods

In [None]:
# one dense matrix with the neighbourhoods of all beads across all structures
# -> used for the frequencies; shape: [structures, beads, k]
# the values are bead ids (1-based), 0 means no neighbour

neigbhours = np.zeros((number_of_structures,number_of_beads_per_structure,k),dtype = int)

# fill in one structure at a time, in the order of the list
for x in range(number_of_structures):

    # first element of the tuple is the sparse neighbours matrix
    structure_x = final_list[x][0]
    # dense form of the [beads, k] matrix
    structure_x_dense = structure_x.todense()
    neigbhours[x] = structure_x_dense


## prepare frequency matrix and binary matrix

In [None]:
# co-occurrence matrix built from the neigbhours matrix, addressed by indexes:
# frequencies[x, y] = how many times bead id y+1 occurs in the neighbourhoods
# of the bead at index x, counted over all structures

frequencies = np.zeros((number_of_beads_per_structure,number_of_beads_per_structure),dtype = int)

# iterate over beads dimension in neighbourhood matrix
for x in range(number_of_beads_per_structure):

    # neighbourhood of bead x across all structures
    bead_x_all_structures = neigbhours[:,x,:]
    # distinct bead ids found there together with their numbers of occurrences
    val, counts = np.unique(bead_x_all_structures,return_counts=True)

    # id 0 is skipped; - 1 because the values in the neigbhours matrix are
    # bead ids, whereas the frequencies matrix goes by indexes
    is_bead = val != 0
    frequencies[x,val[is_bead] - 1] = counts[is_bead]




In [None]:
# binary matrix - used by the cluster identification step; generated from the
# frequencies matrix: 1 where the frequency reaches the threshold
# (structures_fraction), otherwise 0
# a single vectorized comparison replaces the Python loop over all pairs of
# beads; shape, dtype and values of the result are the same

binary = (frequencies >= structures_fraction).astype(int)



## convert neighbourhoods to products of primes

In [None]:
# get product: every neighbourhood is encoded as the product of the primes
# assigned to its beads

primes = primes_array

# bead numbers         [1,2,3,4,5]
# primes_array indexes [0,1,2,3,4]
# primes               [2,3,5,7,11]


# matrix to hold products, shape [structures, beads]; object dtype so that
# the entries are stored as objects and not as fixed-width numbers

products = np.zeros((number_of_structures,number_of_beads_per_structure),dtype = object)


# iterate over structures

for x, structure_entry in enumerate(final_list): # structures

    # sparse neighbours matrix of structure x, converted to the dense form
    structure_x = structure_entry[0]
    structure_x_dense = structure_x.todense()
    # number of neighbours stored per bead
    second_dimension = structure_x_dense.shape[1]

    # matrix for primes, initialised with 1: a bead id of 0 means no neighbour
    # and multiplying by 1 will not change the product
    beads_as_primes = np.ones((number_of_beads_per_structure,second_dimension),dtype = object)

    # for each bead and for each of its neighbours
    for i, j in itertools.product(range(number_of_beads_per_structure), range(second_dimension)):
        bead = structure_x_dense[i,j]
        if bead != 0:
            # bead id -> index in primes (see the mapping above)
            beads_as_primes[i,j] = primes[bead - 1]

    # neighbourhoods as products, one value per bead, assigned to the products matrix
    products[x] = np.prod(beads_as_primes,axis = 1)

    # progress report every 50 structures
    if x%50 == 0:
        print(x)


# save full product

np.save(product_folder + '/product_full', products)

print("full product saved")

# save products by beads (one file per bead, named with the 1-based bead id)
for b in range(number_of_beads_per_structure):
    np.save(product_folder + '/prod_' + str(b+1), products[:,b])

## prepare figures and save files

In [None]:
# heatmap of the frequencies matrix -> <figures folder>/<analysis name>_frequencies.png

sns.set_theme()
fig, ax = plt.subplots(figsize=(20,15))
# draw on the axes created above
sns.heatmap(frequencies, ax=ax)
frequencies_figure_path = os.path.join(figures_folder,analysis_name + '_frequencies.png')
fig.savefig(frequencies_figure_path)

In [None]:
# heatmap of the binary matrix -> <figures folder>/<analysis name>_binary.png

sns.set_theme()
fig, ax = plt.subplots(figsize=(20,15))
# draw on the axes created above
sns.heatmap(binary, ax=ax)
binary_figure_path = os.path.join(figures_folder,analysis_name + '_binary.png')
fig.savefig(binary_figure_path)

In [None]:
# save the binary and the frequencies matrices in the intermediate folder
# as <analysis name>_binary and <analysis name>_frequencies

for matrix_suffix, matrix_to_save in (('_binary', binary), ('_frequencies', frequencies)):
    np.save(os.path.join(intermediate_files_folder,analysis_name + matrix_suffix),matrix_to_save)