In [None]:
#This is MM-Integrator (MNase-seq & Micro-C Integrator)
#This script produces a file with the numbers of contacts between nucleosomes in a chosen locus
#It is built from a file with nucleosome positions and a Micro-C cooler with nucleosomal (200 bp) resolution
#The script was written to integrate https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1004653 and https://data.4dnucleome.org/experiment-set-replicates/4DNES14CNC1I/
#in order to get a nucleosome contact map of the mouse genome locus Igf2-H19
------------
#To start with, load the Micro-C contacts table of the chosen locus
#and discard the chromosome name columns, which are not used further
import pandas as pd
contacts = pd.read_table(
    'Igf2_H19',
    sep=r'\t',
    engine='python',
    names=[
        'chrom1_name', 'chrom1_start_locus', 'chrom1_end_locus',
        'chrom2_name', 'chrom2_start_locus', 'chrom2_end_locus',
        'number_of_contacts',
    ],
    index_col=False,
)
contacts = contacts.drop(columns=['chrom1_name', 'chrom2_name'])
contacts.head()

In [None]:
#Collapse every cooler bin to its center coordinate (nucl_1_center / nucl_2_center);
#the centers are used later to map nucleosome positions onto the cooler bins
#The bin start/end columns are removed once the centers are computed
#An empty nucl_1_name column is prepared as well:
#it will hold the serial name of the nucleosome mapped into the corresponding bin
bin_bounds = {
    'nucl_1_center': ('chrom1_start_locus', 'chrom1_end_locus'),
    'nucl_2_center': ('chrom2_start_locus', 'chrom2_end_locus'),
}
for center_col, (start_col, end_col) in bin_bounds.items():
    #midpoint of the bin, truncated to an integer coordinate
    contacts[center_col] = ((contacts[start_col] + contacts[end_col]) / 2).astype(int)
contacts = contacts.drop(
    columns=['chrom1_start_locus', 'chrom1_end_locus', 'chrom2_start_locus', 'chrom2_end_locus']
)
contacts['nucl_1_name'] = ''
contacts.head()

In [None]:
#Now let's deal with the nucleosome positioning file
#Only the start and end coordinates are kept, and the rows are ordered by start
nucleosomes = pd.read_table(
    'igfnucl.txt',
    sep=r'\t',
    engine='python',
    names=['chrom_name', 'nucl_start_locus', 'nucl_end_locus', 'mnase_protected'],
    index_col=False,
)
nucleosomes = (
    nucleosomes
    .drop(columns=['chrom_name', 'mnase_protected'])
    .sort_values(by='nucl_start_locus')
)
nucleosomes.head()

In [None]:
#The nucleosome positions data was rather confusing:
#it contained several variants of start and end positions for the same nucleosome,
#and some nucleosome areas overlapped each other
#So the data had to be unified, and a rather blunt approach is used:
#all positions sharing the same start are collapsed into a single one,
#whose end is the mean of their end coordinates (truncated to an integer)
consensus_ends = nucleosomes.groupby('nucl_start_locus', as_index=False)['nucl_end_locus'].mean()
nucleosomes = consensus_ends.astype(int)

In [None]:
#Now let's finally unify the nucleosome positions by handling the overlapping issue
#Greedy thinning over the positions in table order (they were sorted by start above):
#1.Keep the first position and use its start as the reference
#2.Drop every following position whose start is closer than MIN_NUCL_SPACING bp to the reference
#3.The first position that is far enough is kept and becomes the new reference;
#  repeat iteratively for each next (amongst the remaining) position
#The rows to keep are collected in one pass and the table is rebuilt once,
#instead of dropping rows one by one from the table while it is being iterated
MIN_NUCL_SPACING = 200  #minimal allowed distance (bp) between starts of two kept positions

kept_labels = []
reference_start = None
for label, start in nucleosomes['nucl_start_locus'].items():
    if reference_start is None:
        #the very first position is always kept and is the initial reference
        reference_start = start
        kept_labels.append(label)
        continue
    gap = start - reference_start
    if gap == 0:
        #same start as the reference: kept, reference unchanged
        kept_labels.append(label)
    elif gap >= MIN_NUCL_SPACING:
        #far enough from the reference: kept and promoted to the new reference
        reference_start = start
        kept_labels.append(label)
    #any other position is too close to the reference and is left out

#select by label so the surviving rows keep their original index labels
nucleosomes = nucleosomes.loc[kept_labels]
        

In [None]:
#Give every remaining nucleosome a name built from its serial number: nucl_0, nucl_1, ...
#The names are assigned by row position, in the current table order
import numpy as np
serial_numbers = np.arange(len(nucleosomes))
nucleosomes['nucl_names'] = ['nucl_' + str(number) for number in serial_numbers]
nucleosomes.head()

In [None]:
#Finally, we integrate the data
#For each cooler bin (first/left of the pair) we look for the nucleosome whose region
#contains the bin center (strictly between the nucleosome start and end);
#if several nucleosomes match, the first one in table order is taken
#Its name goes into the already prepared contacts column -- nucl_1_name
#Bins whose center is not covered by any nucleosome are left untouched
for index, center in contacts['nucl_1_center'].items():
    covering = nucleosomes[
        (nucleosomes['nucl_start_locus'] < center) & (nucleosomes['nucl_end_locus'] > center)
    ]['nucl_names']
    if len(covering) > 0:
        contacts.at[index, 'nucl_1_name'] = covering.iat[0]

In [None]:
#Do the same for each second/right cooler bin
#The nucl_2_name column is created up front with empty names, exactly like nucl_1_name,
#so that bins without a matching nucleosome look the same in both columns
#and the column exists even when no bin is matched at all
contacts['nucl_2_name'] = ''
for index, center in contacts['nucl_2_center'].items():
    #names of the nucleosomes whose region strictly contains the bin center
    covering = nucleosomes.loc[
        (nucleosomes['nucl_start_locus'] < center) & (nucleosomes['nucl_end_locus'] > center),
        'nucl_names',
    ]
    if not covering.empty:
        #if several nucleosomes match, the first one in table order is taken
        contacts.at[index, 'nucl_2_name'] = covering.iat[0]

In [None]:
#The coveted table of contacts between nucleosomes is ready
#Putting the columns in the order below makes the table easier to inspect visually;
#this step is optional
final_column_order = [
    'nucl_1_center',
    'nucl_2_center',
    'nucl_1_name',
    'nucl_2_name',
    'number_of_contacts',
]
contacts = contacts.reindex(columns=final_column_order)

In [None]:
#Save the integrated contacts table as a csv file (the index is not written)
contacts_csv_path = 'integrated_igf2_locus.csv'
contacts.to_csv(contacts_csv_path, index=False)

In [None]:
#Save the table of nucleosome positions (with their names) as well:
#it holds the regions occupied by the nucleosomes,
#which the contacts table does not contain
nucleosomes_csv_path = 'igf2_locus_nucleosomes.csv'
nucleosomes.to_csv(nucleosomes_csv_path, index=False)