# Libraries and functions 

In [1]:
from taddyn import Chromosome, Experiment
import numpy as np
import os
import shutil
from taddyn.modelling.impoptimizer  import IMPoptimizer
import copy
import sys
import datetime

In [2]:
def cToDot(text):
    """Decode a command-line-safe string back to its numeric spelling.

    Every 'c' becomes '.' and every 'n' becomes '-'; all other characters
    are copied through untouched.
    """
    decoding = {'c': '.', 'n': '-'}
    return ''.join(decoding.get(char, char) for char in text)

def dotToC(text):
    """Encode a string so it carries no '.' or '-' characters.

    Every '.' becomes 'c' and every '-' becomes 'n'; all other characters
    are copied through untouched.
    """
    encoding = {'.': 'c', '-': 'n'}
    return ''.join(encoding.get(char, char) for char in text)

def getParamCombi(lowfreq_arange, m_range, c_rangeDot, upfreq_range,
                 scriptsPath, dcutoff_range, matPath, jobTime, nmodels,
                 tempOut, cpu=False):
    """Build the command line of every valid parameter combination.

    One command is produced per (lowfreq, maxdist, upfreq) triplet that
    passes both filters below. Each command starts with the path to
    '01_NR_optimisation.py' (no interpreter prefix) followed by its flags.

    Parameters:
        lowfreq_arange: iterable of lowfreq values (flag -l).
        m_range: iterable of maxdist values (flag -m).
        c_rangeDot: dcutoff range encoded with dotToC, fields joined by
            '_' (flag -d); its first field is used as the starting dcutoff.
        upfreq_range: iterable of upfreq values (flag -u).
        scriptsPath: prefix prepended verbatim to the script name.
        dcutoff_range: accepted for interface compatibility; not read here.
        matPath: value passed with flag -p.
        jobTime: value passed with flag -t.
        nmodels: value passed with flag -nm.
        tempOut: value passed with flag -tp.
        cpu: value passed with flag -cpu; the flag is omitted when cpu
            compares equal to False.

    Returns:
        list of command strings, in iteration order.
    """
    template = ('%s01_NR_optimisation.py -l %s -d %s -m %s -u %s '
                '-p %s -t %s -nm %s ')
    commands = []
    for lowfreq in lowfreq_arange:
        for maxdist in m_range:
            # dcutoff may exceed maxdist by at most 50 (nm); otherwise the
            # whole maxdist value is skipped
            firstDcutoff = float(cToDot(c_rangeDot).split('_')[0])
            if not (maxdist - firstDcutoff >= -50):
                continue
            for upfreq in upfreq_range:
                # only keep pairs where upfreq is not below lowfreq
                if not (upfreq >= lowfreq):
                    continue
                command = template % (scriptsPath, lowfreq, c_rangeDot,
                                      maxdist, upfreq, matPath, jobTime,
                                      str(nmodels))
                if cpu != False:
                    command += '-cpu %s ' % str(cpu)
                command += '-tp %s' % tempOut
                commands.append(command)
    return commands

def stimateTime(nparticle):
    """Estimate the time needed to compute one model.

    Evaluates the quadratic 0.001*n**2 - 0.135*n + 48.116 on the number
    of particles. The notebook later feeds the result to
    datetime.timedelta(seconds=...), i.e. it is treated as seconds.
    """
    quadCoef, linCoef, offset = 0.001, -0.135, 48.116
    return quadCoef*nparticle**2 + (linCoef*nparticle) + offset

# Parameters to modify

In [3]:
## Optimisation parameters
# lowfreq values to test: -1.0, -0.5, 0.0 and 0.5 (the end value is excluded)
lowfreq_arange = np.arange(-1.0,1.0,0.5)
# dcutoff may overlap maxdist by at most 50 nm (enforced in getParamCombi).
# It should be smaller than maxdist and around "resol * scale * 2"
# (scale is 0.01 by default)
dcutoff_range= [100,400,100]  # start, end + step, step
# maxdist values to test: 200, 300 and 400 (the end value is excluded)
m_range= np.arange(200,500,100)
# upfreq values to test; combinations with upfreq < lowfreq are discarded
upfreq_range= np.arange(-1,1.0,0.5)
cpus = 8 # number of CPUs to use

# number of models requested from each optimisation command (flag -nm)
nmodels = 100
jobTime = '0-08:00:00' # days-hours:minutes:seconds of maximum running time per job

## Data paths (location of the base folder downloaded from GitHub)
# Must end with '/': later cells build paths by plain string concatenation
basePath = '/home/julen/TADdyn/TADdyn_tutorial/'

# Run

## Define additional paths 

In [4]:
# Folder holding the modelling scripts; getParamCombi prepends it verbatim
# to the script name, so the trailing '/' is required
scriptsPath = basePath + 'code/modellingScripts/'
# Folder passed to every optimisation command with -tp; its content is
# removed in the last cell of the notebook
tempOut = basePath + 'temporal/'

## Import additional libraries  

In [5]:
# Add the tutorial's 'code' folder to the module search path so the local
# helper module below can be imported (presumably it lives there)
sys.path.append(basePath + 'code')
import fileHandling

## Get matrix paths 

In [6]:
# As used later in this notebook: matrices[cell][regi] is the path of a
# matrix file, matricesLength[regi] is passed to stimateTime as the particle
# count and regionsAll is iterated as the list of regions.
# NOTE(review): exact return types depend on fileHandling - confirm there.
matricesLength, regionsAll, matrices = fileHandling.getMatricesPaths(basePath, starting='Matrix')

## First run of data 

### Get combinations of parameters and run commands

In [34]:
# For every cell and region, store the list of optimisation commands:
# combinations[cell][regi] -> list of command strings
combinations = {}
for cell, regionPaths in matrices.items():
    combinations[cell] = {}
    for regi, matPath in regionPaths.items():
        # dcutoff range as one '_'-joined token, encoded for the command line
        c_rangeDot = dotToC('_'.join(map(str, dcutoff_range)))
        combinations[cell][regi] = getParamCombi(
            lowfreq_arange=lowfreq_arange, m_range=m_range,
            c_rangeDot=c_rangeDot, upfreq_range=upfreq_range,
            scriptsPath=scriptsPath, dcutoff_range=dcutoff_range,
            matPath=matPath, jobTime=jobTime, nmodels=nmodels,
            tempOut=tempOut, cpu=cpus)

### Estimate total modelling time

In [8]:
# Estimate, region by region, how long the whole optimisation will take
for regi in regionsAll:
    print('--- %s ---' %regi)
    # Collect the command lists of every cell that has this region. Do not
    # rely on a `cell` variable left over from a previous loop, nor assume
    # that all cells have the same number of combinations.
    regionCombis = [combinations[cellName][regi] for cellName in combinations
                    if regi in combinations[cellName]]
    ncell = len(regionCombis)
    print('Counting %s cells' %ncell)
    ncombi = sum(len(combis) for combis in regionCombis)
    
    # each combination has n models, so we need to multiply
    totalModels = ncombi * nmodels
    
    # Finally we get the total time to do all the models in each region
    # (stimateTime gives the time per model, treated here as seconds)
    timePerModel = stimateTime(matricesLength[regi])
    totalTime = totalModels * timePerModel
    totalTime2 = str(datetime.timedelta(seconds=totalTime))
    
    print('%s models will be computed, in a median stimated time (with 1 CPU) of %s' %(
                                        totalModels, totalTime2))
    
    # Same estimate assuming the work is spread evenly over the CPUs
    totalTime2 = str(datetime.timedelta(seconds=totalTime/cpus))
    print("Stimated time with assigned number of %s CPU's: %s" %(cpus,
                                        totalTime2))
    print('')

--- b-globin ---
Counting 3 cells
9000 models will be computed, in a median stimated time (with 1 CPU) of 84 days, 4:46:30
Stimated time with assigned number of 8 CPU's: 10 days, 12:35:48.750000



### Run optimization 

In [18]:
for cell in combinations:
    print('## %s ##' %(cell))
    for regi in combinations[cell]:
        print('--- %s ---' %regi)
        for nc, combi in enumerate(combinations[cell][regi]):
            print('Combination %s' %(nc))
            ! python {combi}

## Ery ##
--- b-globin ---
Combination 0
####
Experiment test:
   resolution        : 5 kb
   TADs              : None
   Hi-C rows         : 942
   normalized        : visibility
   identifier        : GM128Rao
   cell type         : UNKNOWN
   restriction enzyme: UNKNOWN

942
Optimizing 80 particles
  num scale	kbending	maxdist	lowfreq	upfreq	dcutoff	correlation
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
Performing minimization run...
  1   0.01 	0       	200    	-1     	-1    	200    0.4514
  2   0.01 	0       	200    	-1     	-1    	100    0.7238
All models finished correctly
Combination 1
####
Experiment test:
   resolution        : 5 kb
   TADs              : None
   Hi-C rows         : 942
   normalized        : visibility
   identifier        : G

## Continue re-running until all models finish

In [36]:
## we will rerun the models until we finish all them or we reach 
# 10 steps
combinations_t = copy.deepcopy(combinations)
for cell in combinations_t:
    print('## %s ##' %(cell))
    for regi in combinations_t[cell]:
        print('--- %s ---' %regi)
        matPath = matrices[cell][regi]
        nchecks = 0
        while len(combinations_t[cell][regi]) > 0 or nchecks < 10:
            combinations2 = copy.copy(combinations_t[cell][regi])
            combinations_t[cell][regi] = []
            for nc, combi in enumerate(combinations2):
                # get paths and check if the modelling finished
                path = '/'.join(matPath.split('/')[:-1]) + '/'
                jobName = 'LF%sUF%sMdis%s_%sbp' %(combi.split()[2], combi.split()[8], combi.split()[6], 
                                            matPath.split('_')[-1][:-2])
                keep_restart_out_dir = path + 'lammpsSteps/jobArray_%s/' %jobName
                if os.path.isdir(keep_restart_out_dir):
                    print('Combination %s' %(nc))
                    # then it didnt finished
                    combinations_t += [combi]
                    ! python {combi}
            nchecks += 1
        
        

## Mon ##
--- b-globin ---
## Ery ##
--- b-globin ---
## nCD4 ##
--- b-globin ---


## Clean temporary folders

In [8]:
# Names of all the entries currently inside the temporary output folder
tempFolders = os.listdir(tempOut)

In [10]:
# Delete every temporary folder listed in tempFolders
for t in tempFolders:
    # function-call form works in both Python 2 and 3 and matches the
    # rest of the notebook
    print(t)
    tempEntry = os.path.join(tempOut, t)
    # rmtree only accepts directories; plain files or entries that are
    # already gone are left alone instead of aborting the clean-up
    if os.path.isdir(tempEntry):
        shutil.rmtree(tempEntry)

sqzn
uyxb
CRil
cVeT
GAZl
xTjj
GbFD
epME
UGbc
GrNS
