In this notebook we cluster each ensemble of models to check how variable or similar the models are and, in our case, to select the first cluster for further analysis.

# Parameters to modify

In [1]:
# Scale of the experiment (0.01 nm/bp unless changed)
scale = 0.01
# Clustering parameter: factor for equivalent positions
fact = 0.8
# Upper limit of models to use; of the ~1500 generated, the best 1000 are kept
modelsKeep = 1000
# Number of CPUs used for the clustering
n_cpus = 16
# Base path where the folder branch is stored
basePath = '/home/julen/TADdyn/SparseDataModelling/'
# Whether to store (True) or not (False) the individual model files
# of clusters with more than 10 models
store_cmm = False


# Libraries and functions 

In [1]:
import itertools
import os
import sys
import cPickle as pickle

import numpy as np
from pytadbit import load_structuralmodels

# Run 

## Define needed paths 

In [3]:
# Output path for pickles and plots
outdata = basePath + 'outData/modelAnalysis/clustering/'
outplot = basePath + 'outPlot/modelAnalysis/clustering/'

# create folders
! mkdir -p {outdata}
! mkdir -p {outplot}

## Import additional libraries 

In [4]:
# Make the helper modules kept in the 'code' folder under basePath importable;
# this must run before the import below
sys.path.append('%scode' % basePath)
import fileHandling

## Cluster

This code will load the paths of the model files.

In [5]:
# Paths where the models are stored: `models` maps cell -> region -> model file
# (the other two values are presumably the lists of cells and regions found)
cells, regionsAll, models = fileHandling.getModelsPaths(
    basePath,
    ending='models'
)

In [6]:
# Display the nested {cell: {region: model file path}} dictionary
models

{'Ery': {'b-globin': '/home/julen/TADdyn/TADdyn_tutorial/models/Ery/b-globin/Ery_b-globin_C200.0L0.0U0.0M300Res5000.models'},
 'Mon': {'b-globin': '/home/julen/TADdyn/TADdyn_tutorial/models/Mon/b-globin/Mon_b-globin_C200.0L-0.5U0.0M300Res5000.models'},
 'nCD4': {'b-globin': '/home/julen/TADdyn/TADdyn_tutorial/models/nCD4/b-globin/nCD4_b-globin_C200.0L0.0U0.0M300Res5000.models'}}

This code will cluster the models from each of the ensembles

In [8]:
# Cluster the models of every cell/region ensemble.
# Results are collected in `allClusters`, keyed by '<cell>_<region>'.
# When `store_cmm` is enabled, the models of every cluster with more than
# 10 members are additionally written as cmm files under `outdata`.
allClusters = {}
for cell in models:
    for regi in models[cell]:
        fi = models[cell][regi]
        print(fi)
        # Key under which the clusters of this ensemble are stored
        clusterKey = '%s_%s' % (cell, regi)
        # The distance cutoff is read from the file name: the number found
        # between the last '_C' and the following 'L'
        dcutoff = float(fi.split('_C')[-1].split('L')[0])
        outpath = outdata + '%s/%s' % (cell, regi)

        # Skip ensembles that are already clustered unless cmm files are wanted
        if clusterKey in allClusters and not store_cmm:
            continue

        # load the models
        models1 = load_structuralmodels(fi)
        # keep at most `modelsKeep` of the best models
        models1.define_best_models(min(len(models1), modelsKeep))
        # align the kept models before clustering
        models1.align_models(in_place=True)

        if clusterKey not in allClusters:
            # clusterize and remember the result
            models1.cluster_models(fact=fact, n_cpus=n_cpus, dcutoff=dcutoff)
            allClusters[clusterKey] = models1.clusters
        else:
            # NOTE(review): `allClusters` is emptied at the top of this cell,
            # so within one run this branch is not expected to be taken;
            # presumably meant for a previously loaded dictionary -- confirm.
            models1.clusters = allClusters[clusterKey]

        # Store cmm files of clusters with more than 10 models, but only if
        # the output folder of this ensemble does not exist yet (an existing
        # folder is taken as "already stored", even if it is incomplete)
        if store_cmm and not os.path.isdir(outpath):
            # os.makedirs raises on failure instead of failing silently like
            # a shell `mkdir` would
            os.makedirs(outpath)

            for clust in models1.clusters.keys():
                if len(models1.clusters[clust]) > 10:
                    # one sub-folder per cluster
                    outdir2 = outpath + "/" + str(clust)
                    if not os.path.isdir(outdir2):
                        os.makedirs(outdir2)
                    # translate the cluster members into model indexes
                    modelsClust = [models1[str(m)]['index'] for m in models1.clusters[clust]]
                    for i in modelsClust:
                        models1.write_cmm(outdir2, cluster=clust, model_num=i)

/home/julen/TADdyn/TADdyn_tutorial/models/nCD4/b-globin/nCD4_b-globin_C200.0L0.0U0.0M300Res5000.models
Number of singletons excluded from clustering: 0 (total singletons: 0)
Total number of clusters: 3
   Cluster #1 has 502 models [top model: 92300]
   Cluster #2 has 492 models [top model: 868328]
   Cluster #3 has 6 models [top model: 237553]

/home/julen/TADdyn/TADdyn_tutorial/models/Ery/b-globin/Ery_b-globin_C200.0L0.0U0.0M300Res5000.models
Number of singletons excluded from clustering: 0 (total singletons: 0)
Total number of clusters: 3
   Cluster #1 has 501 models [top model: 22353]
   Cluster #2 has 494 models [top model: 515201]
   Cluster #3 has 5 models [top model: 13992]

/home/julen/TADdyn/TADdyn_tutorial/models/Mon/b-globin/Mon_b-globin_C200.0L-0.5U0.0M300Res5000.models
Number of singletons excluded from clustering: 0 (total singletons: 0)
Total number of clusters: 2
   Cluster #1 has 512 models [top model: 496556]
   Cluster #2 has 488 models [top model: 512884]



This code will store the clustering data to be used in the future

In [9]:
# Store the clustering dictionary in the file whose path is held by
# `clustersPickle`. It is saved as soon as at least one ensemble was
# clustered; the previous `> 1` test silently skipped the case of a single
# ensemble.
if len(allClusters) > 0:
    with open(clustersPickle, "wb") as output_file:
        pickle.dump(allClusters, output_file)