<a href="https://colab.research.google.com/github/kundajelab/locusselect/blob/master/examples/regression%20on%20200%20bp%20genome%20bins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook setup: auto-reload edited modules, render figures inline,
# silence warnings and seed NumPy's global random generator.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
# NOTE(review): this hides every warning for the rest of the session,
# including deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings('ignore')
from numpy.random import seed
# Fixed seed for NumPy's global generator.
seed(1234)

In [2]:
import numpy as np
from sklearn import manifold
from apricot import FacilityLocationSelection
from apricot import FeatureBasedSelection
import matplotlib
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import umap
from sklearn.manifold import TSNE

In [3]:
from sklearn.decomposition import PCA
import seaborn as sns
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [4]:
import locusselect 
from locusselect.embeddings import * 
from locusselect.deeplift import * 
from locusselect.utils import *
import os
import pickle
from pybedtools import BedTool

Using TensorFlow backend.


In [5]:
def umap_transform(peak_data, n_components=3, random_state=25, metric = 'correlation',n_neighbors=15,min_dist=0.1):
    """Fit UMAP on ``peak_data`` and return the transformed coordinates.

    Parameters
    ----------
    peak_data : array-like or object exposing ``toarray()``
        Input matrix. Objects with a ``toarray`` method are converted with it
        before fitting.
    n_components, random_state, metric, n_neighbors, min_dist
        Forwarded unchanged to ``umap.UMAP``.

    Returns
    -------
    The value returned by ``umap.UMAP.fit_transform``.
    """
    umap_obj = umap.UMAP(n_components=n_components, random_state=random_state,
        min_dist=min_dist,metric=metric,n_neighbors=n_neighbors)
    # Convert only when the input actually offers toarray(). The previous
    # bare ``except`` re-ran the fit on *any* failure (including
    # KeyboardInterrupt), doubling the runtime and masking the real error.
    if hasattr(peak_data, 'toarray'):
        peak_data = peak_data.toarray()
    return umap_obj.fit_transform(peak_data)

In [6]:
def performEmbeddings ( data_list, n_components=2, random_state=2, metric= 'correlation' ) :
    """Run ``umap_transform`` on every matrix in ``data_list``.

    Parameters
    ----------
    data_list : iterable
        Matrices to embed, one embedding is produced per entry.
    n_components : int
        Output dimensionality passed to ``umap_transform``.
    random_state : int
        Seed passed to ``umap_transform``.
    metric : str
        Distance metric passed to ``umap_transform``.

    Returns
    -------
    list
        One embedding per input matrix, in input order.
    """
    # The arguments are forwarded as given. Previously the literals 2 / 2 and
    # the notebook-global ``distanceMeasure`` were used instead, so the
    # parameters were ignored and the function raised NameError unless that
    # global had been defined first.
    return [
        umap_transform(data, n_components=n_components,
                       random_state=random_state, metric=metric)
        for data in data_list
    ]

In [7]:
def makeGeneralAnnotation ( Ind_list, label_list, k ) :
    """Build a numeric label vector and the label-name -> code mapping.

    Every position starts with code 0 (named 'All'). The i-th index group in
    ``Ind_list`` is given code ``i + 1`` and the name ``label_list[i]``;
    later groups overwrite earlier ones where their indices coincide.

    Returns
    -------
    (num_labels, label_dict)
        ``num_labels`` is a length-``k`` float array of codes and
        ``label_dict`` maps each label name to its code.
    """
    num_labels = np.zeros(k)
    label_dict = dict(All=0)

    for code, members in enumerate(Ind_list, start=1) :
        num_labels[members] = code
        label_dict[label_list[code - 1]] = code

    return (num_labels, label_dict)

In [8]:
def plot_embedding_2d(embedding, num_labels=None, label_dict=None, s_dict=None, a_dict=None, cmap=None, norm=None, title='' ):
    """Scatter-plot the first two columns of ``embedding`` on a new figure.

    Parameters
    ----------
    embedding : array indexed as ``embedding[:, 0]`` / ``embedding[:, 1]``
        Coordinates to plot.
    num_labels : array, optional
        Numeric code per point; compared against the values of ``label_dict``.
    label_dict : dict, optional
        Maps label name -> numeric code. When either ``num_labels`` or
        ``label_dict`` is None all points are drawn in a single group.
    s_dict, a_dict : dict, optional
        Per-label marker size / alpha, keyed by label name. Defaults are
        size 4 and alpha 1.
    cmap, norm
        Accepted for interface compatibility; not used by this function.
    title : str
        Axes title.
    """
    fig = plt.figure(figsize=(7,6))
    ax = fig.add_subplot(111)
    if num_labels is None or label_dict is None :
        ax.scatter(embedding[:,0], embedding[:,1], s=4, alpha=1)
    else :
        # A single-entry label_dict would otherwise divide by zero.
        color_span = float(max(len(label_dict) - 1, 1))
        for label, code in label_dict.items() :
            Ind = np.where(num_labels == code)[0]
            color = cm.jet(code / color_span)
            dotsize = 4 if s_dict is None else s_dict[label]
            alpha = 1 if a_dict is None else a_dict[label]
            # Pass the colour as a one-row 2-D sequence: a bare RGBA tuple is
            # ambiguous to scatter() when a group happens to hold 4 points.
            ax.scatter(embedding[Ind,0], embedding[Ind,1], s=dotsize, alpha=alpha, c=[color], label=label)

        # Widen the x range to the left and the y range upwards by half the
        # data extent, leaving room for the legend in the upper-left corner.
        x_lo, x_hi = min(embedding[:,0]), max(embedding[:,0])
        y_lo, y_hi = min(embedding[:,1]), max(embedding[:,1])
        ax.set_xlim( x_lo - ( ( x_hi - x_lo ) * 0.5 ), x_hi )
        ax.set_ylim( y_lo, y_hi + ( ( y_hi - y_lo ) * 0.5 ) )
        ax.legend(loc='upper left',prop={'size': 12})
    # The title is applied in both branches; it used to be dropped silently
    # when no labels were supplied.
    ax.set_title(title)

In [9]:
def selectSubset ( data_matrix, n = None, distance = 'euclidean', initial_subset = None, selection_subset = None ) :
    """Rank rows of ``data_matrix`` with ``FacilityLocationSelection``.

    Parameters
    ----------
    data_matrix : indexable matrix
        Rows are the candidates to select from.
    n : int, optional
        First positional argument of the selector; defaults to the number of
        rows available after applying ``selection_subset``.
    distance : str
        Second positional argument of the selector.
    initial_subset : optional
        Forwarded as ``initial_subset`` only when it is not None.
    selection_subset : optional
        Row indices restricting the candidates. The returned ranking is
        mapped back through it, so it must support array-style indexing.

    Returns
    -------
    (SubsetInd, Gains)
        The selector's ``ranking`` (in original row numbering) and ``gains``.
    """
    restricted = selection_subset is not None
    if restricted :
        data_matrix = data_matrix[selection_subset]

    if n is None :
        n = np.shape(data_matrix)[0]

    # Only pass initial_subset when the caller supplied one.
    optional_args = {} if initial_subset is None else {'initial_subset': initial_subset}
    selector = FacilityLocationSelection(n, distance, **optional_args)

    selector.fit_transform(data_matrix)
    chosen = selector.ranking
    gain_values = selector.gains

    if restricted :
        # Translate positions within the restricted matrix back to row ids.
        chosen = selection_subset[chosen]

    return chosen, gain_values

In [10]:
def makeBed(labels) :
    """Convert byte labels of the form ``b'<chrom>_<start>_<end>'`` to a table.

    Each label is decoded and split on ``'_'``. The result is a structured
    array with fields ``f0`` (chromosome, stored as at most 5 bytes),
    ``f1`` (start + 1) and ``f2`` (end).
    """
    def _column(position, convert) :
        # Decode and split every label, keeping one converted field.
        return np.array([ convert(raw.decode().split('_')[position]) for raw in labels ])

    chroms = _column(0, str)
    begins = _column(1, int)
    stops = _column(2, int)

    table = np.zeros(len(chroms), dtype=[('f0', 'S5'), ('f1', 'i4'), ('f2', 'i4')])
    table['f0'] = chroms
    table['f1'] = begins + 1
    table['f2'] = stops
    return table

In [11]:
def intersectBeds ( bed1, bed2, pickLargestOverlap = True ) :
    """Find overlapping intervals between two structured BED-like arrays.

    Both inputs are structured arrays whose fields ``f0``, ``f1``, ``f2``
    hold chromosome, start and end (the layout produced by ``makeBed``).
    Two intervals overlap when ``max(start1, start2) < min(end1, end2)``;
    the overlap length is the difference of those two values.

    Parameters
    ----------
    bed1, bed2 : structured arrays
        Interval tables to compare.
    pickLargestOverlap : bool
        If true, keep for every ``bed1`` interval only the ``bed2`` partner
        with the largest overlap. If false, return every overlapping pair.

    Returns
    -------
    (ind1, ind2, length)
        Three parallel arrays: index into ``bed1``, index into ``bed2`` and
        overlap length. All three are empty when nothing overlaps.
    """
    shared_chroms = np.intersect1d(np.unique(bed1['f0']), np.unique(bed2['f0']))
    overlapInd1 = []; overlapInd2 = []; overlapLen = []
    for chrom in shared_chroms :
        Ind1 = np.where(bed1['f0']==chrom)[0]
        Ind2 = np.where(bed2['f0']==chrom)[0]
        midPoint1 = ( bed1['f2'][Ind1] + bed1['f1'][Ind1] ) / 2
        midPoint2 = ( bed2['f2'][Ind2] + bed2['f1'][Ind2] ) / 2
        int1 = bed1['f2'][Ind1] - bed1['f1'][Ind1] + 1
        int2 = bed2['f2'][Ind2] - bed2['f1'][Ind2] + 1
        # Pre-filter: only pairs whose midpoints are closer than twice the
        # longest interval on this chromosome are tested for overlap.
        maxInt = 2 * np.max(np.append(int1,int2))
        for I in range(len(Ind1)) :
            dist = np.abs( midPoint1[I] - midPoint2 )
            closeInd = Ind2[ np.where( dist < maxInt )[0] ]
            for I2 in closeInd :
                maxStart = max(bed1[Ind1[I]][1], bed2[I2][1])
                minEnd = min(bed1[Ind1[I]][2], bed2[I2][2])
                if maxStart < minEnd:
                    overlapInd1.append(Ind1[I])
                    overlapInd2.append(I2)
                    overlapLen.append(minEnd-maxStart)

    overlapInd1 = np.array(overlapInd1); overlapInd2 = np.array(overlapInd2); overlapLen = np.array(overlapLen)

    if not pickLargestOverlap :
        # Previously this case fell through and returned three empty arrays,
        # discarding every overlap that had just been computed.
        return (overlapInd1,overlapInd2,overlapLen)

    uniqueOverlapInd1 = []; uniqueOverlapInd2 = []; uniqueOverlapLen = []
    for uI in np.unique(overlapInd1) :
        Ind = np.where(overlapInd1==uI)[0]
        # Last position of the ascending argsort is the largest overlap.
        best = Ind[np.argsort(overlapLen[Ind])[-1]]
        uniqueOverlapInd1.append(uI)
        uniqueOverlapInd2.append(overlapInd2[best])
        uniqueOverlapLen.append(overlapLen[best])

    uniqueOverlapInd1 = np.array(uniqueOverlapInd1); uniqueOverlapInd2 = np.array(uniqueOverlapInd2)
    uniqueOverlapLen = np.array(uniqueOverlapLen)

    return (uniqueOverlapInd1,uniqueOverlapInd2,uniqueOverlapLen)

In [12]:
def get_embedding_data_list_fc(bed_list, layer):
    """Compute embeddings at ``layer`` for every BED file in ``bed_list``.

    For each file the path and layer are printed, ``compute_embeddings`` is
    called with a fixed model/reference configuration (flank 500, centred on
    the summit), and the returned regions and embeddings are written to
    ``<bed file name>.fc.npz`` in the current working directory.

    Returns
    -------
    list
        The embeddings returned by ``compute_embeddings``, one per BED file.
    """
    collected = []

    for bed_file in bed_list:
        print(bed_file)
        print(layer)

        request = dict(
            input_bed_file=bed_file,
            model_hdf5="/data/locusselect/k562_dnase_dl_models/k562_classification/DNASE.K562.classification.SummitWithin200bpCenter.0",
            ref_fasta="/data/refs/hg19/male.hg19.fa",
            center_on_summit=True,
            flank=500,
            embedding_layer=layer,
            expand_dims=True,
            threads=20,
        )
        regions, embeddings = compute_embeddings(request)

        # Output is named after the last path component of the BED file.
        out_name = bed_file.rsplit('/', 1)[-1] + '.fc.npz'
        np.savez_compressed(out_name, bed_entries=regions, fc_scores=embeddings)

        collected.append(embeddings)

    return collected

In [13]:
def get_embedding_data_list_conv(bed_list, layer, npz_suffix=None):
    """Compute position-pooled embeddings at ``layer`` for every BED file.

    For each file the path and layer are printed, ``compute_embeddings`` is
    called with a fixed model/reference configuration (flank 125, centred on
    the summit, ``global_pool_on_position`` enabled), and the returned
    regions and embeddings are written to ``<bed file name><npz_suffix>`` in
    the current working directory.

    Parameters
    ----------
    bed_list : iterable of str
        Paths of the BED files to process.
    layer
        Value passed as ``embedding_layer``.
    npz_suffix : str, optional
        Suffix of the saved archive. Defaults to ``'.conv.layer<layer>.npz'``.
        The function used to write ``'.fc.npz'``, which overwrote the files
        saved by ``get_embedding_data_list_fc`` and made runs for different
        layers overwrite each other.

    Returns
    -------
    list
        The embeddings returned by ``compute_embeddings``, one per BED file.
    """
    if npz_suffix is None:
        npz_suffix = '.conv.layer{}.npz'.format(layer)

    dl_conv_data_list = []

    for bed_file in bed_list:

        print(bed_file)
        print(layer)

        conv_embedding_args={"input_bed_file":bed_file,
                   "model_hdf5":"/data/locusselect/k562_dnase_dl_models/k562_classification/DNASE.K562.classification.SummitWithin200bpCenter.0",
                   "ref_fasta":"/data/refs/hg19/male.hg19.fa",
                   "center_on_summit":True,
                   "flank":125,
                   "embedding_layer":layer,
                   "expand_dims":True,
                   "threads":20,
                   "global_pool_on_position":True}
        conv_regions, conv_embeddings = compute_embeddings(conv_embedding_args)
        # Array keys are kept as before (bed_entries / fc_scores) so existing
        # readers of these archives keep working.
        np.savez_compressed(bed_file.split('/')[-1]+npz_suffix,bed_entries=conv_regions,fc_scores=conv_embeddings)

        dl_conv_data_list.append(conv_embeddings)

    return dl_conv_data_list

In [14]:
# Collect every file ending in 'bed' from the coordinates directory and open
# each one as a BedTool.
# NOTE(review): os.listdir() returns entries in arbitrary order, and the rest
# of the notebook pairs these lists positionally with data lists loaded from
# pickles and from another directory. Confirm the orders agree before relying
# on the pairing (sorting here would require regenerating the cached pickles).
bed_list = ['/users/soumya.kundu/locusselect/coordinates/'+x for x in os.listdir('/users/soumya.kundu/locusselect/coordinates/') if x.endswith('bed')]
dl_bed_list = [BedTool(x) for x in bed_list]

In [22]:
# Embeddings for every BED file, taken at layer index -2.
dl_fc_data_list = get_embedding_data_list_fc(bed_list, -2)

/users/soumya.kundu/locusselect/coordinates/coordinates_5.bed
-2
got model architecture
loaded model weights
loaded model
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 1000, 4)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 1000, 300)      23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 1000, 300)      1200      
_________________________________________________________________
activation_1 (Activation)    (None, 1, 1000, 300)      0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 333, 300)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 333, 200)       

got embeddings
got region labels
/users/soumya.kundu/locusselect/coordinates/coordinates_2.bed
-2
got model architecture
loaded model weights
loaded model
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 1000, 4)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 1000, 300)      23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 1000, 300)      1200      
_________________________________________________________________
activation_1 (Activation)    (None, 1, 1000, 300)      0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 333, 300)       0         
_________________________________________________________________
conv2d_2 (Conv2D)     

got embeddings
got region labels


In [23]:
# Position-pooled embeddings for every BED file, taken at layer index 1.
dl_conv_data_list = get_embedding_data_list_conv(bed_list, 1)

/users/soumya.kundu/locusselect/coordinates/coordinates_5.bed
1
got model architecture
loaded model weights
loaded model
Could not transfer weights for layer:dense_1
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 250, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 250, 300)       23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 250, 300)       1200      
_________________________________________________________________
global_average_pooling2d_1 ( (None, 300)               0         
Total params: 24,300
Trainable params: 23,700
Non-trainable params: 600
_________________________________________________________________
None
created data generator


  new_model = Model(input = model.input, output = flat_embedding)


got embeddings
got region labels
/users/soumya.kundu/locusselect/coordinates/coordinates_1.bed
1
got model architecture
loaded model weights
loaded model
Could not transfer weights for layer:dense_1
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 250, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 250, 300)       23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 250, 300)       1200      
_________________________________________________________________
global_average_pooling2d_2 ( (None, 300)               0         
Total params: 24,300
Trainable params: 23,700
Non-trainable params: 600
_________________________________________________________________
None
created data generator


  new_model = Model(input = model.input, output = flat_embedding)


got embeddings
got region labels
/users/soumya.kundu/locusselect/coordinates/coordinates_3.bed
1
got model architecture
loaded model weights
loaded model
Could not transfer weights for layer:dense_1
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 250, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 250, 300)       23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 250, 300)       1200      
_________________________________________________________________
global_average_pooling2d_3 ( (None, 300)               0         
Total params: 24,300
Trainable params: 23,700
Non-trainable params: 600
_________________________________________________________________
None
created data generator


  new_model = Model(input = model.input, output = flat_embedding)


got embeddings
got region labels
/users/soumya.kundu/locusselect/coordinates/coordinates_2.bed
1
got model architecture
loaded model weights
loaded model
Could not transfer weights for layer:dense_1
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 250, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 250, 300)       23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 250, 300)       1200      
_________________________________________________________________
global_average_pooling2d_4 ( (None, 300)               0         
Total params: 24,300
Trainable params: 23,700
Non-trainable params: 600
_________________________________________________________________
None
created data generator


  new_model = Model(input = model.input, output = flat_embedding)


got embeddings
got region labels
/users/soumya.kundu/locusselect/coordinates/coordinates_0.bed
1
got model architecture
loaded model weights
loaded model
Could not transfer weights for layer:dense_1
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 250, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 250, 300)       23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 250, 300)       1200      
_________________________________________________________________
global_average_pooling2d_5 ( (None, 300)               0         
Total params: 24,300
Trainable params: 23,700
Non-trainable params: 600
_________________________________________________________________
None
created data generator


  new_model = Model(input = model.input, output = flat_embedding)


got embeddings
got region labels
/users/soumya.kundu/locusselect/coordinates/coordinates_4.bed
1
got model architecture
loaded model weights
loaded model
Could not transfer weights for layer:dense_1
obtained embedding layer model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1_input (InputLayer)  (None, 1, 250, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 250, 300)       23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1, 250, 300)       1200      
_________________________________________________________________
global_average_pooling2d_6 ( (None, 300)               0         
Total params: 24,300
Trainable params: 23,700
Non-trainable params: 600
_________________________________________________________________
None
created data generator


  new_model = Model(input = model.input, output = flat_embedding)


got embeddings
got region labels


In [24]:
#with open('class_conv_data_list.pickle', 'wb') as fp:
#    pickle.dump(dl_conv_data_list, fp)

#with open('class_fc_data_list.pickle', 'wb') as fp:
#    pickle.dump(dl_fc_data_list, fp)

In [15]:
# Load previously saved embedding data from the working directory. This
# replaces any dl_conv_data_list / dl_fc_data_list already in the kernel.
# NOTE(review): the cell that would write these files is commented out, so
# they must already exist on disk; pickle.load should only be used on
# trusted files.
with open ('class_conv_data_list.pickle', 'rb') as fp:
    dl_conv_data_list = pickle.load(fp)
    
with open ('class_fc_data_list.pickle', 'rb') as fp:
    dl_fc_data_list = pickle.load(fp)

In [16]:
# Distance metric name handed to performEmbeddings in the cells below.
distanceMeasure = 'correlation'

In [None]:
# UMAP embeddings of each conv / fc data matrix using the configured metric.
dl_conv_embedding_list = performEmbeddings(dl_conv_data_list,metric=distanceMeasure)
dl_fc_embedding_list = performEmbeddings(dl_fc_data_list,metric=distanceMeasure)

In [None]:
# Cache the UMAP embeddings in the working directory.
with open('class_conv_embedding_list.pickle', 'wb') as fp:
    pickle.dump(dl_conv_embedding_list, fp)

with open('class_fc_embedding_list.pickle', 'wb') as fp:
    pickle.dump(dl_fc_embedding_list, fp)

In [None]:
# Reload the cached UMAP embeddings (same file names as the dump cell above).
with open ('class_conv_embedding_list.pickle', 'rb') as fp:
    dl_conv_embedding_list = pickle.load(fp)
    
with open ('class_fc_embedding_list.pickle', 'rb') as fp:
    dl_fc_embedding_list = pickle.load(fp)

In [None]:
# One unlabeled scatter plot per conv embedding; the BED path printed above
# each figure is matched to the embedding purely by list position.
for embedding, file_ in zip(dl_conv_embedding_list, bed_list):
    print(file_)
    plot_embedding_2d(embedding)

In [None]:
# One unlabeled scatter plot per fc embedding; the BED path printed above
# each figure is matched to the embedding purely by list position.
for embedding, file_ in zip(dl_fc_embedding_list, bed_list):
    print(file_)
    plot_embedding_2d(embedding)

In [None]:
#dl_conv_SSIndex_list = []
#for data in dl_conv_data_list:
#    N = np.shape(data)[0]
#    SubsetInds,gains = selectSubset( data, n = N, distance = 'corr' )
#    dl_conv_SSIndex_list.append(SubsetInds)

In [None]:
#dl_fc_SSIndex_list = []
#for data in dl_fc_data_list:
#    N = np.shape(data)[0]
#    SubsetInds,gains = selectSubset( data, n = N, distance = 'corr' )
#    dl_fc_SSIndex_list.append(SubsetInds)

In [None]:
# Annotation intervals intersected with each coordinate BED file below.
k562_ccREs = BedTool('/data/locusselect/k562_annotations/K562-ccREs.bed10.hg19Lifted.bed')

In [None]:
# For each fc data matrix / BED file pair: find the rows whose interval is
# returned by the intersection with k562_ccREs, then run subset selection
# seeded with those rows.
# NOTE(review): histone_allccREIndex_list and histone_allccRESSIndex_list are
# initialised here but never filled in this cell.
histone_ccRESSIndex_list = []; histone_ccREIndex_list = []; histone_allccREIndex_list = []; histone_allccRESSIndex_list = [];
for data,bed in zip(dl_fc_data_list,dl_bed_list):
    N = np.shape(data)[0]
    # presumably bedtools-intersect flags: report each `bed` entry once (u)
    # with 50% reciprocal overlap (f=0.5, r) — confirm against bedtools docs
    myintersect = bed.intersect(k562_ccREs, u=True, f=0.5, r=True)
    a = [x for x in bed]
    b = [y for y in myintersect]
    # c collects the positions in `bed` of intervals present in the
    # intersection; `val in b` scans the list b for every entry of a.
    c = []
    for ind,val in enumerate(a):
        if val in b:
            c.append(ind)
    # Request N - len(c) further rows on top of the initial subset c.
    ccRESubsetInds,ccREgains = selectSubset( data, n = ( N - len(c) ), distance = 'corr', initial_subset = c )
    histone_ccRESSIndex_list.append(ccRESubsetInds); histone_ccREIndex_list.append(c)

In [None]:
# Plot each fc embedding with two highlighted groups: the first ss_size
# entries of the selection ranking ('Subset') and the ccRE-overlapping rows
# ('ccRE'). Marker size and alpha are looked up per label name.
ss_size = 50
size_dict = { 'Subset' : 32, 'All' : 4, 'ccRE': 32 }
alpha_dict = { 'Subset' : 1, 'All' : 0.125, 'ccRE': 1 }

# SSIndex holds the ccRE-overlap indices, ccRESSIndex the selection ranking;
# ix is not used in the loop body.
for SSIndex, embedding, ccRESSIndex, ix in zip (histone_ccREIndex_list,dl_fc_embedding_list,histone_ccRESSIndex_list, range(len(histone_ccRESSIndex_list))) :
    num_labels, label_dict = makeGeneralAnnotation ( [ccRESSIndex[0:ss_size], SSIndex], 
        ['Subset', 'ccRE'], k = np.shape(embedding)[0] )
    plot_embedding_2d(embedding, num_labels, label_dict, s_dict = size_dict, a_dict = alpha_dict)

In [None]:
# Same procedure as for the fc data, applied to the conv data matrices. The
# histone_* result lists are reset here, so earlier results are discarded.
# NOTE(review): histone_allccREIndex_list and histone_allccRESSIndex_list are
# initialised here but never filled in this cell.
histone_ccRESSIndex_list = []; histone_ccREIndex_list = []; histone_allccREIndex_list = []; histone_allccRESSIndex_list = [];
for data,bed in zip(dl_conv_data_list,dl_bed_list):
    N = np.shape(data)[0]
    # presumably bedtools-intersect flags: report each `bed` entry once (u)
    # with 50% reciprocal overlap (f=0.5, r) — confirm against bedtools docs
    myintersect = bed.intersect(k562_ccREs, u=True, f=0.5, r=True)
    a = [x for x in bed]
    b = [y for y in myintersect]
    # c collects the positions in `bed` of intervals present in the
    # intersection; `val in b` scans the list b for every entry of a.
    c = []
    for ind,val in enumerate(a):
        if val in b:
            c.append(ind)
    # Request N - len(c) further rows on top of the initial subset c.
    ccRESubsetInds,ccREgains = selectSubset( data, n = ( N - len(c) ), distance = 'corr', initial_subset = c )
    histone_ccRESSIndex_list.append(ccRESubsetInds); histone_ccREIndex_list.append(c)

In [None]:
# Plot each conv embedding with two highlighted groups: the first ss_size
# entries of the selection ranking ('Subset') and the ccRE-overlapping rows
# ('ccRE'). Marker size and alpha are looked up per label name.
ss_size = 50
size_dict = { 'Subset' : 32, 'All' : 4, 'ccRE': 32 }
alpha_dict = { 'Subset' : 1, 'All' : 0.125, 'ccRE': 1 }

# SSIndex holds the ccRE-overlap indices, ccRESSIndex the selection ranking;
# ix is not used in the loop body.
for SSIndex, embedding, ccRESSIndex, ix in zip (histone_ccREIndex_list,dl_conv_embedding_list,histone_ccRESSIndex_list, range(len(histone_ccRESSIndex_list))) :
    num_labels, label_dict = makeGeneralAnnotation ( [ccRESSIndex[0:ss_size], SSIndex], 
        ['Subset', 'ccRE'], k = np.shape(embedding)[0] )
    plot_embedding_2d(embedding, num_labels, label_dict, s_dict = size_dict, a_dict = alpha_dict)

In [None]:
# Position-pooled embeddings for every BED file, taken at layer index 9.
dl_conv2_data_list = get_embedding_data_list_conv(bed_list, 9)

In [None]:
# Cache the layer-9 embedding data in the working directory.
with open('class_conv2_data_list.pickle', 'wb') as fp:
    pickle.dump(dl_conv2_data_list, fp)

In [None]:
# Reload the cached layer-9 embedding data (same file name as the dump above).
with open ('class_conv2_data_list.pickle', 'rb') as fp:
    dl_conv2_data_list = pickle.load(fp)

In [None]:
# UMAP embeddings of each layer-9 data matrix using the configured metric.
dl_conv2_embedding_list = performEmbeddings(dl_conv2_data_list,metric=distanceMeasure)

In [None]:
# Cache the layer-9 UMAP embeddings in the working directory.
with open('class_conv2_embedding_list.pickle', 'wb') as fp:
    pickle.dump(dl_conv2_embedding_list, fp)

In [None]:
# Reload the cached layer-9 UMAP embeddings. The file name matches the one
# written by the dump cell ('class_conv2_embedding_list.pickle'); the former
# name 'dl_conv2_embedding_list.pickle' is never written by this notebook.
with open ('class_conv2_embedding_list.pickle', 'rb') as fp:
    dl_conv2_embedding_list = pickle.load(fp)

In [None]:
# One unlabeled scatter plot per layer-9 embedding; the BED path printed
# above each figure is matched to the embedding purely by list position.
for embedding, file_ in zip(dl_conv2_embedding_list, bed_list):
    print(file_)
    plot_embedding_2d(embedding)

In [None]:
# Same procedure as for the other layers, applied to the layer-9 data. The
# histone_* result lists are reset here, so earlier results are discarded.
# NOTE(review): histone_allccREIndex_list and histone_allccRESSIndex_list are
# initialised here but never filled in this cell.
histone_ccRESSIndex_list = []; histone_ccREIndex_list = []; histone_allccREIndex_list = []; histone_allccRESSIndex_list = [];
for data,bed in zip(dl_conv2_data_list,dl_bed_list):
    N = np.shape(data)[0]
    # presumably bedtools-intersect flags: report each `bed` entry once (u)
    # with 50% reciprocal overlap (f=0.5, r) — confirm against bedtools docs
    myintersect = bed.intersect(k562_ccREs, u=True, f=0.5, r=True)
    a = [x for x in bed]
    b = [y for y in myintersect]
    # c collects the positions in `bed` of intervals present in the
    # intersection; `val in b` scans the list b for every entry of a.
    c = []
    for ind,val in enumerate(a):
        if val in b:
            c.append(ind)
    # Request N - len(c) further rows on top of the initial subset c.
    ccRESubsetInds,ccREgains = selectSubset( data, n = ( N - len(c) ), distance = 'corr', initial_subset = c )
    histone_ccRESSIndex_list.append(ccRESubsetInds); histone_ccREIndex_list.append(c)

In [None]:
# Plot each layer-9 embedding with two highlighted groups: the first ss_size
# entries of the selection ranking ('Subset') and the ccRE-overlapping rows
# ('ccRE'). Marker size and alpha are looked up per label name.
ss_size = 50
size_dict = { 'Subset' : 32, 'All' : 4, 'ccRE': 32 }
alpha_dict = { 'Subset' : 1, 'All' : 0.125, 'ccRE': 1 }

# SSIndex holds the ccRE-overlap indices, ccRESSIndex the selection ranking;
# ix is not used in the loop body.
for SSIndex, embedding, ccRESSIndex, ix in zip (histone_ccREIndex_list,dl_conv2_embedding_list,histone_ccRESSIndex_list, range(len(histone_ccRESSIndex_list))) :
    num_labels, label_dict = makeGeneralAnnotation ( [ccRESSIndex[0:ss_size], SSIndex], 
        ['Subset', 'ccRE'], k = np.shape(embedding)[0] )
    plot_embedding_2d(embedding, num_labels, label_dict, s_dict = size_dict, a_dict = alpha_dict)

In [16]:
# Load every 'gkmexplain.coord.embeddings.*' file in the gkmexplain directory
# as a numeric matrix via np.loadtxt.
# NOTE(review): os.listdir() order is arbitrary, and gkm_data_list is later
# zipped positionally with dl_bed_list — confirm that the i-th gkm file really
# corresponds to the i-th BED file.
gkm_list = ['/data/locusselect/gkmexplain/'+x for x in os.listdir('/data/locusselect/gkmexplain/') if x.startswith('gkmexplain.coord.embeddings.')]
gkm_data_list = []
for i in gkm_list:
    gkm_data_list.append(np.loadtxt(i))

In [17]:
# Shape of the first loaded gkm matrix.
gkm_data_list[0].shape

(8001, 5120)

In [18]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
plt.hist(np.sum(np.abs(gkm_data_list[0], axis=0)))
plt.show()

TypeError: 'axis' is an invalid keyword to ufunc 'absolute'

In [None]:
# UMAP embeddings of each gkm matrix using the configured metric.
gkm_embedding_list = performEmbeddings(gkm_data_list,metric=distanceMeasure)

In [None]:
# Same procedure as for the deep-learning embeddings, applied to the gkm
# matrices. The histone_* result lists are reset here, so earlier results are
# discarded. Rows of each gkm matrix are matched to BED entries by position.
# NOTE(review): histone_allccREIndex_list and histone_allccRESSIndex_list are
# initialised here but never filled in this cell.
histone_ccRESSIndex_list = []; histone_ccREIndex_list = []; histone_allccREIndex_list = []; histone_allccRESSIndex_list = [];
for data,bed in zip(gkm_data_list,dl_bed_list):
    N = np.shape(data)[0]
    # presumably bedtools-intersect flags: report each `bed` entry once (u)
    # with 50% reciprocal overlap (f=0.5, r) — confirm against bedtools docs
    myintersect = bed.intersect(k562_ccREs, u=True, f=0.5, r=True)
    a = [x for x in bed]
    b = [y for y in myintersect]
    # c collects the positions in `bed` of intervals present in the
    # intersection; `val in b` scans the list b for every entry of a.
    c = []
    for ind,val in enumerate(a):
        if val in b:
            c.append(ind)
    # Request N - len(c) further rows on top of the initial subset c.
    ccRESubsetInds,ccREgains = selectSubset( data, n = ( N - len(c) ), distance = 'corr', initial_subset = c )
    histone_ccRESSIndex_list.append(ccRESubsetInds); histone_ccREIndex_list.append(c)

In [None]:
# Plot each gkm embedding with two highlighted groups: the first ss_size
# entries of the selection ranking ('Subset') and the ccRE-overlapping rows
# ('ccRE'). Marker size and alpha are looked up per label name.
ss_size = 50
size_dict = { 'Subset' : 32, 'All' : 4, 'ccRE': 32 }
alpha_dict = { 'Subset' : 1, 'All' : 0.125, 'ccRE': 1 }

# SSIndex holds the ccRE-overlap indices, ccRESSIndex the selection ranking;
# ix is not used in the loop body.
for SSIndex, embedding, ccRESSIndex, ix in zip (histone_ccREIndex_list,gkm_embedding_list,histone_ccRESSIndex_list, range(len(histone_ccRESSIndex_list))) :
    num_labels, label_dict = makeGeneralAnnotation ( [ccRESSIndex[0:ss_size], SSIndex], 
        ['Subset', 'ccRE'], k = np.shape(embedding)[0] )
    plot_embedding_2d(embedding, num_labels, label_dict, s_dict = size_dict, a_dict = alpha_dict)