# TS-RAPOOC

### Based on RAPOOC algorithm proposed by Omar Odibat (2014)

- Step 1: Divide rows into two separate parts
- Step 2: Decide which part will be devided among all generated clusters
- Step 3: Divide the chosen cluster in columns or rows, the one that has the highest coherence
- Step 4: Check the number of clusters generated this way, if the number of clusters is reached, break, else return to step 2

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import glob
import multiprocessing
from operator import attrgetter

from metrics import *
from clusterer import *

In [2]:
%%latex
Define H for a new term 'z' in X
$$
H_{1}(I,J,X,z) =H_{0}(I,J,X) \cdot \frac{(I-1)}{(I+1)} + \frac{|2|}{(I)(I+1)} \cdot \sum_{x \epsilon X} {h(x,z,J)}
$$

<IPython.core.display.Latex object>

In [178]:
class ClusterMaps(object):
    def __init__(self, rho=None, gamma=None, H = 0):
        self.gamma = gamma
        self.rho = rho
        self.H = H
    
    def __str__(self):
        return ("rho: \n"+ str(self.rho) + "\n gamma: \n" + str(self.gamma) + "\n H: \n " + str(self.H))

In [189]:
class TsRapooc(object):
    def __init__(self, data, n_clusters = 5):
        self._D = np.array(data)
        self._n_clusters = n_clusters
        self._labels = list()
        self._I, self._J = data.shape
        
    @property
    def n_clusters(self):
        return self._n_clusters
    
    @property
    def data(self):
        return self._D
    
    @property
    def labels(self):
        return _labels
    
    def fit(self):
        n_iteration = 0
        n_clusters = 0
        for n_iteration in range(0, self._n_clusters):
            if n_iteration == 0:
                initial_partition = BisectingClusterer(data).fit_rows()
                self._labels.append(ClusterMaps(np.where(initial_partition==0)[0],
                                   np.arange(self._J),
                                    PairBasedCoherence(self._D[np.where(initial_partition==0)[0]]).HP))
         
                
                self._labels.append(ClusterMaps(np.where(initial_partition==1)[0],
                                   np.arange(self._J),
                                   PairBasedCoherence(self._D[np.where(initial_partition==1)[0]]).HP))
            else:
                self._partition()
        
    def _partition(self):
        least_coherent, least_coherent_index = self._find_least_coherent()
        lowest_cluster = self._D[np.ix_(least_coherent.rho,least_coherent.gamma)]
        self._bisect_bicluster(lowest_cluster, least_coherent,least_coherent_index)
    
    def _find_least_coherent(self):
        return self._labels[self._labels.index(min(self._labels,key=attrgetter('H')))], self._labels.index(min(self._labels,key=attrgetter('H')))
    
    def _bisect_bicluster(self,cluster, cluster_node, node_index):
        #which of the biclusters maximize coherence
        rows_map = BisectingClusterer(cluster).fit_rows()
        cols_map = BisectingClusterer(cluster).fit_cols()
        
        flg_alls= 'none'
        
        if(np.all(cols_map) or np.all(rows_map)):
            if(np.all(rows_map)):
                flg_alls = 'cols'
            else:
                flg_alls = 'rows'
        
        if(flg_alls == 'none'):
            
            # find the coherences, the ones which raises the average coherences are the ones that are divided
            cols_coherence_1 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho,cluster_node.gamma[np.flatnonzero(cols_map == 0)])]).HP
            cols_coherence_2 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho,cluster_node.gamma[np.flatnonzero(cols_map == 1)])]).HP
            rows_coherence_1 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho[np.flatnonzero(rows_map == 0)],cluster_node.gamma)]).HP
            rows_coherence_2 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho[np.flatnonzero(rows_map == 1)],cluster_node.gamma)]).HP
            l = [cols_coherence_1, cols_coherence_2, rows_coherence_1, rows_coherence_2]
            min_value = min(l)
            if (min_value == cols_coherence_1 or min_value == cols_coherence_2):
                print('cols')
            elif (min_value == rows_coherence_1 or min_value == rows_coherence_2):
                print('rows_1')
        elif (flg_alls == 'cols'):
            print('cols')
            print('cols 0')
            cols_coherence_1 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho,cluster_node.gamma[np.flatnonzero(cols_map == 0)])]).HP
            cols_coherence_2 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho,cluster_node.gamma[np.flatnonzero(cols_map == 1)])]).HP
            col_cluster_1 = ClusterMaps(cluster_node.rho
                                        , cluster_node.gamma[np.flatnonzero(cols_map == 0)], rows_coherence_1)
            self._labels[node_index] = col_cluster_1
            col_cluster_2 = ClusterMaps(cluster_node.rho
                                        , cluster_node.gamma[np.flatnonzero(cols_map == 1)], rows_coherence_2)
            self._labels.append(col_cluster_2)
        else:
            print('####')
            print('rows')
            rows_coherence_1 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho[np.flatnonzero(rows_map == 0)],cluster_node.gamma)]).HP
            rows_coherence_2 = PairBasedCoherence(self._D[np.ix_(cluster_node.rho[np.flatnonzero(rows_map == 1)],cluster_node.gamma)]).HP
            row_cluster_1 = ClusterMaps(cluster_node.rho[np.flatnonzero(rows_map == 0)], cluster_node.gamma, rows_coherence_1)
            print(' rows cluster 1 ' + str(row_cluster_1))
            print('  ')
            self._labels[node_index] = row_cluster_1
            row_cluster_2 = ClusterMaps(cluster_node.rho[np.flatnonzero(rows_map == 1)], cluster_node.gamma, rows_coherence_2)
            print(' rows cluster 2 ' + str(row_cluster_2))
            self._labels.append(row_cluster_2)
            
            

In [190]:
tsrapooc = TsRapooc(data)
tsrapooc.fit()

####
rows
 rows cluster 1 rho: 
[ 0  2  3  4  8 11 13 14 15 25 27 28 31 34 43 49 54 56 58 67 71 73 78 79 80
 85 86 87 92]
 gamma: 
[ 0  1  2  3  4  5  6  7  8  9 10]
 H: 
 0.903326570295
  
 rows cluster 2 rho: 
[ 1  5  6 10 12 20 22 24 26 32 36 39 40 46 48 50 51 52 53 66 70 74 75 82 93
 94 98]
 gamma: 
[ 0  1  2  3  4  5  6  7  8  9 10]
 H: 
 0.910945988966
####
rows
 rows cluster 1 rho: 
[ 7 16 17 21 44 45 47 55 57 59 60 61 62 63 64 65 69 76 83 90 91]
 gamma: 
[ 0  1  2  3  4  5  6  7  8  9 10]
 H: 
 0.905298832907
  
 rows cluster 2 rho: 
[ 9 18 19 23 29 30 33 35 37 38 41 42 68 72 77 81 84 88 89 95 96 97 99]
 gamma: 
[ 0  1  2  3  4  5  6  7  8  9 10]
 H: 
 0.916506834441
####
rows
 rows cluster 1 rho: 
[11 14 28 31 34 54 56 67 71 73 87]
 gamma: 
[ 0  1  2  3  4  5  6  7  8  9 10]
 H: 
 0.905822063748
  
 rows cluster 2 rho: 
[ 0  2  3  4  8 13 15 25 27 43 49 58 78 79 80 85 86 92]
 gamma: 
[ 0  1  2  3  4  5  6  7  8  9 10]
 H: 
 0.916743476894
none
0.934438725161
0.870676754272
0.9

In [None]:
data = pd.read_csv('TestData/SimulatedDataCoherence/BiclusterA_high.csv',header=None)
# data = (data - 0.5) * 2.0
data.describe()

In [72]:
bisect = BisectingClusterer(data)

In [73]:
bisect.fit()

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,
        1.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,
        0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.])

In [74]:
bisect.fit_cols()

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.])

In [92]:
np.all([1,1,1,1])

True

rho [ 0  1  2  3  4  5  6  7  8  9 10] gamma (array([], dtype=int64),) H 0
