# TS-RAPOOC

### Based on RAPOOC algorithm proposed by Omar Odibat (2014)

- Step 1: Devide rows into two separate parts
- Step 2: Decide which part will be devided among all generated clusters
- Step 3: Devide the chosen cluster in columns or rows, the one that has the highest coherence
- Step 4: Check the number of clusters generated this way, if the number of clusters is reached, break, else return to step 2

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import glob
import multiprocessing
from operator import attrgetter

from metrics import *
from clusterer import *

In [2]:
%%latex
Define H for a new term 'z' in X
$$
H_{1}(I,J,X,z) =H_{0}(I,J,X) \cdot \frac{(I-1)}{(I+1)} + \frac{|2|}{(I)(I+1)} \cdot \sum_{x \epsilon X} {h(x,z,J)}
$$

<IPython.core.display.Latex object>

In [27]:
class ClusterMaps(object):
    def __init__(self, rho=None, gamma=None, H = 0):
        self.gamma = gamma
        self.rho = rho
        self.H = H
    
    def __str__(self):
        return ("rho "+ str(self.rho) + " gamma " + str(self.gamma) + " H " + str(self.H))

In [83]:
class TsRapooc(object):
    def __init__(self, data, n_clusters = 3):
        self._D = np.array(data)
        self._n_clusters = n_clusters
        self._labels = list()
        self._I, self._J = data.shape
        
    @property
    def n_clusters(self):
        return self._n_clusters
    
    @property
    def data(self):
        return self._D
    
    @property
    def labels(self):
        return _labels
    
    def fit(self):
        n_iteration = 0
        n_clusters = 0
        for n_iteration in range(0, self._n_clusters):
            if n_iteration == 0:
                initial_partition = BisectingClusterer(data).fit_rows()
                self._labels.append(ClusterMaps(np.where(initial_partition==0)[0],
                                   np.arange(self._J),
                                    PairBasedCoherence(self._D[np.where(initial_partition==0)[0]]).HP))
         
                
                self._labels.append(ClusterMaps(np.where(initial_partition==1)[0],
                                   np.arange(self._J),
                                   PairBasedCoherence(self._D[np.where(initial_partition==1)[0]]).HP))
            else:
                self._partition()
        
    def _partition(self):
        least_coherent = self._find_least_coherent()
        lowest_cluster = self._D[np.ix_(least_coherent.rho,least_coherent.gamma)]
        print(lowest_cluster.shape)
        self._bisect_bicluster(self._D)
    
    def _find_least_coherent(self):
        return self._labels[self._labels.index(min(self._labels,key=attrgetter('H')))]
    
    def _bisect_bicluster(self,cluster):
        #which of the biclusters maximize coherence
        print(str(BisectingClusterer(cluster).fit_rows()) + " rows")
        
        print(str(BisectingClusterer(cluster).fit_cols()) + " cols")

In [84]:
tsrapooc = TsRapooc(data)
tsrapooc.fit()

(56, 11)
[ 1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.
  0.  0.  1.  0.  1.  0.  1.  1.  1.  1.  1.  0.  0.  1.  1.  0.  1.  0.
  1.  0.  0.  1.  1.  0.  0.  1.  0.  0.  1.  0.  1.  1.  1.  1.  1.  1.
  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  1.  1.
  0.  1.  1.  1.  0.  0.  1.  1.  1.  0.  1.  0.  0.  1.  1.  1.  0.  0.
  0.  0.  1.  1.  1.  0.  0.  0.  1.  0.] rows
[ 0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.] cols
(56, 11)
[ 1.  1.  1.  1.  1.  1.  1.  0.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.
  0.  0.  1.  0.  1.  0.  1.  1.  1.  1.  1.  0.  0.  1.  1.  0.  1.  0.
  1.  0.  0.  1.  1.  0.  0.  1.  0.  0.  1.  0.  1.  1.  1.  1.  1.  1.
  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  1.  1.
  0.  1.  1.  1.  0.  0.  1.  1.  1.  0.  1.  0.  0.  1.  1.  1.  0.  0.
  0.  0.  1.  1.  1.  0.  0.  0.  1.  0.] rows
[ 0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.] cols


In [62]:
data = pd.read_csv('TestData/SimulatedDataCoherence/BiclusterA_high.csv',header=None)
# data = (data - 0.5) * 2.0
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.004606,-0.502995,-0.000858,0.499152,0.077278,-0.02756,-0.012346,0.033864,-0.050533,0.079913,-0.050834
std,0.582317,0.029581,0.028747,0.026989,0.525907,0.60042,0.532019,0.583608,0.606005,0.543037,0.530658
min,-0.99975,-0.54965,-0.048164,0.45269,-0.93523,-0.98952,-0.99183,-0.94698,-0.99686,-0.98532,-0.99957
25%,-0.456452,-0.529775,-0.02429,0.479252,-0.3458,-0.579965,-0.415975,-0.44318,-0.626035,-0.348007,-0.50207
50%,-0.117485,-0.501615,0.001816,0.49579,0.089863,0.057297,-0.004626,0.081896,-0.094353,0.182815,-0.081193
75%,0.444335,-0.476805,0.022487,0.521288,0.412845,0.415545,0.44556,0.561928,0.548183,0.566547,0.434685
max,0.9981,-0.45029,0.048232,0.54954,0.99743,0.98323,0.99231,0.97107,0.98155,0.94106,0.91897


In [72]:
bisect = BisectingClusterer(data)

In [73]:
bisect.fit()

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,
        1.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,
        0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.])

In [74]:
bisect.fit_cols()

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.])

rho [ 0  1  2  3  4  5  6  7  8  9 10] gamma (array([], dtype=int64),) H 0
