# Discriminative Biclustering Algorithm 
Proposed by Odibat & Reddy, 2014 in **Efficient mining of discriminative co-clusters from gene
expression data**

In [2]:
%load_ext pycodestyle_magic

In [3]:
%matplotlib inline
import numpy as np
import math
from matplotlib import pyplot as plt
from sklearn.metrics import consensus_score

In [4]:
# !pip install pycodestyle
# !pip install pycodestyle_magic

### Definition 1 - Coherence Measure H

In [5]:
#%pycodestyle
class CoherenceMeasure(object):
    """Coherence measure H of a two-dimensional data matrix (Definition 1).

    H is ``1 - mean(residue ** 2)`` where the residue of entry ``(i, j)`` is
    ``data[i, j] - column_mean[j] - row_mean[i] + overall_mean``.

    Parameters
    ----------
    data : array-like
        Two-dimensional, non-empty numeric matrix.

    Attributes
    ----------
    data : numpy.ndarray
        The input matrix.
    n, m : int
        Number of rows and columns of ``data``.
    xiJ : numpy.ndarray
        Mean of every row (length ``n``).
    xIj : numpy.ndarray
        Mean of every column (length ``m``).
    xIJ : float
        Mean of the whole matrix.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional or contains no entries.
    """

    def __init__(self, data):
        data = np.asarray(data)
        # An empty matrix has no mean; fail here with a clear message rather
        # than with a division by zero when H is first requested.
        if data.ndim != 2 or data.size == 0:
            raise ValueError(
                "data must be a non-empty two-dimensional matrix")
        self.data = data
        self.n, self.m = data.shape
        self.xiJ = np.mean(data, axis=1)
        self.xIj = np.mean(data, axis=0)
        self.xIJ = np.mean(data)
        self._H = None  # cache, filled on first access to ``H``

    @property
    def H(self):
        """Coherence value; computed on first access, then cached."""
        if self._H is None:
            print("Computing coherence measure")
            self._H = self._compute_H()
            print("H value: " + str(self._H))
        return self._H

    def _compute_H(self):
        """Return one minus the mean squared residue of ``self.data``."""
        # Broadcasting subtracts the column means from every row and the row
        # means from every column in one vectorized expression.
        residue = (self.data - self.xIj[np.newaxis, :]
                   - self.xiJ[:, np.newaxis] + self.xIJ)
        return 1 - np.mean(residue ** 2)

#### Loading test data for Coherence Measure

In [6]:
# Test fixture: a 50x50 matrix filled by np.random.random.
# No seed is set, so the printed values differ between runs.
# NOTE(review): `random` is imported here but not used in this cell.
import random
data = np.random.random((50, 50))
print(data)

[[ 0.99292617  0.46054783  0.45334732 ...,  0.54637177  0.34975704
   0.03320745]
 [ 0.4967808   0.92398397  0.850448   ...,  0.8214379   0.99155833
   0.44774937]
 [ 0.60443283  0.04417372  0.73247088 ...,  0.66486059  0.44419794
   0.35091846]
 ..., 
 [ 0.83547912  0.1982444   0.91010615 ...,  0.63991897  0.3788043
   0.3138958 ]
 [ 0.36928451  0.08344092  0.87638898 ...,  0.9969633   0.40814035
   0.36725078]
 [ 0.48100482  0.85805674  0.31450355 ...,  0.10528504  0.85085786
   0.72156388]]


In [7]:
# Testing Coherence: build the measure for `data` and print its H value.
# The first access to `.H` computes the value and prints two progress lines.
coherence_measure = CoherenceMeasure(data)
print("H = " + str(coherence_measure.H))

Computing coherence measure
H value: 0.920836263328
H = 0.920836263328


### Definition 2 - Positive and negative correlations

In [8]:
# input: rows x and y and J columns
# output: positive and negative correlations


class PositiveNegativeCorrelation(object):
    """Positive and negative correlation of two rows (Definition 2).

    Both rows are centred on their own mean and compared column by column
    over their first ``J`` entries::

        H_pos = 1 - mean((((x - mean(x)) - (y - mean(y))) / 2) ** 2)
        H_neg = 1 - mean((((x - mean(x)) + (y - mean(y))) / 2) ** 2)

    The means are taken over the same first ``J`` entries that enter the
    sums, so the centring matches the compared window when ``J`` is smaller
    than the row length.

    Parameters
    ----------
    x, y : array-like
        One-dimensional numeric rows with at least ``J`` entries each.
    J : int
        Number of leading columns to compare; must be at least 1.

    Raises
    ------
    ValueError
        If ``J`` is smaller than 1 or larger than the length of ``x`` or
        ``y``.
    """

    def __init__(self, x, y, J):
        if J < 1:
            raise ValueError("J must be at least 1")
        x_window = np.asarray(x, dtype=float)[:J]
        y_window = np.asarray(y, dtype=float)[:J]
        # Slicing truncates silently, so check the requested width explicitly.
        if x_window.shape[0] < J or y_window.shape[0] < J:
            raise ValueError("x and y must each have at least J entries")
        self._x = x
        self._y = y
        self._J = J
        self._x_mean = x_window.mean()
        self._y_mean = y_window.mean()
        # Centred rows are shared by both correlation measures.
        self._dx = x_window - self._x_mean
        self._dy = y_window - self._y_mean
        self._H_pos = None
        self._H_neg = None

    @property
    def H_pos(self):
        """Positive correlation; computed on first access, then cached."""
        if self._H_pos is None:
            self._H_pos = self._compute_H_pos()
        return self._H_pos

    @property
    def H_neg(self):
        """Negative correlation; computed on first access, then cached."""
        if self._H_neg is None:
            self._H_neg = self._compute_H_neg()
        return self._H_neg

    def _compute_H_pos(self):
        """Return one minus the mean squared half-difference of the rows."""
        half_difference = (self._dx - self._dy) / 2.0
        return 1 - np.mean(half_difference ** 2)

    def _compute_H_neg(self):
        """Return one minus the mean squared half-sum of the rows."""
        half_sum = (self._dx + self._dy) / 2.0
        return 1 - np.mean(half_sum ** 2)

#### Loading test data for positive and negative correlation

In [9]:
# Test fixture: two random rows of length 5 and the number of columns J
# to compare. No seed is set, so the printed rows differ between runs.
x = np.random.random((5))
y = np.random.random((5))
J = 5
print("Row x " + str(x))
print("Row y " + str(y))
print("J value " + str(J))

Row x [ 0.35116499  0.61168243  0.54373779  0.24584837  0.53652892]
Row y [ 0.44920898  0.25643547  0.79416032  0.92575588  0.77734419]
J value 5


In [10]:
# Testing correlation: print the positive and negative correlation of the
# rows x and y over J columns, separated by a blank line.
positive_negative_correlation = PositiveNegativeCorrelation(x,y,J)
print("H positive " + str(positive_negative_correlation.H_pos))
print()
print("H negative " + str(positive_negative_correlation.H_neg))

H positive 0.972413371625

H negative 0.987450189092


### Definition 3 - Pair-based coherence

In [11]:
#%%pycodestyle

# input: co-cluster X of I rows and J columns
# output: pair-based coherence


class PairBasedCoherence(object):
    """Pair-based coherence of a co-cluster (Definition 3).

    For every unordered pair of rows of ``X`` the positive and negative
    correlations over all ``J`` columns are added together; the total is
    scaled by ``2 / (I * (I - 1))``, i.e. divided by the number of row pairs.

    ``X`` must expose a two-element ``shape`` and support row indexing.
    With fewer than two rows the scaling factor divides by zero and a
    ``ZeroDivisionError`` is raised.

    NOTE(review): each pair contributes ``H_pos + H_neg``; confirm this
    against the reference definition of the pair term h(x, y, J).
    """

    def __init__(self, X):
        self._X = X
        self._I, self._J = X.shape
        self._HP = None  # cache, filled on first access to ``HP``

    @property
    def HP(self):
        """Coherence value; computed and reported on first access only."""
        if self._HP is not None:
            return self._HP
        print("Calculating Pair based coherence..")
        self._HP = self._compute_HP()
        print("Paired based coherence value: " + str(self._HP))
        return self._HP

    def _compute_HP(self):
        """Sum the pair terms over all row pairs and normalise the total."""
        row_count = self._I
        total = 0
        for first in range(row_count):
            upper_row = self._X[first]
            for second in range(first + 1, row_count):
                pair = PositiveNegativeCorrelation(
                    upper_row, self._X[second], self._J)
                total += pair.H_pos + pair.H_neg
        rows_as_float = math.fabs(row_count)
        scale = math.fabs(2.0) / (rows_as_float * (rows_as_float - 1))
        return total * scale

#### Loading test data for pair-based coherence

In [12]:
# Fresh random 50x50 matrix for the pair-based coherence test.
# This rebinds `data`, replacing the matrix used in the earlier coherence test.
data = np.random.random((50, 50))
print(data)

[[ 0.94837915  0.4039216   0.13979349 ...,  0.60563395  0.35179364
   0.63483164]
 [ 0.2077369   0.32980782  0.9139554  ...,  0.86574175  0.44252234
   0.04993265]
 [ 0.44478214  0.82917338  0.65234929 ...,  0.53543989  0.79299901
   0.40062117]
 ..., 
 [ 0.49583671  0.6313128   0.49638936 ...,  0.00791874  0.10613412
   0.64595892]
 [ 0.16400634  0.26954169  0.48190763 ...,  0.58877044  0.79740665
   0.73805452]
 [ 0.97557131  0.76047919  0.26485685 ...,  0.41993746  0.09796634
   0.32608413]]


In [13]:
# Treat the whole matrix as one co-cluster and print its pair-based coherence.
# The first access to `.HP` computes the value and prints two progress lines.
pair_based_coherence = PairBasedCoherence(data)
print("H value " + str(pair_based_coherence.HP))

Calculating Pair based coherence..
Paired based coherence value: 1.91888311697
H value 1.91888311697


### Coherence for a new row z in X

In [14]:
%%latex
Define H for a new term 'z' in X
$$
H_{1}(I,J,X,z) = H_{0}(I,J,X) \cdot \frac{I-1}{I+1} + \frac{2}{I(I+1)} \cdot \sum_{x \in X} {h(x,z,J)}
$$

<IPython.core.display.Latex object>

### RAPOOC

This algorithm is proposed to efficiently extract the most coherent and large co-clusters that are arbitrarily positioned in the data matrix.



#### Algorithm 1 RAPOOC (D,k,l,K)
Input: Data matrix D, number of row clusters (k), number of column clusters (l), number of optimized co-clusters (K)

Output: A set of K co-clusters({X})

In [22]:
%%pycodestyle


class Rapooc(object):
    """Work-in-progress skeleton of Algorithm 1, RAPOOC(D, k, l, K).

    Parameters
    ----------
    D : numpy.ndarray
        Two-dimensional data matrix.
    k : int
        Number of row clusters.
    l : int
        Number of column clusters.
    K : int
        Number of optimized co-clusters.

    Only the search for the least coherent row cluster is implemented.
    The row and column bisection steps raise ``NotImplementedError``.
    """

    def __init__(self, D, k, l, K):
        self._D = D
        self._k = k
        self._l = l
        self._K = K
        # Cluster labels are integers; every row and column starts in
        # cluster 1.
        self._rho = np.ones(D.shape[0], dtype=int)
        self._gamma = np.ones(D.shape[1], dtype=int)

    def initialize(self):
        """Refine the partitions until k row and l column clusters exist.

        Raises
        ------
        NotImplementedError
            As soon as a row or column cluster has to be split, because the
            bisection steps are not implemented yet.
        """
        i = 1
        j = 1
        while i < self._k or j < self._l:
            if i < self._k:
                i += 1
                new_alpha = self._argmin_H_(self._rho, self._gamma)
                # Presumably the least coherent cluster is the one to split;
                # TODO confirm against the algorithm description.
                self._bisect_row_partitions_(new_alpha)
            if j < self._l:
                # Without this branch the column counter never advances and
                # the loop cannot terminate for l > 1.
                raise NotImplementedError(
                    "column partition refinement is not implemented")

    def _bisect_row_partitions_(self, cluster):
        """Split the row cluster labelled ``cluster``. Not implemented."""
        raise NotImplementedError(
            "row partition bisection is not implemented")

    def _argmin_H_(self, row_co_cluster, col_co_cluster):
        """Return the label of the row cluster with the lowest coherence.

        Parameters
        ----------
        row_co_cluster : array-like of int
            Cluster label of every row of ``D``.
        col_co_cluster : array-like of int
            Cluster label of every column; currently unused.

        Returns
        -------
        int
            Label with the smallest pair-based coherence. Ties go to the
            larger label. Returns 1 when no cluster has at least two rows.
        """
        labels = np.asarray(row_co_cluster)
        h_min = math.inf
        min_cocluster = 1
        # np.unique visits every label that actually occurs, in ascending
        # order, including the largest one.
        for label in np.unique(labels):
            rows = self._D[labels == label]
            # A cluster with fewer than two rows has no row pairs, so its
            # pair-based coherence is undefined.
            if rows.shape[0] < 2:
                continue
            coherence = PairBasedCoherence(rows).HP
            if coherence <= h_min:
                h_min = coherence
                min_cocluster = label
        return min_cocluster

In [69]:
# Scratch check on an integer label vector: print its largest label, then
# select the entries at positions 0 and 1 by indexing with a list.
a=np.array([1,1,1,1,2,2,2,2,3,3])
print(np.max(a))
a[[0,1]]

3


array([1, 1])

In [68]:
# Scratch check that math.inf is available and how it prints.
# NOTE(review): `math` is already imported in the first import cell.
import math
test = math.inf
print(test)

inf
