# Discriminative Biclustering Algorithm 
Proposed by Odibat & Reddy, 2014 in **Efficient mining of discriminative co-clusters from gene
expression data**

In [2]:
%load_ext pycodestyle_magic

In [3]:
%matplotlib inline
import numpy as np
import math
from matplotlib import pyplot as plt
from sklearn.metrics import consensus_score

In [4]:
# !pip install pycodestyle
# !pip install pycodestyle_magic

### Definition 1 - Coherence Measure H

In [5]:
#%pycodestyle
class CoherenceMeasure(object):
    def __init__(self, data):
        self.data = data
        self.n, self.m = data.shape
        self.xiJ = np.mean(data, axis=1)
        self.xIj = np.mean(data, axis=0)
        self.xIJ = np.mean(data)
        self._H = None

    @property
    def H(self):
        if self._H is None:
            print("Computing coherence measure")
            self._H = self._compute_H()
            print("H value: " + str(self._H))
        return self._H
    
    def _compute_H(self):
        H = 0
        for i in range(self.n):
            for j in range(self.m):
                H += (self.data[i, j] - self.xIj[j] -
                      self.xiJ[i] + self.xIJ)**2
        H *= 1.0/math.fabs(self.m*self.n)
        H = 1 - H
        return H

#### Loading test data for Coherence Measure

In [6]:
import random
data = np.random.random((50, 50))
print(data)

[[ 0.99292617  0.46054783  0.45334732 ...,  0.54637177  0.34975704
   0.03320745]
 [ 0.4967808   0.92398397  0.850448   ...,  0.8214379   0.99155833
   0.44774937]
 [ 0.60443283  0.04417372  0.73247088 ...,  0.66486059  0.44419794
   0.35091846]
 ..., 
 [ 0.83547912  0.1982444   0.91010615 ...,  0.63991897  0.3788043
   0.3138958 ]
 [ 0.36928451  0.08344092  0.87638898 ...,  0.9969633   0.40814035
   0.36725078]
 [ 0.48100482  0.85805674  0.31450355 ...,  0.10528504  0.85085786
   0.72156388]]


In [7]:
# Testing Coherence
coherence_measure = CoherenceMeasure(data)
print("H = " + str(coherence_measure.H))

Computing coherence measure
H value: 0.920836263328
H = 0.920836263328


### Definition 2 - Positive and negative correlations

In [8]:
# input: rows x and y and J columns
# output: positive and negative correlations


class PositiveNegativeCorrelation(object):
    def __init__(self, x, y, J):
        self._x = x
        self._y = y
        self._J = J
        self._x_mean = np.mean(x)
        self._y_mean = np.mean(y)
        self._H_pos = None
        self._H_neg = None

    @property
    def H_pos(self):
        if self._H_pos is None:
            # print("Computing H positive...")
            self._H_pos = self._compute_H_pos()
            # print("H positive value: " + str(self._H_pos))
        return self._H_pos

    @property
    def H_neg(self):
        if self._H_neg is None:
            # print("Computing H negative...")
            self._H_neg = self._compute_H_neg()
            # print("H negative value: " + str(self._H_neg))
        return self._H_neg

    def _compute_H_pos(self):
        H_pos = 0
        for j in range(self._J):
            aux = (((self._x[j] - self._x_mean) -
                    (self._y[j] - self._y_mean))/2.0)**2
            H_pos += aux
        H_pos *= 1.0/math.fabs(self._J)
        H_pos = 1 - H_pos
        return H_pos

    def _compute_H_neg(self):
        H_neg = 0
        for j in range(self._J):
            aux = (((self._x[j] - self._x_mean) +
                    (self._y[j] - self._y_mean))/2.0)**2
            H_neg += aux
        H_neg *= 1.0/math.fabs(self._J)
        H_neg = 1 - H_neg
        return H_neg

#### Loading test data for positive and negative correlation

In [9]:
x = np.random.random((5))
y = np.random.random((5))
J = 5
print("Row x " + str(x))
print("Row y " + str(y))
print("J value " + str(J))

Row x [ 0.35116499  0.61168243  0.54373779  0.24584837  0.53652892]
Row y [ 0.44920898  0.25643547  0.79416032  0.92575588  0.77734419]
J value 5


In [10]:
# Testing correlation
positive_negative_correlation = PositiveNegativeCorrelation(x,y,J)
print("H positive " + str(positive_negative_correlation.H_pos))
print()
print("H negative " + str(positive_negative_correlation.H_neg))

H positive 0.972413371625

H negative 0.987450189092


### Definition 3 - Pair-based coherence

In [42]:
#%%pycodestyle

# input: co-cluster X of I rows and J columns
# output: paired-based coherence


class PairBasedCoherence(object):
    def __init__(self, X):
        self._X = X
        self._I, self._J = X.shape
        self._HP = None

    @property
    def HP(self):
        if self._HP is None:
            print("Calculating Pair based coherence..")
            self._HP = self._compute_HP()
            print("Paired based coherence value: " + str(self._HP))
        return self._HP

    def _compute_HP(self):
        HP = 0
        for i in range(self._I):
            for j in range(i+1, self._I):
                if (i==j): 
                    break
                x = self._X[i]
                y = self._X[j]
                correlation = PositiveNegativeCorrelation(x, y,self._J)
                H0 = correlation.H_pos + correlation.H_neg
                # H0 = max(correlation.H_pos,correlation.H_neg)
                HP += H0
        HP *= math.fabs(2.0)/(math.fabs(self._I)*(math.fabs(self._I)-1))
        return HP

#### Loading test data for pair-based coherence

In [40]:
data = np.random.random((50, 50))
print(data)

[[ 0.76021792  0.03879298  0.04400045 ...,  0.95008424  0.2986603
   0.13339968]
 [ 0.36682332  0.84429745  0.58915712 ...,  0.62240116  0.0564276
   0.2538301 ]
 [ 0.62402841  0.71283671  0.66445431 ...,  0.35492918  0.11925989
   0.25919869]
 ..., 
 [ 0.8234815   0.25965969  0.08481996 ...,  0.19385219  0.41014706
   0.08233942]
 [ 0.16795225  0.65235304  0.25173443 ...,  0.96685978  0.11361738
   0.31329659]
 [ 0.12613518  0.42221524  0.31071427 ...,  0.41492516  0.958537
   0.11570195]]


In [41]:
pair_based_coherence = PairBasedCoherence(data)
print("H value " + str(pair_based_coherence.HP))

Calculating Pair based coherence..
Paired based coherence value: 0.96323564439
H value 0.96323564439


### Coherence for a new z in in X

In [14]:
%%latex
Define H for a new term 'z' in X
$$
H_{1}(I,J,X,z) =H_{0}(I,J,X) \cdot \frac{(I-1)}{(I+1)} + \frac{|2|}{(I)(I+1)} \cdot \sum_{x \epsilon X} {h(x,z,J)}
$$

<IPython.core.display.Latex object>

### RAPOOC

This algorithm is proposed to efficiently extract the most coherent and large co-clusters that area arbitrarily positioned in the data matrix.



#### Algorithm 1 RAPOOC (D,k,l,K)
Input: Data matrix D, number of row clusters (k), number of column clusters (l), number of optimized co-clusters (K)

Output: A set of K co-clusters({X})

In [22]:
%%pycodestyle


class Rapooc(object):
    def __init__(self, D, k, l, K):
        self._D = D
        self._k = k
        self._l = l
        self._K = K
        self._rho = np.ones(D.shape[0])
        self._gamma = np.ones(D.shape[1])

    def initialize(self):
        i = 1
        j = 1
        while (i < k or j < l):
            if i < k:
                i += 1
                alpha = self._argmin_H_(self._rho, self._gamma)
                self._bisect_partitions_(self._D[np.where(self._rho==alpha)],np.where(self._rho==alpha),i)
            
            if j < l:
                j += 1
                beta = self._argmin_H(self._rho,self._gaamma)
                beta._bie
                

    def _argmin_H_(self, row_co_cluster, col_co_cluster,option='row'):
        if (option=='row'):
            data = self._D[np.where(row_co_cluster == i)]
        else:
            data = self._D[np.where(row_co_cluster == i)].T
        h_min = math.inf
        min_cocluster = 1
        for i in range(1, np.max(row_co_cluster)):
            coherence = PairBasedCoherence(
                data).HP # self._D[np.where(row_co_cluster == i)]
            if (coherence <= h_min):
                h_min = coherence
                min_cocluster = i
        return i

    def _bisect_row_partitions_ (data,indices,clusterID):
        clusterer = BisectingClusterer(data)
        

In [88]:
# https://github.com/munikarmanish/kmeans/blob/master/kmeans.py


class BisectingClusterer(object):
    def __init__(self, data):
        if data is not None:
            self._data = np.array(data)
            self._I, self._J = self._data.shape
        else:
            print("Empty data")
    
    @property
    def centroids(self):
        return self._centroids

    def fit(self):
        self._centroids = self._compute_centroids_()
        
        bisecting_indices = self._bisect_clusters_(self._centroids)
        return bisecting_indices

    def _compute_centroids_(self):
        max_correlation = 0
        centroids = [0,0]
        for i in range(self._I):
            for j in range(i+1, self._I):
                if (i == j):
                    break
                correlation = PositiveNegativeCorrelation(self._data[i],
                                                          self._data[j],
                                                          self._J).H_neg
                if(correlation > max_correlation):
                    max_correlation = correlation
                    centroids[0] = i
                    centroids[1] = j
        return centroids

    def _bisect_clusters_(self, centroids):
        cluster_indices = np.zeros(self._I)
        for i in range(self._I):
            correlation0 = PositiveNegativeCorrelation(
                self._data[centroids[0]], self._data[i],self._J).H_pos
            correlation1 = PositiveNegativeCorrelation(
                self._data[centroids[1]], self._data[i],self._J).H_pos
            if(correlation0 <= correlation1):
                cluster_indices[i] = 1
        return cluster_indices

In [93]:
data = np.random.random((50, 50))
clusterer = BisectingClusterer(data)
clusterer.fit()

array([ 1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,
        1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.])

In [101]:
class Cl1:
    def __init__(self, arr):
        self.arr = arr
    def move(self):
        self.arr[0] = 4

In [102]:
test = np.array([1,2,3,4])
cl1 = Cl1(test)
cl1.move()
test

array([4, 2, 3, 4])