# Biclustering CC
Implementation proposed by Cheng & Church in Biclustering of Expression Data

In [10]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import consensus_score

In [6]:
%%latex
Define Mean Squeae Residue (MSR)$H(I,J)$:
$$H(I,J) = \frac{1}{|I||J|} \sum_{i \in I} \sum_{j \in J} (a_{ij} - a_{Ij} - a_{iJ} + a_{IJ})^2$$
where:
$$a_{iJ} = \frac{1}{|J|} \sum_{j \in J} a_{ij}$$
$$a_{Ij} = \frac{1}{|I|} \sum_{i \in I} a_{ij}$$
$$a_{IJ} = \frac{1}{|I||J|} \sum_{i \in I, j \in J} a_{ij}$$


<IPython.core.display.Latex object>

In [26]:
class MSR(object):
    def __init__(self,data):
        self.data=data
        self.n, self.m = data.shape
        self.aiJ = np.mean(data,axis=1)
        self.aIj = np.mean(data,axis=0)
        self.aIJ = np.mean(data)
        self._H = None
        self._HiJ = None
        self._HIj = None
    
    @property
    def H(self):
        if self._H is None:
            print ("computing MSR ...")
            self._H = self._compute_H()
            print ("MSR VALUE " + str(self._H))
        return self._H
        
    @property
    def HiJ(self):
        if self._HiJ is None:
            self._HiJ = self._compute_HiJ()
        return self._HiJ
    
    @property
    def HIj(self):
        if self._HIj is None:
            self._HIj = self._compute_HIj()
        return self._HIj
    
    def _compute_H(self):
        H = 0
        for i in range(self.n):
            for j in range(self.m):
                H  += (self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ ) ** 2
        H *= 1.0/(self.n + self.m)       
        return H
    
    def _compute_HiJ(self):
        HiJ = np.zeros(self.n)
        for i in range(self.n):
            for j in range(self.m):
                HiJ[i] += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ )**2
        HiJ *= 1.0/self.m
        return HiJ


    def _compute_HIj(self):
        HIj = np.zeros(self.m)
        for j in range(self.m):
            for i in range(self.n):
                HIj[j] += ( self.data[i,j] - self.aIj[j] - self.aiJ[i] + self.aIJ )**2
        HIj *= 1.0/self.n
        return HIj


        

In [31]:
import random
data  = np.random.random((5, 5))
data

array([[ 0.03852874,  0.36069209,  0.72373909,  0.04372628,  0.68043933],
       [ 0.12771973,  0.89955523,  0.10638015,  0.64355391,  0.05493306],
       [ 0.02884842,  0.32832336,  0.08935252,  0.62375808,  0.8367072 ],
       [ 0.91723187,  0.56400323,  0.98114488,  0.35462762,  0.61136659],
       [ 0.04844373,  0.75568786,  0.76464619,  0.80134422,  0.43680953]])

In [34]:
msr = MSR(data)
print ("aIJ= " + str(msr.aIJ))
print ("H= " + str(msr.H))
print ("aiJ= " + str(msr.aiJ))
print ("aIj= " + str(msr.aIj))

aIJ= 0.472862516555
computing MSR ...
MSR VALUE 0.18029325033
H= 0.18029325033
aiJ= [ 0.36942511  0.36642842  0.38139792  0.68567484  0.56138631]
aIj= [ 0.2321545   0.58165235  0.53305257  0.49340202  0.52405114]


In [35]:
def remove_unique_nodes(data, delta, I=None, J=None):
    it = 1
    
    if I is None:
        I = np.arange(len(data))
    
    if J is None:    
        J = np.arange(len(data[0]))
        
    while True:
        it += 1
        
        msr = MSR(data[I][:,J])
        
        if msr.H <delta:
            break
            
        if len(I) == 1 or len(J) == 1:
            break
        
        row_idx_to_remove = np.argmax(msr.HiJ)
        col_idx_to_remove = np.argmax(msr.HIj)
        
        if msr.HiJ[row_idx_to_remove] > msr.HIj[col_idx_to_remove]:
            print("removing row " + str(row_idx_to_remove))
            I = np.delete(I,row_idx_to_remove)
            
        else:
            pri
        
        
            
        
        
        
    