In [1]:
%matplotlib inline
import numpy as np
import math
from matplotlib import pyplot as plt
from sklearn.metrics import consensus_score
import multiprocessing as mp
import pandas as pd
import glob as glob
from functools import reduce

In [2]:
class PositiveNegativeCorrelation(object):
    def __init__(self, x, y, J):
        self._x = x
        self._y = y
        self._J = J
        self._x_mean = np.mean(x)
        self._y_mean = np.mean(y)
        self._H_pos = None
        self._H_neg = None

    @property
    def H_pos(self):
        if self._H_pos is None:
            # print("Computing H positive...")
            self._H_pos = self._compute_H_pos()
            # print("H positive value: " + str(self._H_pos))
        return self._H_pos

    @property
    def H_neg(self):
        if self._H_neg is None:
            # print("Computing H negative...")
            self._H_neg = self._compute_H_neg()
            # print("H negative value: " + str(self._H_neg))
        return self._H_neg

    def _compute_H_pos(self):
        H_pos = 0
        for j in range(self._J):
            aux = (((self._x[j] - self._x_mean) -
                    (self._y[j] - self._y_mean))/2.0)**2
            H_pos += aux
        H_pos *= 1.0/math.fabs(self._J)
        H_pos = 1 - H_pos
        return H_pos

    def _compute_H_neg(self):
        H_neg = 0
        for j in range(self._J):
            aux = (((self._x[j] - self._x_mean) +
                    (self._y[j] - self._y_mean))/2.0)**2
            H_neg += aux
        H_neg *= 1.0/math.fabs(self._J)
        H_neg = 1 - H_neg
        return H_neg

In [3]:
class PairBasedCoherence(object):
    def __init__(self, X):
        self._X = np.array(X)
        self._I, self._J = X.shape
        self._HP = None

    @property
    def HP(self):
        if self._HP is None:
            # print("Calculating Pair based coherence..")
            self._HP = self._compute_HP_()
            # print("Paired based coherence value: " + str(self._HP))
        return self._HP

    def _compute_HP_(self):
        HP = 0
        for i in range(self._I):
            for j in range(i+1, self._I):
                if (i==j): 
                    break
                x = self._X[i]
                y = self._X[j]
                correlation = PositiveNegativeCorrelation(x, y,self._J)
                H0 = correlation.H_pos
                # H0 = max(correlation.H_pos,correlation.H_neg)
                HP += H0
        HP *= math.fabs(2.0)/(math.fabs(self._I)*(math.fabs(self._I)-1)) if self._I > 1 else 0
        
        return HP

In [4]:
%%latex
Define H for a new term 'z' in X
$$
H_{1}(I,J,X,z) =H_{0}(I,J,X) \cdot \frac{(I-1)}{(I+1)} + \frac{|2|}{(I)(I+1)} \cdot \sum_{x \epsilon X} {h(x,z,J)}
$$

<IPython.core.display.Latex object>

In [5]:
data = pd.read_csv('TestData/SimulatedDataCoherence/RandData.csv',header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.029469,0.45006,0.20791,0.052621,0.57575,0.42221,0.59412
1,0.35791,0.62144,0.57962,0.67985,0.12174,0.76694,0.27906
2,0.44764,0.18441,0.8365,1.0,0.33869,0.96056,0.56005
3,0.78175,0.96066,0.040275,0.15462,0.54363,0.70729,0.62836
4,0.3228,0.42936,0.20472,0.65046,0.37645,0.76164,0.52247


In [6]:

class BisectingClusterer(object):
    def __init__(self, data):
        if data is not None:
            self._data = np.array(data)
            self._I, self._J = self._data.shape
        else:
            print("Empty data")
    
    @property
    def centroids(self):
        return self._centroids

    def fit(self):
        self._centroids = self._compute_centroids_()
        bisecting_indices = self._bisect_clusters_(self._centroids)
        return bisecting_indices
    
    def _compute_centroids_(self):
        max_correlation = 0
        centroids = [0,0]
        for i in range(self._I):
            for j in range(i+1, self._I):
                if (i == j):
                    break
                correlation = PositiveNegativeCorrelation(self._data[i],
                                                          self._data[j],
                                                          self._J).H_neg
                if(correlation > max_correlation):
                    max_correlation = correlation
                    centroids[0] = i
                    centroids[1] = j
        return centroids

    def _bisect_clusters_(self, centroids):
        cluster_indices = np.zeros(self._I)
        for i in range(self._I):
            correlation0 = PositiveNegativeCorrelation(
                self._data[centroids[0]], self._data[i],self._J).H_pos
            correlation1 = PositiveNegativeCorrelation(
                self._data[centroids[1]], self._data[i],self._J).H_pos
            if(correlation0 <= correlation1):
                cluster_indices[i] = 1
        return cluster_indices

In [96]:
class Rapooc_test(object):
    def __init__(self, D, k, l, K):
        self._D = np.array(D)
        assert k>0 and l >0, "invalid values, k>0 and l>0"
        self._k = k
        self._l = l
        assert K <= k*l and k>=1, "invalid values, 1<= K <= k*l"
        self._K = K
        self._rho = np.ones(D.shape[0])
        self._gamma = np.ones(D.shape[1])
        self._M, self._N = self._D.shape
        self._rows_h_score = None
        self._cols_h_score = None
            
    @property
    def rho(self):
        return self._rho
    
    @property
    def gamma(self):
        return self._gamma
    
    @property
    def bicluster_h_values(self):
        assert not self._cluster_H_values is None, "H values have not yet been calculated"
        return self._cluster_H_values

    def initialize(self):
        i = 1
        j = 1
        while (i < self._k or j < self._l):
            if i < self._k:
                i += 1
                alpha = self._argmin_H_(self._rho, self._gamma,'row')
                self._bisect_partitions_(self._D[np.where(self._rho==alpha)], self._rho, alpha, i,'row')
            if j < self._l:
                j += 1
                beta = self._argmin_H_(self._rho,self._gamma, 'column')
                self._bisect_partitions_((self._D.T)[np.where(self._gamma==beta)], self._gamma, beta, j,'column')   

    def _argmin_H_(self, row_co_cluster, col_co_cluster,option='row'):
        if (option=='row'):
            data = self._D
        else:
            data = self._D.T
        h_min = math.inf
        min_cocluster = 1
        map_array = np.int64(row_co_cluster if option == 'row' else col_co_cluster)
        max_index_in_map = np.max(map_array)
        for i in range(1,max_index_in_map):
            if (list(map_array).count(i) < 2):
                pass
            else:
                coherence = PairBasedCoherence(
                    data[np.where((row_co_cluster if option == 'row' else col_co_cluster) == i)]).HP 
                if (coherence <= h_min):
                    h_min = coherence
                    min_cocluster = i
        return min_cocluster

    def _bisect_partitions_ (self, data, mapping_array, cluster_to_replace, new_cluster_index,option='row'):
        clusterer = BisectingClusterer(data)
        bisected_map = clusterer.fit()
        bisected_map_index = 0
        for i in range(0,len(mapping_array)):
            if ((self._rho if option == 'row' else self._gamma)[i] == cluster_to_replace):
                if (bisected_map[bisected_map_index] == 1.0):
                    (self._rho if option == 'row' else self._gamma)[i] = new_cluster_index
                bisected_map_index += 1
    
    def _compute_objective_function_(self):
        
        self._cluster_H_values = np.array([ [0.0] * int(max(self._rho) + 1)  
                                           for _ in range(int(max(self._gamma) + 1))])
    
        for i in range(1,int(max(self._rho) + 1)):
            for j in range (1, int(max(self._gamma) + 1)):
                coherence = PairBasedCoherence(self._D[np.ix_(np.where(self._rho == i)[0],
                                                              np.where(self._gamma == j)[0])]).HP
                self._cluster_H_values[i][j] = coherence
        
        h_values_1d = self._cluster_H_values.flatten()
        
        idx_1d = h_values_1d.argsort()[-self._K:]
        
        x_idx, y_idx = np.unravel_index(idx_1d, self._cluster_H_values.shape)
        
        acc=0
        for x, y, in zip(x_idx, y_idx):
            acc += self._cluster_H_values[x][y]
            
        return acc/self._K
    
    def core_co_clustering(self):
        
        n_iterations = 0
        n_repetitions = 0
        of_last_value = 0
        
        while(n_iterations < 20 and n_repetitions < 5):
            
            objective_function = self._compute_objective_function_() 
     
            self._update_maps_(option = 'rows')
            self._update_maps_(option = 'columns')
            
            # break conditions
            if (math.fabs(1 - objective_function) <0.00005):
                break
            
            if(math.fabs(objective_function - of_last_value) > 0.001):
                of_last_value = objective_function
                n_repetitions = 0
            else:
                n_repetitions += 1
                
            n_iterations += 1
        
        print("Finishing core-coclustering")
        print("Number of iterations " + str(n_iterations))
        print("Objective function " + str(math.fabs(objective_function)))
        print("Number of repetitions " + str(n_repetitions))
    
    def _update_maps_(self,option = 'rows'):
        
        if(option == 'rows'):
            mapping_array = self._rho
            data = self._D
        elif (option == 'columns'):
            mapping_array = self._gamma
            data = self._D.T
            
        for i in range(0,len(data)):
            self._compute_arg_max_(data,mapping_array,i)
    
    def _compute_arg_max_(self,data,mapping_array,index):
        
        actual_coherence = 0
        previous_cluster = mapping_array[index]
        if (list(mapping_array).count(previous_cluster) <= 2):
            return
        prev_coherence = PairBasedCoherence(data[np.where(mapping_array == previous_cluster)]).HP
        
        mapping_array[index] = 0
        
        local_coherence = PairBasedCoherence(data[np.where(mapping_array == previous_cluster)]).HP
        
        if (prev_coherence >= local_coherence):
            mapping_array[index] = previous_cluster
            actual_coherence = prev_coherence
        else:
            actual_coherence = local_coherence
            
        best_cluster,best_coherence = self._iter_maps_(data, mapping_array, index, previous_cluster)
        
        if (best_coherence > local_coherence):
            mapping_array[index] = best_cluster
    
    def _iter_maps_(self, data, mapping_array, index, previous_cluster):
        
        best_coherence = 0
        best_cluster = 0
        
        for actual_cluster in range(1,int(max(mapping_array))):
            if actual_cluster == previous_cluster:
                pass
            else:
                mapping_array[index] = actual_cluster
                coherence = PairBasedCoherence(data[np.where(mapping_array == actual_cluster)]).HP
                if(coherence >= best_coherence):
                    best_coherence = coherence
                    best_cluster = actual_cluster
                    
        return best_cluster, best_coherence

In [97]:
rapooc = Rapooc_test(data,2,2,4)

In [None]:
rapooc.initialize()

In [None]:
rapooc.core_co_clustering()

In [94]:
rapooc.gamma

array([ 1.,  1.,  1.,  0.,  2.,  1.,  2.])

In [95]:
rapooc.rho

array([ 1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,
        0.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  2.,  2.])

In [49]:
rapooc._compute_objective_function_()
    

0.97521471893999401

In [53]:
len(data.T[:])

7

In [58]:
a = ([0,1,2,3,4])
f = a[0]
a[0] = 300
print (f)

0
