In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import glob
import multiprocessing

from operator import attrgetter
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

from metrics import *

In [2]:
# Load the simulated matrix (the CSV has no header row).
data = pd.read_csv(
    'TestData/SimulatedDataCoherence/BiclusterABC_high.csv',
    header=None,
)

# Min-max scale each column, then stretch the result with *2-1 so that the
# scaler's default [0, 1] output range becomes [-1, 1].
scaler = MinMaxScaler()
data = scaler.fit_transform(data) * 2 - 1

# Stack the scaled matrix on top of itself, doubling the number of rows.
data = np.vstack((data, data))

In [3]:
class Bicluster(object):
    def __init__(self, data, rho=None, gamma=None, H = -1):
        """
        Bicluster structure: a set of row indices and column indices
        selected from a data matrix, plus a cached coherence value.

        :param data: the matrix the bicluster is taken from; it is indexed
                     with rho and gamma when the coherence is computed
        :param rho: <np.array[int]> indices of the rows of the bicluster, e.g. [2,4,7,8]
        :param gamma: <np.array[int]> indices of the columns, same structure as rho
        :param H: <float> the H value of the bicluster; a negative value
                  means "not computed yet"
        """
        # Keep a reference to the source matrix so compute_H does not have to
        # rely on a notebook-level global called `data`.
        self.data = data
        self.gamma = gamma
        self.rho = rho
        self.H = H

    def __str__(self):
        return ("rho: \n"+ str(self.rho) + "\ngamma: \n" + str(self.gamma) + "\nH: \n " + str(self.H))

    def row_length(self):
        """Return the number of rows in the bicluster."""
        return len(self.rho)

    def col_length(self):
        """Return the number of columns in the bicluster."""
        return len(self.gamma)

    def _submatrix(self):
        """Return the part of the source matrix selected by rho and gamma."""
        matrix = np.asarray(self.data)
        # A missing index map is read as "every row" / "every column".
        rows = np.arange(matrix.shape[0]) if self.rho is None else self.rho
        cols = np.arange(matrix.shape[1]) if self.gamma is None else self.gamma
        return matrix[np.ix_(rows, cols)]

    def compute_H(self):
        """
        Return the H value of the bicluster, computing it on first use.

        The value is taken from the HP attribute of PairBasedCoherence
        evaluated on this bicluster's own sub-matrix, and is cached in
        self.H so later calls return it directly.
        NOTE(review): PairBasedCoherence is not defined in this notebook;
        presumably it comes from `metrics` - confirm it accepts a 2-D array.
        """
        if self.H < 0:
            self.H = PairBasedCoherence(self._submatrix()).HP
        return self.H
    

In [4]:
class MapArray(object):
    def __init__(self, map_array, flg_direction):
        """
        Index map together with a flag saying whether it maps rows or columns.

        :param map_array: the array of mapped indices
        :param flg_direction: marker telling whether map_array refers to rows or to columns
        """
        self._flg_direction = flg_direction
        self._map_array = map_array

    # Read-only views on the two private fields (no setters are defined).
    map_array = property(attrgetter('_map_array'))
    flg_direction = property(attrgetter('_flg_direction'))

In [None]:
def split_cols(data, bicluster):
    """
    Split the bicluster according to the best split of its columns.

    NOTE(review): this is a stub - the function has no body beyond this
    docstring, so calling it returns None. TODO: implement the split.

    :param data: <np.array[np.array]> data of the bicluster to split
    :param bicluster: <Bicluster> presumably the bicluster to split - the
                      original note only said "data"; confirm
    """
    

In [None]:
def split_rows(data=None, bicluster=None):
    """
    Split the bicluster according to the best split of its rows.

    Not implemented yet: the original cell held only an unfinished ``def``
    line, which was a syntax error. The optional parameters mirror
    split_cols - TODO confirm the intended signature.

    :param data: <np.array[np.array]> data of the bicluster to split
    :param bicluster: <Bicluster> presumably the bicluster to split - confirm
    :raises NotImplementedError: always, until the row split is written
    """
    raise NotImplementedError("split_rows is not implemented yet")

In [5]:
def compute_objective_function(bicluster):
    """
    Compute the objective function for a bicluster.

    Not implemented yet: the original cell had a ``def`` line with an empty
    body, which does not compile. TODO: define the objective.

    :param bicluster: <Bicluster> presumably the bicluster to evaluate - confirm
    :raises NotImplementedError: always, until the objective is written
    """
    raise NotImplementedError("compute_objective_function is not implemented yet")

In [10]:
class STSSCAN(object):
    """
    Divisive biclustering: starts from one bicluster covering the whole
    matrix and repeatedly bisects one of the current biclusters.
    """

    # A bicluster is only considered for splitting when it has at least
    # log(number of data rows) * _MIN_ROWS_LOG_FACTOR rows and at least
    # _MIN_COLS columns.
    _MIN_ROWS_LOG_FACTOR = 5
    _MIN_COLS = 2

    def __init__(self, data, n_clusters = 2):
        """
        STSSCAN

        Applies biclustering over data

        :param data: <np.array> The data to apply biclusters
        :param n_clusters: <int> The number of biclusters to find

        """
        assert data is not None, "Empty data"
        # Data to apply biclustering on.
        self._data = data
        # Shape of the data: number of rows (I) and columns (J).
        self._I, self._J = self._data.shape
        # Number of biclusters to find.
        self._n_clusters = n_clusters
        # Biclusters found so far, as Bicluster instances.
        self._biclusters = list()
        # Objective function value; not updated anywhere in this class yet.
        self._objective_function = 0

    @property
    def n_clusters(self):
        """Requested number of biclusters."""
        return self._n_clusters

    @property
    def data(self):
        """The data matrix given to the constructor."""
        return self._data

    @property
    def labels(self):
        """The list of biclusters found so far (empty before fit)."""
        return self._biclusters

    def fit(self):
        """
        Fits the data on the algorithm.

        Starts from a single bicluster spanning every row and column, then
        keeps splitting until the requested number of biclusters is reached
        or no bicluster can be split any further.
        """
        # Restart from scratch so calling fit twice does not accumulate results.
        initial_bicluster = Bicluster(self._data, np.arange(self._I), np.arange(self._J))
        self._biclusters = [initial_bicluster]
        while len(self._biclusters) < self._n_clusters:
            if not self._split_biclusters():
                break

    def _split_biclusters(self):
        """
        Splits one bicluster: the one whose bisection yields the highest
        delta among all biclusters that are large enough to be split.

        NOTE(review): compute_bicluster_coherence is not defined in this
        notebook; presumably it comes from `metrics`, returns the delta and
        fills in the MapArray it is given - confirm.

        :return: True when a bicluster was split, False when none qualified
        """
        max_delta = 0
        best_bicluster = None
        best_position = 0
        best_bisection = None
        min_rows = np.log(self._I) * self._MIN_ROWS_LOG_FACTOR

        for position, bicluster in enumerate(self._biclusters):
            # Skip biclusters whose shape is below the bounds.
            if bicluster.row_length() < min_rows or bicluster.col_length() < self._MIN_COLS:
                continue

            bicluster_data = self._data[np.ix_(bicluster.rho, bicluster.gamma)]
            # Use a fresh bisection per candidate so the best one is not
            # overwritten while later candidates are evaluated.
            candidate_bisection = MapArray(None, None)
            candidate_delta = compute_bicluster_coherence(bicluster_data, candidate_bisection)

            # Remember the best split seen so far (ties go to the later one).
            if candidate_delta >= max_delta:
                max_delta = candidate_delta
                best_bicluster = bicluster
                best_position = position
                best_bisection = candidate_bisection

        if best_bicluster is None:
            return False

        self._add_bicluster(best_bicluster, best_position, best_bisection)
        return True

    def _add_bicluster(self, best_bicluster, best_position, bisection):
        """
        Replaces the split bicluster by its two halves in the list of biclusters.

        Entries of bisection.map_array equal to 0 form the half that stays
        at best_position; entries equal to 1 form the half that is appended.

        :param best_bicluster: <Bicluster> the bicluster being split
        :param best_position: <int> its index in the list of biclusters
        :param bisection: <MapArray> the split; flg_direction 'row' splits
                          rows, any other value splits columns
        """
        assignment = np.asarray(bisection.map_array)
        first_half = np.flatnonzero(assignment == 0)
        second_half = np.flatnonzero(assignment == 1)

        if bisection.flg_direction == 'row':
            # Split the rows, keep every column.
            kept = Bicluster(self._data, best_bicluster.rho[first_half], best_bicluster.gamma)
            added = Bicluster(self._data, best_bicluster.rho[second_half], best_bicluster.gamma)
        else:
            # Split the columns, keep every row.
            kept = Bicluster(self._data, best_bicluster.rho, best_bicluster.gamma[first_half])
            added = Bicluster(self._data, best_bicluster.rho, best_bicluster.gamma[second_half])

        self._biclusters[best_position] = kept
        self._biclusters.append(added)