# Three-dimensional STSSCAN 

In [1]:
%matplotlib inline
%load_ext pycodestyle_magic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import glob
import multiprocessing
import copy

from operator import attrgetter
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

from metrics import *

In [2]:
class Bicluster(object):
    
    def __init__(
        self, 
        data, 
        rho=None,
        gamma=None, 
        bisected=False,
        H_max=None, 
        best_bisection=None
    ):
        
        """
        Bicluters structure
        
        
        :param rho: <np.array[int]> the map of the indices of rows of the bicluster i.e [2,4,7,8]
        :param gamma: <np.array[int]> the map of gammas, same structure f rho
        :param bisected: <boolean> indicates if the bicluster has been bisected recentky
        :param h_max: <float> best h from the last bisection
        :paran best_bisection: <np.array[int]> mapping array from the best bisection
        
        *Note: Noise will be represented as -1
        
        """
        
        self.gamma = gamma
        self.rho = rho
        self.bisected = bisected
        self.h_max = H_max
        self.bisected_direction = best_bisection.flg_direction if best_bisection is not None else None
        self.best_bisection = best_bisection
        self.data = data[np.ix_(self.rho, self.gamma)]

    def set_bisected(self):
        
        self.bisected = not self.bisected
    
    def __str__(self):
        
        return ("rho: \n"+ str(self.rho) + "\ngamma: \n" + str(self.gamma) + "\nH: \n " + str(self.H))
                
    def row_length(self):
        
        return len(self.rho>=0)
    
    def col_length(self):
        
        return len(self.gamma>=0)
    
    def set_fields(self, best_bisection, h_max, bisected_direction):
        
        """
        Method
        
        :param best_bisection: <np.array[int]> Mapping array from the best bisection 
        :param h_max: <float> Best H from the last bisection
        :bisected_direction: <String> Direction from the last bisection ('cols' or 'rows')
        """
        
        self.best_bisection = best_bisection
        self.h_max = h_max
        self.bisected_direction = bisected_direction
        final_map = best_bisection == 0 # filtering noise then zeros

        if(bisected_direction == 'cols'):
            
            self.gamma = self.gamma[final_map]
            print(self.gamma)
            
        else:
            
            self.rho = self.rho[final_map]
            
    def compute_H(self, data):
        
        """
        Method:
        Computes the coherence H of the data
        """
        
        self.data = data[np.ix_(self.rho, self.gamma)]
        self.H = three_dimensional_msr(self.data)
        
        return self.H

In [3]:
def split_cols(data, min_coherence):
    
    """
    Function:
    Spits the bicluster according to the best split in of columns
    
    :param data: <np.array[np.array]> data of the bicluster to split
    :param bicluster: <Bicluster> data 
    :param min_coherence: <float> minimal coherence of a bicluster
    """
    
    t_data = data.transpose(1,0,2)
    aux_I, aux_J = t_data.shape[0:2]
    aux_indices = np.zeros(aux_I)
    middle = 0
    max_h = 0
    temp_max = 0
    flg_noise_cleaning = False
    temp_h_right = 0
    temp_h_left = 0
    
    for i in range(1, aux_I-1):

        temp_h_right = three_dimensional_msr(t_data[i:len(t_data)])
        temp_h_left = three_dimensional_msr(t_data[0:i+1])
        temp_max = max(temp_h_right, temp_h_left)
        
        if (temp_max >= max_h):
            
            max_h = temp_max
            middle = i
            
    indices = np.ones(aux_I)
    zeros = np.zeros(middle)
    
    if len(indices) == 1 or temp_h_left <= min_coherence:
        indices = np.array(-1*np.ones(len(indices)))
        flg_noise_cleaning = True
        
    if len(zeros) == 1 or temp_h_right <= min_coherence:
        zeros = np.array(-1*np.ones(len(zeros)))
        flg_noise_cleaning = True
        
    indices[0:middle] = zeros
    indices = np.array(indices)  
    unique_indices = np.unique(indices)
    
    return (indices, max_h, flg_noise_cleaning)

In [4]:
def transform_dm(dm):
    dm[dm < 0] = 0
    dm = 1-dm**4
    np.fill_diagonal(dm, 0)
    
    return dm

In [5]:
def split_rows(data, min_cluster_size):
    
    """
    Apply DBSCAN over the bicluster to filter noise
    
    :param bicluster: <Bicluster> data
    :param min_cluster_size
    """
    
    n_cols = data.shape[1]
    dm = np.asarray([[((three_dimensional_coherence(p1, p2)) \
                       if (three_dimensional_coherence(p1, p2)) \
                       != 1 \
                       else 0.0) 
                      for p2 in data]\
                     for p1 in data])
    dm = transform_dm(dm)
    gmm = GaussianMixture(n_components=3)
    gmm.fit(np.array([(dm).flatten()]).T)
    eps = gmm.means_[np.where(gmm.weights_\
                              ==(min(gmm.weights_)))].flatten()
    dev = gmm.covariances_.flatten()[np.where(gmm.weights_\
                                              ==(min(gmm.weights_)))]
    lower_bound = 0 #eps - 8*dev
    rows = np.array([])
    
    
    while (eps) > lower_bound:
        
        db = DBSCAN(eps = eps ,metric='precomputed', 
                    min_samples=min_cluster_size,
               ).fit(dm)
        
        # print(db.labels_)
        rows = db.labels_
        
        if(len(np.unique(db.labels_))<3):
            
            eps -= dev/4
            if eps <= 0:
                
                eps = 1.0e-15
                break
        else:
            
            break

    map_array = rows
    tmp_bicluster_label = 0
    max_h = 0
    
    for i in np.unique(rows):
        
        if i == -1:
            
            pass
        
        else:
            
            temp_h = three_dimensional_msr(data[np.where(rows == i)])
            
            if temp_h >= max_h:
                max_h = temp_h
                tmp_bicluster_label = i

    final_array = np.array([1 if value == tmp_bicluster_label else (0 if value != -1 else -1) 
                            for value in db.labels_])
    map_array = final_array
    flg_noise_cleaning = False
    unique_array = np.unique(final_array)
    
    if (len(unique_array) == 2 and unique_array[0] == -1):
        
        flg_noise_cleaning = True
    
    new_map = map_array
        
    if ((np.unique(map_array)[0] == -1) and len(np.unique(map_array)) == 1):
        
        print("all negatives", len(map_array))
        new_map = np.ones(int(len(map_array)))
        zeros = np.zeros(int(len(map_array)/2))
        new_map[0:int(len(map_array)/2)] = zeros
        print(new_map)
        flg_noise_cleaning = False
        map_array = new_map.copy()
        max_h = 0
    
    return (map_array, max_h, flg_noise_cleaning)

In [6]:
def compute_bicluster_coherence(bicluster, 
                                min_cluster_row_size,
                                min_coherence):
    
    """
    Splits bicluster and chooses wether its better to split rows or columns
    
    :param bicluster: <Bicluster> Bicluster to split
    :param min_coherence: <float> minimal coherence of a bicluster
    """
    
    print("Compute Coherence")
    
    flg_noise_cleaning_rows = False
    flg_noise_cleaning_cols = False
    
    if (bicluster.bisected):
    
        return (bicluster.best_bisection, bicluster.h_max, bicluster.bisected_direction)
    else:
        
        map_array_cols, h_cols, flg_noise_cleaning_cols = split_cols(bicluster.data, min_coherence=min_coherence)
        map_array_rows, h_rows, flg_noise_cleaning_rows = split_rows(bicluster.data, min_cluster_row_size)
        print("\nmap array rows len ", len(map_array_rows))
        # print("h cols " + str(h_cols) + " h rows " + str(h_rows))
        
        if (h_cols > h_rows):
            
            if flg_noise_cleaning_cols:
                
                return (map_array_cols, h_cols, "clean cols")
            else:
 
                return (map_array_cols, h_cols, "cols")
        else:
            
            if flg_noise_cleaning_rows:
                
                return (map_array_rows, h_rows, "clean rows")
            
            return (map_array_rows, h_rows, "rows")

In [17]:
class STSSCAN(object):

    def __init__(self, data, n_clusters=2, min_coherence = 0.9):
        """
        STSSCAN
        
        Applies biclustering over data
        
        :param data: <np.array> The data to apply biclusters
        :param n_clusters: <int> The number of biclusters to find
        
        """

        assert data is not None, 'Empty data'
        self._data = data
        (self._I, self._J) = self._data.shape[0:2]
        self._n_clusters = n_clusters
        self._biclusters = list()
        self._objective_function = 0
        self._min_coherence = min_coherence

    @property
    def n_clusters(self):
        return self._n_clusters

    @property
    def data(self):
        return self._data

    @property
    def biclusters(self):
        return self._biclusters

    def fit(self):
        """
        Fits the data on the algorithm. Iters over the 
        number of biclusters until find the requested number
        """

        print ('Fitting data ')
        n_iterations = 0
        n_clusters = self._n_clusters
        initial_bicluster = Bicluster(self._data, np.arange(self._I),
                np.arange(self._J))
        self._biclusters.append(initial_bicluster)
        
        while len(self._biclusters) != self.n_clusters:
            
            print('\nN iteration ' + str(n_iterations))
            
            self._split_biclusters() 
            n_iterations += 1
        # self._prune()

    def _split_biclusters(self):
        """
        Splits bicluters in order to find the next biclusters that optimizes
        the sum of coherences among biclusters
        """

        # import ipdb; ipdb.set_trace()

        print()
        print('Splitting biclusters ')

        h_max = 0
        tmp_bicluster_delta = 0
        tmp_bisection = None
        tmp_split = 'cols'
        tmp_bisection_direction = None
        position = 0
        best_position = 0
        best_bisection_array = np.array([])
        best_bisection_direction = None

        while position != len(self._biclusters):

            bicluster = self._biclusters[position]
            
            print()
            
            print('Position ' + str(position))
            
            print('\nRow length ' + str(bicluster.row_length()) + ' Col length ' + str(bicluster.col_length()) + '\n')
            
            min_cluster_row_size = math.floor(np.log(self._I) * 5)

            if bicluster.row_length() >= min_cluster_row_size and bicluster.col_length() > 3:

                (tmp_bisection, tmp_bicluster_delta, tmp_split) = compute_bicluster_coherence(bicluster, min_cluster_row_size, self._min_coherence)

                if tmp_split == 'clean cols':

                    index_map = tmp_bisection.copy() >= 0
                    rho = self._biclusters[position].rho.copy()
                    new_gamma = self._biclusters[position].gamma.copy()[index_map]
                    self._biclusters[position] = Bicluster(self._data,
                            rho, new_gamma)
                    best_bisection_direction = 'clean'
                    
                    print('\nCleaned cols at ' + str(position))
                    
                elif tmp_split == 'clean rows':

                    index_map = tmp_bisection.copy() >= 0
                    rho = self._biclusters[position].rho.copy()[index_map]
                    new_gamma = self._biclusters[position].gamma.copy()
                    self._biclusters[position] = Bicluster(self._data,
                            rho, new_gamma)
                    best_bisection_direction = 'clean'
                    print('\nCleaned rows at ' + str(position))
                else:

                    if tmp_bicluster_delta >= h_max:
                        
                        h_max = tmp_bicluster_delta
                        best_position = position
                        best_bisection_array = tmp_bisection.copy()
                        best_bisection_direction = ('cols'
                                 if tmp_split == 'cols' else 'rows')
            position += 1

        if best_bisection_direction != 'clean':
            
            print('\nBisected bicluster at ' + str(best_position))
            best_bisection = best_bisection_array
            self._add_bicluster(best_position, best_bisection, h_max,
                                best_bisection_direction)

    def _add_bicluster(
        self,
        best_position,
        best_bisection,
        h_max,
        bisection_direction,
        ):
        
        """
        Adds the found bicluster into the list of labels
        
        :param best_position: <int> position of the best bicluster
        :param best_bisection: <np.array[int]> map_array of the best bisection
        :param max_delta: h_max 
        """

        # Create the new bicluster

        best_rho = self._biclusters[best_position].rho
        best_gamma = self._biclusters[best_position].gamma
        index_map = best_bisection >= 1
        aux_index_map = best_bisection == 0
        
        if bisection_direction == 'cols':
            
            new_rho = best_rho.copy()
            new_gamma = best_gamma[index_map].copy()
            aux_rho = best_rho.copy()
            aux_gamma = best_gamma[aux_index_map].copy()
            
            # print('\nAdded bicluster cols: ' + str(new_gamma))
            
        else:
            
            new_gamma = best_gamma.copy()
            new_rho = best_rho[index_map].copy()
            aux_rho = best_rho[aux_index_map].copy()
            aux_gamma = best_gamma.copy()
            
            # print('\nAdded bicluster rows: ' + str(new_rho))
            
        new_bicluster = Bicluster(self._data, new_rho, new_gamma)

        self._biclusters.append(new_bicluster)

        # Modify the bicluster that is going to be bisected

        self._biclusters[best_position] = Bicluster(self._data, aux_rho, aux_gamma)

    def _prune(self):

        for position in range(0, len(self._biclusters)):

            print(position)

            rho = self._biclusters[position].rho
            gamma = self._biclusters[position].gamma

            if len(gamma) > 0:

                stds = np.std(self._data[np.ix_(rho, gamma)], axis=0)
                std = np.mean(stds)
                lst_map = list()

                print('std ' + str(std))

                for index in [0, len(stds) - 1]:
                    
                    if stds[index] <= std * 2:
                        
                        lst_map.append(1)
                    else:
                        
                        lst_map.append(0)

                print(lst_map)
    
    def merge(self):
        
        index = 0
        
        for index in range(0,len(self._biclusters)):
            
            sub_index = 0
            
            for sub_index in range(index+1,len(self._biclusters)):
                
                if (max(self._biclusters[index].gamma) == min(self._biclusters[sub_index].gamma) and 
                    self._biclusters[index].rho == self._biclusters[sub_index].rho
                   ):
                    
                    print("Merge " + str(index) + " with " + str(sub_index)) 


In [49]:
data = pd.read_csv("../data/Raw/Erk.csv", sep=";")
neg_data = data[data["Pvalue"]>0.5]
pos_data = data[(data["Pvalue"]<=0.01)]

In [50]:
max_value = data[data.columns[5:]].max().max()

In [51]:
min_value = data[data.columns[5:]].min().min()

In [61]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(pos_data, train_size=0.9)
neg_test = neg_data.sample(len(train) + len(test))
train_cause = train[train.columns[5:14]]
train_cause = (train_cause - max_value) / (min_value - max_value)
train_effect = train[train.columns[14:]]
train_effect = (train_effect - max_value) / (min_value - max_value)



In [222]:
neg_test = neg_data.sample((len(train) + len(test))*10)

In [127]:
test_cause = test[test.columns[5:14]]
test_cause = (test_cause - max_value) / (min_value - max_value)
test_effect = test[test.columns[14:]]
test_effect = (test_effect -max_value) / (min_value - max_value)

In [223]:
neg_cause = neg_test[neg_test.columns[5:14]]
neg_cause = (neg_cause - max_value) / (min_value - max_value)
neg_effect = neg_test[neg_test.columns[14:]]
neg_effect = (neg_effect - max_value) / (min_value - max_value)

In [326]:
neg_test[['CauseGene','EffectGene']].sample(25)

Unnamed: 0,CauseGene,EffectGene
171444,MAPK3,RASGRF2
79857,MAPK1,SLC4A5
51850,MAPK1,MCC
150457,MAPK3,LY86-AS1
108796,MAPK3,BTBD10
122679,MAPK3,DCC
24131,MAPK1,DMBX1
98977,MAPK1,ZNF611
70768,MAPK1,RABL6
4146,MAPK1,AP4E1


In [62]:
train_trajectories = list()

for i in range(len(train_cause.columns)):
    
    train_trajectories.append(np.dstack((train_cause[train_cause.columns[i]], 
                                         train_effect[train_effect.columns[i]]))[0]
                             )
train_trajectories = np.transpose(np.array(train_trajectories),[1,0,2])
train_trajectories = np.array(train_trajectories)

In [224]:
neg_trajectories = list()

for i in range(len(neg_cause.columns)):
    
    neg_trajectories.append(np.dstack((neg_cause[neg_cause.columns[i]],
                                      neg_effect[neg_effect.columns[i]]))[0]
                           )
neg_trajectories = np.transpose(np.array(neg_trajectories),[1, 0, 2])
neg_trajectories = np.array(neg_trajectories)

In [130]:
test_trajectories = list()

for i in range(len(test_cause.columns)):
    
    test_trajectories.append(np.dstack((test_cause[test_cause.columns[i]],
                                      test_effect[test_effect.columns[i]]))[0]
                           )
test_trajectories = np.transpose(np.array(test_trajectories),[1, 0, 2])
test_trajectories = np.array(test_trajectories)

27

In [64]:
def print_biclusters(stsscan):
    
    for i in range(len(stsscan.biclusters)):
        print()
        print("bicluster ", i)
        print("rho:\n", stsscan.biclusters[i].rho, "\n\ngamma:\n", stsscan.biclusters[i].gamma)

In [65]:
stsscan2 = STSSCAN(train_trajectories, 2, 0.7)

In [66]:
stsscan2.fit()

Fitting data 

N iteration 0

Splitting biclusters 

Position 0

Row length 237 Col length 9

Compute Coherence
all negatives 237
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1

In [71]:
stsscan2.biclusters[1].gamma

array([7, 8])

In [72]:
stsscan3 = STSSCAN(train_trajectories, 3, 0.7)
stsscan3.fit()

Fitting data 

N iteration 0

Splitting biclusters 

Position 0

Row length 237 Col length 9

Compute Coherence
all negatives 237
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1

In [74]:
stsscan4 = STSSCAN(train_trajectories, 4, 0.7)
stsscan4.fit()

Fitting data 

N iteration 0

Splitting biclusters 

Position 0

Row length 237 Col length 9

Compute Coherence
all negatives 237
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1

In [76]:
stsscan5 = STSSCAN(train_trajectories, 5, 0.7)
stsscan5.fit()

Fitting data 

N iteration 0

Splitting biclusters 

Position 0

Row length 237 Col length 9

Compute Coherence
all negatives 237
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1

# Discriminative biclustering

In [89]:
def compute_average_trajectory(bicluster):
    
    average_trajectory = np.array([np.mean(bicluster.data[:,i], axis = 0) 
                                   for i in range(bicluster.data.shape[1])])
    
    return average_trajectory

In [280]:
compute_average_trajectory(stsscan2.biclusters[0]).shape

(7, 2)

In [227]:
def compute_average_coherence(avg_trajectory, bicluster):
    
    acc = 0.0
    data = bicluster.data
    deviation = list()
    for i in range(data.shape[0]-1):
        
        acc += three_dimensional_coherence(avg_trajectory, data[i])
        deviation.append(three_dimensional_coherence(avg_trajectory, data[i]))
    deviation = np.array(deviation)
    return (acc/(data.shape[0]*1.0), np.std(deviation))

In [296]:
def discriminative_score(pos_data, bicluster, neg_data):
    
    avg_trajectory = compute_average_trajectory(bicluster)
    avg_coherence, std_coherence = compute_average_coherence(avg_trajectory, bicluster)
    pos_rate = 1.0 * ((len(bicluster.rho) * len(bicluster.gamma))/(pos_data.shape[0] * pos_data.shape[1]))
    test_data = neg_data[:, bicluster.gamma]
    print(std_coherence)
    print("len gamma", len(bicluster.gamma))
    print("neg data shape", neg_data.shape)
    neg_instances = 0
    
    for i in range(test_data.shape[0]):
        
        correlation = three_dimensional_coherence(test_data[i], avg_trajectory)
        
        if avg_coherence + std_coherence <= correlation:
            
            neg_instances += 1
            
    print("neg instances", neg_instances)
    neg_rate = 1.0 * ((neg_instances*len(bicluster.gamma)) / (neg_data.shape[0] * neg_data.shape[1]))
    
    print("positive rate", pos_rate, " negative rate ", neg_rate)
    return (pos_rate >= neg_rate)

In [297]:
lst_stsscan = [stsscan2, stsscan3, stsscan4, stsscan5]

In [298]:
stsscan2.biclusters[0].gamma

array([0, 1, 2, 3, 4, 5, 6])

In [299]:
discriminative_score(train_trajectories, stsscan2.biclusters[0], train_trajectories)

0.000430444116344
len gamma 7
neg data shape (237, 9, 2)
neg instances 237
positive rate 0.7777777777777778  negative rate  0.7777777777777778


True

In [300]:
def analyze_membership_stsscan(lst_stsscan, positive, negative):
    
    i = 0
    
    for sts in lst_stsscan:
        
        print("\nsts n°", i)
        b = 0
        
        for bicluster in sts.biclusters:
            
            print("\nbicluster n°", b, "\n")
            
            if (discriminative_score(positive, bicluster, negative)):
                
                print("\ndiscriminant bicluster")
            else:
                
                print("\non discriminant bicluster")
                
            b += 1
            
        i += 1
            
        print("=============================")

In [301]:
len(neg_trajectories)

2640

In [302]:
analyze_membership_stsscan(lst_stsscan, train_trajectories, neg_trajectories)


sts n° 0

bicluster n° 0 

0.000430444116344
len gamma 7
neg data shape (2640, 9, 2)
neg instances 2640
positive rate 0.7777777777777778  negative rate  0.7777777777777778

discriminant bicluster

bicluster n° 1 

0.000671990281184
len gamma 2
neg data shape (2640, 9, 2)
neg instances 2634
positive rate 0.2222222222222222  negative rate  0.22171717171717173

discriminant bicluster

sts n° 1

bicluster n° 0 

0.000477934842652
len gamma 5
neg data shape (2640, 9, 2)
neg instances 2640
positive rate 0.5555555555555556  negative rate  0.5555555555555556

discriminant bicluster

bicluster n° 1 

0.000671990281184
len gamma 2
neg data shape (2640, 9, 2)
neg instances 2634
positive rate 0.2222222222222222  negative rate  0.22171717171717173

discriminant bicluster

bicluster n° 2 

0.000832192731164
len gamma 2
neg data shape (2640, 9, 2)
neg instances 2591
positive rate 0.2222222222222222  negative rate  0.2180976430976431

discriminant bicluster

sts n° 2

bicluster n° 0 

0.0001575962825

In [162]:
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
from scipy.spatial import distance

In [163]:
def compute_membership_scores(trajectory, bicluster, mean_trajectory):
    
    def compute_deviation(column, mean_coord):
        
        distances = np.array([distance.euclidean(row, mean_coord) for row in column])
        
        return np.std(distances)
        
        
    scores = list()
    j=0
    biclusterT = np.transpose(bicluster.data,[1,0,2])
    
    for column in biclusterT:
        
        deviation = compute_deviation(column, mean_trajectory[j])
        norm = np.linalg.norm(trajectory[j] - mean_trajectory[j])
        score = min(deviation/norm, 1)
        scores.append(score)
        j += 1
        
    return np.array(scores)

In [166]:
def member(tc, bicluster):
    # print(bicluster.gamma)
    _tc = tc[bicluster.gamma]
    _bicluster = bicluster
    mean_trajectory = bicluster.data.mean(axis=0)
    scores = compute_membership_scores(_tc, bicluster, mean_trajectory)
    total_membership_score = 1.0 - ((1 / len(tc)) *sum(scores))
    
    return total_membership_score

In [317]:
def classify(trajectories, stsscan):
    
    i = 0
    
    for trajectory in trajectories:
        print("\ntrajectory: ", i)
        b = 0
        total_score = 0
        scores = list()
        for bicluster in [stsscan.biclusters[1],stsscan.biclusters[0]] :
                
                total_score += member(trajectory, bicluster)
                scores.append(member(trajectory, bicluster))
                b += 1
                
        print("\ntotal_score: ", total_score/b, " max score ", max(scores))
        i += 1

In [318]:
classify(test_trajectories, stsscan2)


trajectory:  0

total_score:  0.719497332605  max score  0.860121077961

trajectory:  1

total_score:  0.71392138871  max score  0.808728448558

trajectory:  2

total_score:  0.712768483583  max score  0.913595128526

trajectory:  3

total_score:  0.673243175687  max score  0.871243140496

trajectory:  4

total_score:  0.707293634774  max score  0.941928614978

trajectory:  5

total_score:  0.595648218266  max score  0.803473433033

trajectory:  6

total_score:  0.669627400254  max score  0.861121928679

trajectory:  7

total_score:  0.68419809962  max score  0.827490258584

trajectory:  8

total_score:  0.646261094232  max score  0.860492436581

trajectory:  9

total_score:  0.666158219875  max score  0.886291114297

trajectory:  10

total_score:  0.753472762062  max score  0.931161499352

trajectory:  11

total_score:  0.665645242425  max score  0.933935462441

trajectory:  12

total_score:  0.675481221038  max score  0.864792210533

trajectory:  13

total_score:  0.651885750218  ma

In [321]:
classify(neg_trajectories.sample(17), stsscan2)

AttributeError: 'numpy.ndarray' object has no attribute 'sample'

In [308]:
len(lst_stsscan)

4

In [309]:
def mean_msr(biclusters):
    
    total = 0
    for b in biclusters:
        
        total += three_dimensional_msr(b.data)
        
    return (total / len(biclusters))

In [320]:
i = 0
for stsscan in lst_stsscan:    

    print("\nstscan n: ", i)
    print("\n mean coherence: ", mean_msr(stsscan.biclusters))
    
    i += 1


stscan n:  0

 mean coherence:  0.997112806209

stscan n:  1

 mean coherence:  0.997543717766

stscan n:  2

 mean coherence:  0.998396163106

stscan n:  3

 mean coherence:  0.998702453873
