In [1]:
import os
import numpy as np
from scipy.spatial import distance
from scipy.stats import entropy
from random import gauss, randrange
import pickle
from itertools import combinations, product

In [37]:
def make_dataset(n,m,kmin,kmax):
    data = []
    data.append([n,m,kmin,kmax])
    
    for i in range(n):
        d = [abs(round(gauss(j+m, kmax), 2)) for j in range(m)]
        data.append(d)
        
    with open('data/test3.dat', 'wb') as file:
        pickle.dump(data, file)
        
        
def read_data(path):
    data = pickle.load(open(path, 'rb'))
    spec = data.pop(0)
    
    return np.array(data), spec

def write_results(results):
    
    with open('test3.res', 'w') as fp:
        fp.write('\n'.join('{} {}'.format(x[0], round(x[1], 2)) for x in results))

In [12]:
class SubSpace_Clustering:
    def __init__(self, data, th_entropy):
        self.data = data
        n,m = self.data.shape
        self.variables = np.arange(0, m)
        self.sig_var = []
        self.th_entropy = th_entropy
        self.var_dict = dict()
        
        for var in self.variables:
            data_var = self.data[:, var]
            
            delta = round((data_var.max() - data_var.min()) * 0.1, 4)
            
            interim_var = np.arange(0, data_var.max() + delta, delta)
            
            self.var_dict[var] = interim_var

    
    def entropy(self, variables):
        data = self.data[:, variables]

        interim_var = []
        for var in variables:
            interim_var.append(self.var_dict[var])
            
        comp = np.zeros((len(variables), ))
        grid = []
        for p in product(*interim_var):
            
            count = 0
              
            for d in data:

                if (d > comp).any() and (d <= list(p)).any():
                    count += 1
                    
                
                    
            
            grid.append(count / len(data))
            comp = p
               
        return entropy(grid, base=2)    
            
        
    def apriori(self, variables, set_size=1):
        if set_size > 3 or len(variables) == 1:
            return self.sig_var
        
        print('running apriori using {} variables'.format(set_size))
        candidates = list(combinations(variables, set_size))
        vars = []
        for var in candidates:
            
            e = self.entropy(var)
            
            if e <= self.th_entropy:
                
                self.sig_var.append(var)
                for v in var:
                    if v not in vars:
                        vars.append(v)
        set_size += 1  
        
        print('subspace: {}'.format(self.sig_var))
        return self.apriori(vars, set_size)
    
    def fit(self, length=3):
        
        sig_var = self.apriori(self.variables)
        
        
        return sig_var
        
    
class KMeans:
    
    def __init__(self, data, K, n_iter=20):
        self.data = data
        self.K = K
        self.n_iter = n_iter
        self.n_samples, self.n_features = self.data.shape
        self.centroids = np.random.rand(K, self.n_features)
        self.clusters = np.zeros((self.n_samples, ), dtype=int)
        self.loss = 0
    def __distance(self, x, m):
        return distance.euclidean(x,m)
    
    def __assign(self):
        loss = []
        for i, x in enumerate(self.data):
            distances = np.zeros((self.K, ), dtype=np.float64)
            for j, m in enumerate(self.centroids):
                distances[j] = self.__distance(x, m)
            loss.append(np.mean(distances))  
            self.clusters[i] = np.argmin(distances)
            
        self.loss = np.mean(loss)
            
    def __recompute_centroids(self):
        
        for k in range(self.K):
            k_indexes = np.where(self.clusters == k)[0]
            if k_indexes.size == 0:
                continue
            k_data = self.data[k_indexes]
            new_centroid = np.mean(k_data, axis=0)
            self.centroids[k] = new_centroid
    
    def fit(self):
        for i in range(self.n_iter):
            self.__assign()
            self.__recompute_centroids()

In [4]:
make_dataset(200, 8, 5, 10)

In [27]:
data, spec = read_data('data/test3.dat')
n,m,kmin,kmax = spec

In [14]:
sbc = SubSpace_Clustering(data, 8)
subspace = sbc.fit()

running apriori using 1 variables
subspace: [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,)]
running apriori using 2 variables
subspace: [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (3, 4), (3, 5), (3, 6), (3, 7), (4, 5), (4, 6), (4, 7), (5, 6), (5, 7), (6, 7)]
running apriori using 3 variables
subspace: [(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (3, 4), (3, 5), (3, 6), (3, 7), (4, 5), (4, 6), (4, 7), (5, 6), (5, 7), (6, 7)]


In [28]:
test = data[:, list(subspace[-1])]

In [29]:
results = []
for k in range(kmin, kmax + 1):
    kmean = KMeans(test, k)
    kmean.fit()
    results.append((k, kmean.loss))

In [38]:
write_results(results)