In [1]:
import json
from typing import List, Tuple
from collections import defaultdict

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy as sp
from scipy import sparse
from scipy.stats import ortho_group
from scipy.sparse.linalg import eigsh
import sklearn as sk
from sklearn.cluster import KMeans

import model
from server import parse_request

In [2]:
def load_example(filename: str) -> model.Request:
    """Read a JSON file and turn its content into a request object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The result of ``parse_request`` applied to the decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON exchanged between systems is UTF-8 encoded (RFC 8259). Without an
    # explicit encoding, open() uses the platform locale, which can mis-decode
    # non-ASCII characters on some systems.
    with open(filename, encoding="utf-8") as file:
        return parse_request(json.load(file))

In [3]:
def to_matrix(triples: List[model.Triplet], shape: Tuple[int, int]):
    """Assemble a sparse COO matrix from a list of triplets.

    Args:
        triples: Objects exposing ``row``, ``col`` and ``value`` attributes.
        shape: ``(n_rows, n_cols)`` of the resulting matrix.

    Returns:
        A ``scipy.sparse.coo_matrix`` of the given shape whose entries are the
        triplet values converted to ``float``.
    """
    row_idx = [entry.row for entry in triples]
    col_idx = [entry.col for entry in triples]
    values = [float(entry.value) for entry in triples]
    return sparse.coo_matrix((values, (row_idx, col_idx)), shape=shape)

In [4]:
def get_relations(req: model.Request):
    """Build dense relation matrices for a request, addressable in both directions.

    For every relation in ``req.relations`` a matrix of shape
    ``(size of the row set, size of the column set)`` is built from its
    triplets. The matrix is stored under ``[rows][cols]`` and its transpose
    under ``[cols][rows]``.

    Args:
        req: Request exposing ``relations`` (each with ``rows``, ``cols`` and
            ``triplets``) and ``sets`` (a mapping whose values have ``size``).

    Returns:
        A ``defaultdict(dict)`` mapping ``set key -> set key -> dense matrix``.
    """
    by_set = defaultdict(dict)
    for relation in req.relations:
        row_set, col_set = relation.rows, relation.cols
        n_rows = req.sets[row_set].size
        n_cols = req.sets[col_set].size
        sparse_rel = to_matrix(relation.triplets, (n_rows, n_cols))
        by_set[row_set][col_set] = sparse_rel.todense()
        # The reverse direction holds the transposed relation.
        by_set[col_set][row_set] = sparse_rel.transpose().todense()
    return by_set

In [5]:
def init_C(dataset: model.DataSet, random_state=None):
    """Draw a random initial indicator matrix with orthonormal columns.

    A random orthogonal ``dataset.size x dataset.size`` matrix is sampled and
    its first ``dataset.num_clusters`` columns are returned.

    Args:
        dataset: Object exposing ``size`` and ``num_clusters``.
        random_state: Optional seed or numpy random generator, forwarded
            unchanged to ``ortho_group.rvs``. The default ``None`` keeps the
            previous unseeded behaviour. Pass a value to make the
            initialisation reproducible. A single generator shared across
            calls gives reproducible but distinct draws per dataset.

    Returns:
        Array of shape ``(dataset.size, dataset.num_clusters)``.
    """
    basis = ortho_group.rvs(dataset.size, random_state=random_state)
    return basis[:, :dataset.num_clusters]

In [6]:
def calc_M(sets, relations, indicators, p):
    """Accumulate the matrix used to update the indicator of set ``p``.

    For every other set ``j`` the relation matrix between ``p`` and ``j`` is
    oriented so that its rows index set ``p``, projected onto the indicator of
    set ``j``, and the Gram product ``(R C)(R C)^T`` is added to the sum. This
    equals ``R C C^T R^T`` mathematically, but avoids the larger
    ``R C C^T`` intermediate and the extra multiplication by ``R^T``.

    Args:
        sets: Sequence of set keys; the position of a key is the index used
            for ``indicators`` and ``p``.
        relations: Nested mapping ``relations[a][b]`` giving a 2-D matrix.
            Only ``relations[sets[i]][sets[j]]`` with ``i < j`` is read, and
            it is multiplied as if its rows index ``sets[i]`` and its columns
            index ``sets[j]``.
        indicators: Sequence of 2-D arrays, ``indicators[j]`` belonging to
            ``sets[j]``.
        p: Index of the set whose matrix is computed.

    Returns:
        The sum of all contributions as a square matrix. If there is no other
        set, ``np.add.reduce`` of the empty list yields ``0.0``.
    """
    summands = []
    # Sets after p: the stored relation already has p on its rows.
    for j in range(p + 1, len(sets)):
        projected = np.matmul(relations[sets[p]][sets[j]], indicators[j])
        summands.append(np.matmul(projected, projected.T))
    # Sets before p: the stored relation has p on its columns, so transpose it.
    for j in range(0, p):
        projected = np.matmul(relations[sets[j]][sets[p]].T, indicators[j])
        summands.append(np.matmul(projected, projected.T))
    return np.add.reduce(summands)

In [7]:
def cluster(sets, relations, indicators, max_iters=1200, tol=1e-8):
    """Iteratively refine the indicator matrices of all sets in place.

    In every outer iteration each set ``p`` gets its matrix from ``calc_M``,
    and ``indicators[p]`` is replaced by the eigenvectors belonging to the
    ``k`` largest-magnitude eigenvalues, where ``k`` is the current number of
    columns of ``indicators[p]``. Iteration stops once, for every set, the
    largest absolute change of these eigenvalues since the previous iteration
    is below ``tol``, or after ``max_iters`` outer iterations.

    Args:
        sets: Sequence of set keys, passed through to ``calc_M``.
        relations: Nested mapping of relation matrices, passed to ``calc_M``.
        indicators: Mutable sequence of indicator matrices; updated in place.
        max_iters: Upper bound on the number of outer iterations.
        tol: Convergence threshold on the eigenvalue change.

    Returns:
        None. The number of outer iterations performed is printed.
    """
    n_sets = len(sets)
    old_eigenvalues = [0] * n_sets
    error = [1] * n_sets
    n_iters = 0
    for _ in range(max_iters):
        for p in range(n_sets):
            M = calc_M(sets, relations, indicators, p)
            n_vectors = indicators[p].shape[1]
            eigenvalues, eigenvectors = eigsh(M, k=n_vectors, which="LM")
            error[p] = np.amax(np.abs(old_eigenvalues[p] - eigenvalues))
            old_eigenvalues[p] = eigenvalues
            indicators[p] = eigenvectors
        n_iters += 1
        if all(err < tol for err in error):
            break
    print(n_iters)

In [8]:
# Load the example request and derive the inputs of the clustering run.
req = load_example("example_request.json")
relations = get_relations(req)
# Keys and values of req.sets are read in the same iteration order, so
# sets[i] and indicators[i] refer to the same set.
sets = [*req.sets.keys()]
indicators = [init_C(dataset) for dataset in req.sets.values()]

In [9]:
# Run the iterative eigen-updates. `indicators` is modified in place and the
# number of outer iterations performed is printed.
cluster(sets, relations, indicators)

230


In [16]:
def onehot(length: int, index: int):
    """Return a float vector of ``length`` zeros with a one at ``index``."""
    unit = np.zeros(length)
    unit[index] = 1
    return unit

def discretize(indicator):
    """Turn a continuous indicator matrix into a one-hot cluster assignment.

    The rows of ``indicator`` are clustered with k-means, using as many
    clusters as ``indicator`` has columns and a fixed ``random_state=0``.
    Each row is then replaced by the one-hot encoding of its cluster label.

    Args:
        indicator: 2-D array with one row per item.

    Returns:
        Array with the same shape as ``indicator`` containing one-hot rows.
    """
    n_clusters = indicator.shape[1]
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit(indicator).labels_
    return np.vstack([onehot(n_clusters, label) for label in labels])

In [17]:
# Display the continuous indicator matrix of the first set.
indicators[0]

array([[ 2.96376944e-02, -6.23917732e-01,  3.98119417e-02,
         4.94366377e-02],
       [ 6.42199428e-02, -7.50145836e-02, -4.28772380e-01,
         4.25008676e-02],
       [-4.04320964e-02, -6.48665335e-02,  2.00654018e-01,
         4.07811570e-01],
       [ 6.56397284e-02, -4.59326063e-02,  1.64952052e-01,
         4.26592091e-01],
       [-2.62918597e-01, -9.77751662e-02, -1.81747566e-01,
         1.97279840e-02],
       [ 4.08088453e-03, -3.28959482e-02, -7.09589599e-04,
         9.11498392e-04],
       [-8.48516385e-02,  1.48947793e-02, -1.33580933e-01,
         2.63470250e-02],
       [ 2.54667828e-02, -5.78580301e-01,  4.04034201e-02,
         4.82960391e-02],
       [ 6.47282007e-02, -5.04993327e-02, -5.26089483e-02,
         5.12245777e-01],
       [ 2.95601475e-02, -1.53077564e-01,  1.52543563e-02,
         1.80570463e-02],
       [-3.86716438e-01, -4.77878645e-02, -3.94362763e-01,
         3.40906503e-02],
       [ 7.10460054e-01, -1.42711461e-02, -5.52821802e-01,
      

In [18]:
# Display the one-hot cluster assignment derived from the first indicator.
discretize(indicators[0])

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [None]:
# Fit k-means on the second indicator matrix, with one cluster per column.
# This repeats the fitting step of discretize() so that the fitted model
# stays available for inspection.
C = indicators[1]
kmeans = KMeans(n_clusters=C.shape[1], random_state=0).fit(C)

In [None]:
# Cluster label assigned to each row of C by the fitted k-means model.
kmeans.labels_

In [None]:
# Display the continuous indicator matrix of the third set.
indicators[2]

In [None]:
# Whiten every indicator matrix and keep the results. Assigning to the loop
# variable inside a `for` loop only rebinds that name and leaves the list
# untouched, so the whitened arrays are collected into a new list instead.
whitened_indicators = [sp.cluster.vq.whiten(indicator) for indicator in indicators]

In [None]:
# Small 3x3 diagonal float64 matrix used for experimenting below.
A = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]], 'float64')

In [None]:
# Display the test matrix.
A

In [None]:
# Reverse the column order of A (flip along axis 1).
np.flip(A, axis=1)

In [None]:
# Compute the 3 largest-magnitude eigenvalues of M and their eigenvectors.
# NOTE(review): no visible cell assigns `M` at notebook scope (it only exists
# as a local inside cluster()), so this cell presumably relies on state from a
# cell that is no longer present. Confirm and define M explicitly.
evalues, evectors = eigsh(M, k=3, which="LM", return_eigenvectors=True)

In [None]:
# Display the eigenvalues returned by eigsh.
evalues

In [None]:
# Display the eigenvectors returned by eigsh (one per column).
evectors

In [None]:
# First returned eigenvalue.
lam = evalues[0]

In [None]:
# First column of evectors, i.e. the eigenvector paired with `lam`.
x = evectors[:, 0]

In [None]:
# Norm of the third eigenvector column.
sp.linalg.norm(evectors[:, 2])

In [None]:
# Residual M x - lam x of the eigen-equation; it should be close to zero if
# (lam, x) is an eigenpair of M. Requires `M` to exist at notebook scope.
np.matmul(M, x) - np.multiply(x, lam)

In [None]:
# Left-hand side of the eigen-equation: M x. Requires `M` at notebook scope.
np.matmul(M, x)

In [None]:
# Right-hand side of the eigen-equation: lam * x.
np.multiply(x, lam)

In [None]:
# Same product as above, written directly from the eigsh outputs.
evectors[:, 0] * evalues[0]

In [None]:
# Bare reference to the np.eye function (not a call); evaluating it only
# displays the function object.
np.eye