In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pynndescent

import data_loader
import algorithms
import visualize
from experiment import Experiment

# Metric function shared by the homogeneity estimation, the selection algorithms and the classifier.
# NOTE(review): the name says "similarity" but the function is taken from pynndescent's
# `distances` module — confirm every consumer expects a distance (smaller = closer).
sim = pynndescent.distances.cosine

# Load Data

In [None]:
# Dataset used by every later cell (classes, training data, test data).
dataset = data_loader.read_npy('datasets_open/spirals150.npy')
# Alternative dataset:
# dataset = data_loader.read_npy('datasets_open/blobs.npy')


# Estimate homogeneities and prepare for computation

In [None]:
# Map every class id to its distance threshold, requested with quantile=None and sample_rate=1.0.
# Alternative estimator that was tried:
#   dataset.get_class_homogeneity(cluster_id, similarity=sim, sample_rate=0.5)
homogeneities = {
    cluster_id: dataset.get_class_distance_threshold(
        cluster_id, quantile=None, similarity=sim, sample_rate=1.0
    )
    for cluster_id in dataset.classes
}

print(homogeneities)

# Setup and Run Experiment

In [None]:
# Each entry pairs a selection algorithm with the keyword parameters it is run with.
# Parameters of the NN-Descent based selection (crs):
#   sample_rate - NN-Descent parameter that makes it run faster [0-0.5]; 0.1 default, 0.3 proposed max
#   similarity  - similarity used
#   K           - k used for NN-Descent
# The threshold (homogeneity) is presumably taken from `homogeneities`, passed to Experiment below.
# NOTE(review): sample_rate=0.5 is above the "proposed max" of 0.3 mentioned above — confirm intent.
configurations = [
    (algorithms.nndescent_reverse_neighbors, dict(sample_rate=0.5, similarity=sim, K=10)),
    (algorithms.nndescent_reverse_neighbors, dict(sample_rate=0.5, similarity=sim, K=20)),
    (algorithms.delta_medoids, dict(dist=sim)),
    (algorithms.random_select, dict(select=0.05)),
]

# Experiment takes the algorithms and their parameters as two parallel lists.
algs = [algorithm for algorithm, _ in configurations]
pars = [parameters for _, parameters in configurations]

coverage = .95
exp = Experiment(dataset, coverage, algs, pars, homogeneities)

In [None]:
# Run the experiment. The returned collection is iterated and indexed by the cells below,
# and each entry is read through its .algorithm, .samples and .labels attributes.
results = exp.run()

## Classification

In [None]:
import classifier

In [None]:
# Classifier used to score every prototype set; built with the shared metric and K=1
# (presumably a 1-nearest-neighbour classifier — confirm in classifier.KnnClassifier).
C = classifier.KnnClassifier(sim, K=1)

In [None]:
# Classify the test data once per experiment result, using that result's selected
# samples/labels as prototypes, and keep the outcome keyed by the result's algorithm.
# NOTE(review): the same algorithm is configured twice (K=10 and K=20); if `result.algorithm`
# does not distinguish the two runs, the later one overwrites the earlier one here — confirm.
classification_res = {}
for result in results:
    # TODO prototypes should be saved separately as the ones selected (possibly with indices to original dataset)
    test_data = dataset.get_test_data()
    prototypes = (result.samples, result.labels)
    outcome = C.classifySklearn(prototypes=prototypes, test_data=test_data)
    classification_res[result.algorithm] = outcome
    print(result.algorithm)
    print(outcome)

## Visualization

In [None]:
# Collect the classification outcomes in insertion order, plus the element at index 1 of each.
# NOTE(review): `matrices` stores each whole outcome while `labels` takes outcome[1];
# confirm against classifySklearn's return value that the whole outcome is what the plots expect.
matrices = list(classification_res.values())
labels = [outcome[1] for outcome in matrices]

In [None]:
# NOTE(review): three titles are hard-coded here while four algorithm configurations are set up
# for the experiment — confirm the titles line up one-to-one with the entries of `matrices`.
visualize.plot_confusion_matrices(matrices, ['CRS-k10', 'Delta Medoids', 'Random Select'], labels).show()

In [None]:
# Build one precision/recall table per classification outcome.
# The column names live in their own variable: reusing `labels` for them would overwrite
# the list built for the confusion-matrix plot, so re-running that plot cell afterwards
# would silently receive column names instead of label data.
pr_columns = ['cluster', 'size', 'selected', 'precision', 'recall']
precision_recall_dfs = []
precisions = []
for i, matrix in enumerate(matrices):
    # NOTE(review): assumes matrices[i] was produced from results[i]; this only holds if
    # every result contributed its own entry to classification_res — confirm.
    cols = visualize.calculate_precision_recall(matrix, results[i].labels, dataset)
    # NOTE(review): keeps the first entry of the last returned column; if the columns follow
    # `pr_columns`, that last column is 'recall' rather than 'precision' — confirm.
    precisions.append(cols[-1][0])
    precision_recall_dfs.append(visualize.resultDF(pr_columns, cols))

In [None]:
# Display the per-outcome values collected from the last column returned by calculate_precision_recall.
precisions

In [None]:
# Pass each precision/recall table, ordered by numeric cluster id, to the markdown-table helper.
# Requires pandas to be imported as `pd` in the notebook's import cell.
for df in precision_recall_dfs:
    # Convert the whole column in one vectorised call so the sort below is numeric
    # rather than depending on how the ids were originally stored.
    df['cluster'] = pd.to_numeric(df['cluster'])
    visualize.pandas_df_to_markdown_table(df.sort_values(by=['cluster']))

## Visualising simple datasets

In [None]:
def plot_colored(samples, labels, background_samples, background_labels, plt):
    """Scatter highlighted samples on top of dimmed background samples, one colour per class.

    Only the class ids 0, 1 and 2 are drawn (blue, red, green); points carrying any
    other label are left out.

    Parameters:
        samples: indexable points; element [0] is used as x and element [1] as y.
        labels: class id of each entry of `samples`, looked up by position.
        background_samples: points drawn in the dimmed colour behind the samples.
        background_labels: class id of each entry of `background_samples`.
        plt: object providing scatter/axis/xlabel/ylabel; callers in this notebook
            pass matplotlib.pyplot.

    Returns:
        None. Everything is drawn through `plt`.
    """
    def _coords_for(points, point_labels, wanted):
        # Split the points whose label equals `wanted` into separate x and y lists.
        xs, ys = [], []
        for idx, point in enumerate(points):
            if wanted == point_labels[idx]:
                xs.append(point[0])
                ys.append(point[1])
        return xs, ys

    # (highlight colour, dimmed colour); the position in the list is the class id.
    palette = [('b', '#CCE5FF'), ('r', '#FFCCCC'), ('g', '#CCFFCC')]

    foreground = {}
    backdrop = {}
    for class_id, (strong, dimmed) in enumerate(palette):
        foreground[strong] = _coords_for(samples, labels, class_id)
        backdrop[dimmed] = _coords_for(background_samples, background_labels, class_id)

    # Draw each class: dimmed background first, then the highlighted samples as large crosses.
    for strong, dimmed in palette:
        bg_x, bg_y = backdrop[dimmed]
        plt.scatter(bg_x, bg_y, color=dimmed)
        fg_x, fg_y = foreground[strong]
        plt.scatter(fg_x, fg_y, color=strong, marker='X', s=100)
        plt.axis('off')

    plt.xlabel('x')
    plt.ylabel('y')


In [None]:
# Stack all training classes into one array of points plus a parallel array of class ids;
# they are drawn as the dimmed background of every panel.
all_points = np.concatenate(list(dataset.train.values()))
all_labels = np.concatenate([[k]*len(v) for k, v in dataset.train.items()])

# Panel titles; panel i shows the prototypes stored in results[i].
As = ['CRS-10', 'CRS-20']#, 'CRS-40',]
# As = ['CRS-10', 'CRS-20', 'delta-Medoids', 'DS3']#, 'Random Select']

# Keep the six-slot row used so far and only widen it when more panels are requested.
# The explicit (rows, cols, index) form also works past the single-digit limit of the
# three-digit shorthand (e.g. 161).
n_cols = max(6, len(As))

fig = plt.figure(figsize=(32,6))
axes = []
for i, title in enumerate(As):
    # plt.subplot makes the new axes current, which plot_colored relies on because it
    # draws through the pyplot interface it is handed.
    ax = plt.subplot(1, n_cols, i + 1)
    plot_colored(results[i].samples, results[i].labels, all_points, all_labels, plt)
    ax.set_title(title, size=18)
    axes.append(ax)

fig.savefig('struct.png', bbox_inches='tight')