In [None]:
import sys

import pandas as pd
import pynndescent
from sklearn import datasets

# The project modules live one directory up; extend the search path before
# importing them.
sys.path.append('..')

import algorithms
import visualize
from experiment import Experiment

# pynndescent's cosine distance, shared by every algorithm and the classifier below.
sim = pynndescent.distances.cosine

# Load Data

In [None]:
# Fetch the pre-vectorized 20 Newsgroups corpus. subset="all" requests the
# complete collection, and return_X_y=True yields the (features, targets)
# pair directly.
X, y = datasets.fetch_20newsgroups_vectorized(
    subset="all",
    return_X_y=True,
)

In [None]:
from dataset import Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the train/test partition is the same on every Restart & Run All.
# Change SPLIT_SEED to draw a different partition.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SPLIT_SEED)

In [None]:
# Dataset is given dense arrays: both feature matrices are expanded with
# .toarray() and passed along with their label vectors.
# NOTE(review): densifying the full feature matrices may need a lot of
# memory — confirm this fits on the target machine.
dataset = Dataset(
    X_train.toarray(),
    X_test.toarray(),
    y_train,
    y_test,
)

# Estimate homogeneities and prepare for computation

In [None]:
# Distance threshold for every class in the dataset, keyed by class id; this
# mapping is later passed to Experiment.
# Alternative estimate kept for reference:
#   dataset.get_class_homogeneity(cluster_id, sim, sample_rate=1)
homogeneities = {
    class_id: dataset.get_class_distance_threshold(
        class_id,
        quantile=None,
        similarity=sim,
        sample_rate=0.5,
    )
    for class_id in dataset.classes
}

In [None]:
# Show the per-class values computed in the previous cell as this cell's output.
homogeneities

# Setup and Run Experiment

In [None]:
# Every run is written as an (algorithm, keyword-parameters) pair and the two
# parallel lists expected by Experiment are derived from those pairs, so they
# always have matching length and order.
#
# crs param order: sample_rate, similarity, K, threshold (homogeneity)
# sample_rate - parameter of NN-Descent - makes it run faster [0-0.5], 0.1 default, 0.3 proposed max
# similarity  - similarity used
# K           - k used for NNDescent
nn_descent_runs = [
    (algorithms.nndescent_reverse_neighbors, {"sample_rate": 0.5, "similarity": sim, "K": k})
    for k in (10, 20, 30)
]
other_runs = [
    (algorithms.delta_medoids, {"dist": sim}),
    (algorithms.ds3, {"similarity": sim}),
    (algorithms.random_select, {"select": 0.05}),
]
all_runs = nn_descent_runs + other_runs

algs = [algorithm for algorithm, _ in all_runs]
pars = [params for _, params in all_runs]

coverage = .95
exp = Experiment(dataset, coverage, algs, pars, homogeneities)

In [None]:
import datetime
# Start timestamp; the timing cell after the run subtracts it from the end
# time to report the duration.
now = datetime.datetime.now()
print(now)

In [None]:
# Run the configured experiment. The classification cell iterates `results`
# and reads .algorithm, .samples and .labels from each entry.
results = exp.run()

In [None]:
import datetime

# End timestamp, plus the time elapsed since `now` was recorded in the cell
# that precedes the experiment run.
end = datetime.datetime.now()
elapsed = end - now
print(end)
print(f"duration: {elapsed}")

# Classification

In [None]:
import classifier

In [None]:
# Classifier built with the shared `sim` callable and K=1.
# NOTE(review): presumably a 1-nearest-neighbour classifier over the selected
# prototypes — confirm against the classifier module.
C = classifier.KnnClassifier(sim, K=1)

In [None]:
# Classify the dataset's test data once per experiment result, using the
# samples and labels selected by that result as the prototype set.
# TODO prototypes should be saved separately as the ones selected (possibly with indices to original dataset)

# Fetched once instead of on every iteration; assumes get_test_data() returns
# the same data on each call.
test_data = dataset.get_test_data()

classification_res = dict()
for i, result in enumerate(results):
    # Suffix the run index so every result gets its own entry: the algorithm
    # list contains the same algorithm more than once, and the previous literal
    # "_i" suffix could make such runs overwrite each other in this dict.
    key = f"{result.algorithm}_{i}"
    classification_res[key] = C.classifySklearn(
        prototypes=(result.samples, result.labels),
        test_data=test_data,
    )
    print(key)

# Visualization confusion matrices

In [None]:
# One entry per stored classification outcome, in insertion order.
# NOTE(review): `matrices` keeps each outcome whole while `labels` takes its
# element [1] — confirm this is what the plotting helper expects.
matrices = list(classification_res.values())
labels = [outcome[1] for outcome in matrices]

In [None]:
# Titles come from the keys of the stored outcomes, so there is exactly one
# title per matrix and in the same order, whichever set of algorithms was run.
# A hand-written title list goes stale as soon as the algorithm list changes.
titles = list(classification_res)
visualize.plot_confusion_matrices(matrices, titles, labels).show()

In [None]:
# calculating precision recall from conf_matrix
# The column names get their own variable: assigning them to `labels` would
# overwrite the per-classifier label lists that the confusion-matrix plot reads.
pr_columns = ['cluster', 'size', 'selected', 'precision', 'recall']
precision_recall_dfs = []
precisions = []
# NOTE(review): matrices[i] is paired with results[i] purely by position; this
# only holds if there is exactly one stored outcome per result — confirm.
for i, matrix in enumerate(matrices):
    cols = visualize.calculate_precision_recall(matrix, results[i].labels, dataset)
    # NOTE(review): this takes element 0 of the LAST column returned; verify it
    # really is the precision value, since the last column name is 'recall'.
    precisions.append(cols[-1][0])
    precision_recall_dfs.append(visualize.resultDF(pr_columns, cols))

In [None]:
# Show the values collected in `precisions` (one per processed matrix).
precisions

In [None]:
# Hand every precision/recall table to the markdown-table helper, with rows
# ordered by cluster id.
for pr_frame in precision_recall_dfs:
    # Convert the ids to numbers (in place) so the sort below is numeric.
    pr_frame['cluster'] = pr_frame['cluster'].apply(pd.to_numeric)
    ordered = pr_frame.sort_values(by=['cluster'])
    visualize.pandas_df_to_markdown_table(ordered)

# Visualization precision/recall

In [None]:
# Rebuild the per-outcome lists from the stored classification outcomes.
# NOTE(review): `matrices` keeps each outcome whole while `labels` takes its
# element [1] — confirm this is what the plotting helper expects.
matrices = list(classification_res.values())
labels = [outcome[1] for outcome in matrices]

In [None]:
# Titles come from the keys of the stored outcomes, so there is exactly one
# title per matrix and in the same order. The earlier fixed three-name list
# had no entry for DS3 and could not match the matrices being plotted.
titles = list(classification_res)
visualize.plot_confusion_matrices(matrices, titles, labels).show()

In [None]:
# calculating precision recall from conf_matrix
# The column names get their own variable: assigning them to `labels` would
# overwrite the per-classifier label lists that the confusion-matrix plot reads.
pr_columns = ['cluster', 'size', 'selected', 'precision', 'recall']
precision_recall_dfs = []
precisions = []
# NOTE(review): matrices[i] is paired with results[i] purely by position; this
# only holds if there is exactly one stored outcome per result — confirm.
for i, matrix in enumerate(matrices):
    cols = visualize.calculate_precision_recall(matrix, results[i].labels, dataset)
    # NOTE(review): this takes element 0 of the LAST column returned; verify it
    # really is the precision value, since the last column name is 'recall'.
    precisions.append(cols[-1][0])
    precision_recall_dfs.append(visualize.resultDF(pr_columns, cols))

In [None]:
# Show the values collected in `precisions` (one per processed matrix).
precisions

In [None]:
# Hand every precision/recall table to the markdown-table helper, with rows
# ordered by cluster id.
for pr_frame in precision_recall_dfs:
    # Convert the ids to numbers (in place) so the sort below is numeric.
    pr_frame['cluster'] = pr_frame['cluster'].apply(pd.to_numeric)
    ordered = pr_frame.sort_values(by=['cluster'])
    visualize.pandas_df_to_markdown_table(ordered)