In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import yaml
import scipy as sp
import scipy.stats as sp_stats
import math
import tqdm.auto as tqdm

from functional import seq
from pathlib import Path
from typing import Tuple, List
from matplotlib import cm


# Render floats with four decimal places wherever pandas formats them.
pd.set_option("display.float_format", "{:.4f}".format)

# Typeset all matplotlib text (labels, titles, legends) through LaTeX.
plt.rcParams["text.usetex"] = True

# Number of feature columns per dataset; used to locate the first
# cluster-assignment field in each line of an OUT file.
NUM_FEATURES_PER_DATASET = dict(
    cluto=2,
    complex9=2,
    letter=16,
)

In [None]:
# Every experiment directory stores its summary statistics in a file named "STAT".
# Sorting makes the processing order independent of filesystem enumeration order.
files = sorted(Path("../experiments/").rglob("STAT"))

# Metric columns reported in the LaTeX tables. The "#" is LaTeX-escaped; the
# backslash is doubled because "\#" is not a valid Python escape sequence.
metrics_columns = ["Purity", "RAND", "Silhouette", "Davis-Bouldin", "\\#Multi Borders"]

# Last expression of the cell, so the number of STAT files found is displayed.
len(files)

In [None]:
def get_class_assignments(file_path: Path, default_num_features: int = 2) -> List[List[int]]:
    """Read the cluster assignments of every point listed in a clustering output file.

    The dataset is derived from the name of the directory that contains
    ``file_path`` (the text before the first "_"; any name containing "cluto"
    is mapped to "cluto"). Each non-empty line is split on "," and the fields
    from index ``3 + <number of features>`` onwards are parsed as integers.

    Args:
        file_path: Path of the output file to parse.
        default_num_features: Feature count used when the dataset name has no
            entry in ``NUM_FEATURES_PER_DATASET``.

    Returns:
        One list of integer assignments per non-empty line, in file order.

    Raises:
        ValueError: If an assignment field cannot be parsed as an integer.
    """
    # Derive the dataset from the path that was passed in, not from a loop
    # variable that happens to be left over in the notebook namespace.
    dataset = file_path.parent.name.split("_")[0]
    dataset = dataset if "cluto" not in dataset else "cluto"

    # NOTE(review): directories such as the sanity experiments have no entry in
    # NUM_FEATURES_PER_DATASET. read_data() takes assignments from field 5
    # onwards (i.e. 2 features), so 2 is the fallback - confirm for new datasets.
    num_features = NUM_FEATURES_PER_DATASET.get(dataset, default_num_features)
    first_assignment_field = 3 + num_features

    output = []
    for line in file_path.read_text().split("\n"):
        if len(line) == 0:
            continue
        components = line.split(",")
        output.append([int(component) for component in components[first_assignment_field:]])
    return output

def get_num_border_points_with_multiple_assignments(path: Path) -> int:
    """Sum the assignment counts of all points that have more than one assignment.

    Points with zero or one assignment are ignored; every other point
    contributes the number of assignments it has.
    """
    total = 0
    for assignments in get_class_assignments(path):
        if len(assignments) > 1:
            total += len(assignments)
    return total

In [None]:
# Build one record per experiment directory (skipping sanity and "final" runs)
# from its STAT file and its OUT file, then combine them into `frame`.
data = []
for file in tqdm.tqdm(files):
    if "sanity_experiment" in file.parent.name or "final" in file.parent.name:
        print("Omitting: {}".format(file.parent.name))
        continue

    # Directory names have six "_"-separated parts; the fifth is not used.
    dataset, eps, min_pts, algo, _, filtering = file.parent.name.split("_")

    with open(file) as f:
        # Tabs are stripped before the STAT file is parsed as YAML.
        sample = yaml.safe_load(f.read().replace("\t", ""))

    sample.update({
        "dataset": dataset if "cluto" not in dataset else "cluto",
        "algorithm": algo,
        "filtering": filtering,
        "model_fitting": sample["-model_fitting"],
        "data_reading": sample["-data_reading"],
        "davis_bouldin": float(sample["davis_bouldin"]),
    })

    # "\\#" is a literal backslash plus "#" (LaTeX-escaped hash); the bare "\#"
    # form is an invalid Python escape sequence.
    sample["\\#Multi Borders"] = get_num_border_points_with_multiple_assignments(
        file.parent / "OUT"
    )

    # Drop the keys that were copied under new names or are not reported.
    for obsolete_key in ("-model_fitting", "-data_reading", "runtimes_miliseconds", "input_file"):
        del sample[obsolete_key]
    data.append(sample)


frame = pd.DataFrame(data)
frame = frame.rename(
    columns={
        'purity': 'Purity',
        "davis_bouldin": "Davis-Bouldin",
        "rand": "RAND",
        "silhouette": "Silhouette"
    }
)
# Missing statistics are reported as 0.
frame = frame.fillna(0)

In [None]:
# Display the column names of the combined results frame.
frame.columns

### Epsilon, cluto

In [None]:
def print_eps_for_dataset(dataset):
    """Print a LaTeX table comparing the base and tanimoto runs per Eps value.

    Only rows of ``frame`` with filtering "none", minPts 5 and the given
    dataset are used. Base and tanimoto rows are joined on "Eps", every value
    is formatted with four decimals, and each column's maximum is wrapped in
    ``\\textbf{...}``.

    Args:
        dataset: Value of the "dataset" column to report on.
    """
    selected = frame[
        (frame["filtering"] == "none")
        & (frame["minPts"] == 5)
        & (frame["dataset"] == dataset)
    ]
    merged = pd.merge(
        selected[selected["algorithm"] == "base"],
        selected[selected["algorithm"] == "tanimoto"],
        on="Eps",
        suffixes=("_base", "_tanimoto"),
    )
    table_columns = ["Eps"] + [
        metric + suffix
        for metric in metrics_columns
        for suffix in ("_base", "_tanimoto")
    ]
    # .copy() so the formatting below writes to an independent frame, not a slice.
    table = merged.sort_values(by="Eps")[table_columns].copy()

    max_values = table.max()
    for column in table.columns:
        best = max_values[column]
        # "\\textbf" keeps the backslash; "\t" would be read as a TAB
        # character and yield "<TAB>extbf{...}" in the output.
        table[column] = table[column].apply(
            lambda x, best=best: "\\textbf{{{:.4f}}}".format(x) if x == best else "{:.4f}".format(x)
        )
    print(table.to_latex(escape=False, index=False))
# Print the Eps comparison table for the cluto dataset.
print_eps_for_dataset("cluto")

### Epsilon, complex9

In [None]:
# Print the Eps comparison table for the complex9 dataset.
print_eps_for_dataset("complex9")

### Epsilon, letter

In [None]:
# Print the Eps comparison table for the letter dataset.
print_eps_for_dataset("letter")

### Min pts, cluto

In [None]:
def print_min_pts_for_dataset(dataset):
    """Print a LaTeX table comparing the base and tanimoto runs per minPts value.

    Only rows of ``frame`` with filtering "none" and the given dataset are
    used, restricted to Eps 0.1 for the base algorithm and Eps 0.99 for
    tanimoto. Base and tanimoto rows are joined on "minPts", every value is
    formatted with four decimals, and each column's maximum is wrapped in
    ``\\textbf{...}``. The number of table rows is printed before the table.

    Args:
        dataset: Value of the "dataset" column to report on.
    """
    base_rows = (frame["Eps"] == 0.1) & (frame["algorithm"] == "base")
    tanimoto_rows = (frame["Eps"] == 0.99) & (frame["algorithm"] == "tanimoto")
    selected = frame[
        (frame["filtering"] == "none")
        & (base_rows | tanimoto_rows)
        & (frame["dataset"] == dataset)
    ]
    merged = pd.merge(
        selected[selected["algorithm"] == "base"],
        selected[selected["algorithm"] == "tanimoto"],
        on="minPts",
        suffixes=("_base", "_tanimoto"),
    )
    table_columns = ["minPts"] + [
        metric + suffix
        for metric in metrics_columns
        for suffix in ("_base", "_tanimoto")
    ]
    # .copy() so the formatting below writes to an independent frame, not a slice.
    table = merged.sort_values(by="minPts")[table_columns].copy()

    max_values = table.max()
    print(len(table))
    for column in table.columns:
        best = max_values[column]
        # "\\textbf" keeps the backslash; "\t" would be read as a TAB
        # character and yield "<TAB>extbf{...}" in the output.
        table[column] = table[column].apply(
            lambda x, best=best: "\\textbf{{{:.4f}}}".format(x) if x == best else "{:.4f}".format(x)
        )
    print(table.to_latex(escape=False, index=False))
# Print the minPts comparison table for the cluto dataset.
print_min_pts_for_dataset("cluto")

### Min pts, complex9

In [None]:
# Print the minPts comparison table for the complex9 dataset.
print_min_pts_for_dataset("complex9")

### Min pts, letter

In [None]:
# Print the minPts comparison table for the letter dataset.
print_min_pts_for_dataset("letter")

### Avg calculations for epsilon

In [None]:
def print_avg_num_calculations_for_datasets_for_epsilon():
    """Print a LaTeX table of average similarity-calculation counts per Eps.

    Uses the tanimoto rows of ``frame`` with minPts 5. For every dataset and
    filtering combination the "avg_num_calculations_of_similarity" values are
    placed in a column named ``avg_<filtering>_<dataset>``, and the columns
    are inner-joined on "Eps". Values are printed as integers; the minimum of
    every column whose name does not contain "none" is wrapped in
    ``\\textbf{...}``.
    """
    data = frame[
        (frame["algorithm"] == "tanimoto")
        & (frame["minPts"] == 5)
    ][["avg_num_calculations_of_similarity", "filtering", "dataset", "Eps"]]

    datasets = ["cluto", "complex9", "letter"]
    filtering = ["none", "real", "zpn"]
    out = None
    for d in datasets:
        for f in filtering:
            snippet = data[(data["filtering"] == f) & (data["dataset"] == d)][["Eps", "avg_num_calculations_of_similarity"]]
            snippet = snippet.rename(columns={
                "avg_num_calculations_of_similarity": "avg_{}_{}".format(f, d)
            })
            if out is None:
                out = snippet

                print(out.columns)
            else:
                out = pd.merge(
                    out,
                    snippet,
                    on="Eps"
                )

    # .copy() so the formatting below writes to an independent frame, not a slice.
    table = out.sort_values(by="Eps")[
        ["Eps"] + [
            "avg_{}_{}".format(f, d) for d in datasets for f in filtering
        ]
    ].copy()
    min_values = table.min()
    print(len(table))
    # The first column ("Eps") is left unformatted.
    for column in table.columns[1:]:
        highlight = min_values[column] if "none" not in column else None
        # "\\textbf" keeps the backslash; "\t" would be read as a TAB
        # character and yield "<TAB>extbf{...}" in the output.
        table[column] = table[column].apply(
            lambda x, highlight=highlight: "\\textbf{{{:d}}}".format(int(x))
            if (highlight is not None and x == highlight)
            else "{:d}".format(int(x))
        )
    print(table.to_latex(escape=False, index=False))
# Print the table of average similarity-calculation counts per Eps.
print_avg_num_calculations_for_datasets_for_epsilon()

### Avg time for epsilon

In [None]:
def print_time_for_datasets_for_epsilon():
    """Print a LaTeX table of model-fitting times per Eps.

    Uses the tanimoto rows of ``frame`` with minPts 5. For every dataset and
    filtering combination the "model_fitting" values are placed in a column
    named ``time_<filtering>_<dataset>``, and the columns are inner-joined on
    "Eps". Values are printed as integers; the minimum of every column whose
    name does not contain "none" is wrapped in ``\\textbf{...}``.
    """
    data = frame[
        (frame["algorithm"] == "tanimoto")
        & (frame["minPts"] == 5)
    ][["model_fitting", "filtering", "dataset", "Eps"]]

    datasets = ["cluto", "complex9", "letter"]
    filtering = ["none", "real", "zpn"]
    out = None
    for d in datasets:
        for f in filtering:
            snippet = data[(data["filtering"] == f) & (data["dataset"] == d)][["Eps", "model_fitting"]]
            snippet = snippet.rename(columns={
                "model_fitting": "time_{}_{}".format(f, d)
            })
            if out is None:
                out = snippet

                print(out.columns)
            else:
                out = pd.merge(
                    out,
                    snippet,
                    on="Eps"
                )

    # .copy() so the formatting below writes to an independent frame, not a slice.
    table = out.sort_values(by="Eps")[
        ["Eps"] + [
            "time_{}_{}".format(f, d) for d in datasets for f in filtering
        ]
    ].copy()
    min_values = table.min()
    print(len(table))
    # The first column ("Eps") is left unformatted.
    for column in table.columns[1:]:
        highlight = min_values[column] if "none" not in column else None
        # "\\textbf" keeps the backslash; "\t" would be read as a TAB
        # character and yield "<TAB>extbf{...}" in the output.
        table[column] = table[column].apply(
            lambda x, highlight=highlight: "\\textbf{{{:d}}}".format(int(x))
            if (highlight is not None and x == highlight)
            else "{:d}".format(int(x))
        )
    print(table.to_latex(escape=False, index=False))
# Print the table of model-fitting times per Eps.
print_time_for_datasets_for_epsilon()

### Time correlations

In [None]:
def print_time_correlations():
    """Print Pearson correlations between fitting time and calculation counts.

    Two wide tables are built from the tanimoto rows of ``frame`` with
    minPts 5: one with "model_fitting" values and one with
    "avg_num_calculations_of_similarity" values, each with one column per
    (filtering, dataset) pair, inner-joined on "Eps" and sorted by "Eps".
    Matching columns of the two tables are correlated with
    ``scipy.stats.pearsonr``; the coefficients are printed as a dict and as a
    one-row LaTeX table whose column names are ``<filtering>_<dataset>``.
    """
    dataset_names = ["cluto", "complex9", "letter"]
    filter_names = ["none", "real", "zpn"]

    def build_wide_table(value_column, name_template):
        # One column per (filtering, dataset) pair, joined on "Eps".
        narrow = frame[
            (frame["algorithm"] == "tanimoto")
            & (frame["minPts"] == 5)
        ][[value_column, "filtering", "dataset", "Eps"]]

        joined = None
        for d in dataset_names:
            for f in filter_names:
                matches = (narrow["filtering"] == f) & (narrow["dataset"] == d)
                piece = narrow[matches][["Eps", value_column]].rename(
                    columns={value_column: name_template.format(f, d)}
                )
                joined = piece if joined is None else pd.merge(joined, piece, on="Eps")

        ordered_columns = ["Eps"] + [
            name_template.format(f, d) for d in dataset_names for f in filter_names
        ]
        return joined.sort_values(by="Eps")[ordered_columns]

    times_table = build_wide_table("model_fitting", "time_{}_{}")
    calculations_table = build_wide_table("avg_num_calculations_of_similarity", "avg_{}_{}")

    coefficients = {}
    # Both tables list their value columns in the same (dataset, filtering) order.
    for time_column, calc_column in zip(times_table.columns[1:], calculations_table.columns[1:]):
        print(time_column, calc_column)
        coefficient = sp_stats.pearsonr(times_table[time_column], calculations_table[calc_column])[0]
        # Strip the leading "time" part so the key is "<filtering>_<dataset>".
        key = "_".join(time_column.split("_")[1:])
        coefficients[key] = ["{0:.4f}".format(coefficient)]
    print(coefficients)
    coefficients_frame = pd.DataFrame(data=coefficients)
    print(coefficients_frame.to_latex(escape=False, index=False))
    
# Print the time / calculation-count correlation table.
print_time_correlations()

### Real vec len

In [None]:
# Plot |u|/alpha and alpha*|u| as functions of epsilon for |u| = 1 and save the figure.
length = 1


def fun(x):
    """Return 0.5 * ((1 + 1/x) + sqrt((1 + 1/x)**2 - 4))."""
    return 0.5 * ((1 + 1 / x) + np.sqrt((1 + 1 / x) ** 2 - 4))


x = np.linspace(0.1, 1.0, num=1000)
fig = plt.figure(figsize=(8, 4))
plt.plot(
    x,
    1 / fun(x) * length,
    label=r"$$\frac{1}{\alpha}|u|$$",
)
plt.plot(
    x,
    fun(x) * length,
    label=r"$$\alpha|u|$$",
)
plt.xlabel(r"$\varepsilon$")
plt.legend()

fig.savefig("../experiments/realveclen.pdf", bbox_inches='tight')

### ZPN vec len

In [None]:
# Plot epsilon*|u| and |u|/epsilon as functions of epsilon for |u| = 2 and save the figure.
length = 2


def fun(x):
    """Return x unchanged."""
    return x


x = np.linspace(0.1, 1.0, num=1000)
fig = plt.figure(figsize=(8, 4))
plt.plot(
    x,
    fun(x) * length,
    label=r"$\varepsilon |u|$",
)
plt.plot(
    x,
    1 / fun(x) * length,
    label=r"$\frac{1}{\varepsilon} |u|$",
)
plt.xlabel(r"$\varepsilon$")
plt.legend()

fig.savefig("../experiments/zpnveclen.pdf", bbox_inches='tight')

In [None]:
import re

def natural_sort(l):
    """Return the strings of ``l`` sorted in natural (human) order.

    Runs of ASCII digits compare as integers and all other text compares
    case-insensitively, so "a2" sorts before "a10".

    Args:
        l: Iterable of strings.

    Returns:
        A new sorted list.
    """
    def alphanum_key(key):
        # Splitting on a capturing group alternates text and digit chunks:
        # even positions are text, odd positions are the "[0-9]+" matches.
        # Converting by position keeps every key type-aligned. Testing each
        # chunk with str.isdigit() instead would also accept non-ASCII digit
        # characters that the pattern never isolates.
        chunks = re.split('([0-9]+)', key)
        return [
            int(chunk) if position % 2 else chunk.lower()
            for position, chunk in enumerate(chunks)
        ]

    return sorted(l, key=alphanum_key)

In [None]:
# Labels given to points that are not labelled with a cluster id.
BORDER_POINT = "Border"
NOISE_POINT = "Noise"
MULTIPLE_CLUSTERS = "Multiple Clusters"


def read_data(path: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load point coordinates and display labels from a comma-separated file.

    For every non-empty line, fields 1 and 2 are the x/y coordinates, field 4
    is the point type and the fields from index 5 onwards are the cluster
    assignments. A point of type -1 is labelled "Noise"; a point of type 0 is
    labelled "Multiple Clusters" if it has more than one assignment and
    "Border" if it has exactly one; any other point is labelled with its last
    assignment.

    Args:
        path: Location of the file to read.

    Returns:
        A pair ``(coords, labels)``: an array of (x, y) rows and an array of
        label strings, both in file order.
    """
    with open(path) as handle:
        rows = [row for row in handle.read().split("\n") if len(row) > 0]

    points = []
    tags = []
    for row in rows:
        fields = row.split(",")
        position = (float(fields[1]), float(fields[2]))
        kind = int(fields[4])
        clusters = fields[5:]

        if kind == -1:
            tag = NOISE_POINT
        elif kind == 0 and len(clusters) >= 1:
            tag = MULTIPLE_CLUSTERS if len(clusters) > 1 else BORDER_POINT
        else:
            tag = str(clusters[-1])

        tags.append(tag)
        points.append(position)

    return np.array(points), np.array(tags)


def visualize(path: str, legend=True):
    """Scatter-plot the points of a clustering output file, one series per label.

    The file is read with ``read_data``; labels are plotted in natural sort
    order. The value of ``get_num_border_points_with_multiple_assignments``
    for the same file is printed.

    Args:
        path: Location of the output file to plot.
        legend: Whether to draw a legend inside the axes.

    Returns:
        The created matplotlib figure.
    """
    x, y = read_data(path)
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    print("Num multiple border points: {}".format(get_num_border_points_with_multiple_assignments(Path(path))))

    for label in natural_sort(np.unique(y)):
        ix = np.where(y == str(label))[0]
        # No `cmap` argument: without `c`, scatter ignores it (with a warning)
        # and colours each series from the axes' colour cycle. This also
        # removes the call to cm.get_cmap, which newer Matplotlib releases
        # no longer provide.
        ax.scatter(x[ix, 0], x[ix, 1], s=15, label=label)
    if legend:
        ax.legend()
    return fig

# Plot the tanimoto sanity-experiment output and save the figure as a PDF.
fig = visualize("../experiments/sanity_experiment_tanimoto/OUT")
fig.savefig("../experiments/sanity_tanimoto.pdf", bbox_inches='tight')

In [None]:
# Plot the base sanity-experiment output and save the figure as a PDF.
fig = visualize("../experiments/sanity_experiment_base/OUT")
fig.savefig("../experiments/sanity_base.pdf", bbox_inches='tight')

In [None]:
# Plot the tanimoto_zpn sanity-experiment output and save the figure as a PDF.
fig = visualize("../experiments/sanity_experiment_tanimoto_zpn/OUT")
fig.savefig("../experiments/sanity_tanimoto_zpn.pdf", bbox_inches='tight')

In [None]:
# Plot the simpler tanimoto sanity-experiment output and save the figure as a PDF.
fig = visualize("../experiments/simpler_sanity_experiment_tanimoto/OUT")
fig.savefig("../experiments/simpler_sanity_tanimoto.pdf", bbox_inches='tight')

In [None]:
# Plot the simpler base sanity-experiment output and save the figure as a PDF.
fig = visualize("../experiments/simpler_sanity_experiment_base/OUT")
fig.savefig("../experiments/simpler_sanity_base.pdf", bbox_inches='tight')

In [None]:
# Plot one complex9 run without the in-axes legend, attach a figure-level
# legend anchored at (0.9, 0.9) instead, and save the figure as a PDF.
fig = visualize("../experiments/complex9_0.999_10_tanimoto_0_none/OUT", legend=False)
fig.legend(bbox_to_anchor=(0.9, 0.9), loc='upper left')
fig.savefig("../experiments/complex9.pdf", bbox_inches='tight')

In [None]:
# Largest vector length shown on either axis, and grid resolution per axis.
max_value = 50
pixels = 512

def visualize_tanimoto(num: int = 10):
    """Draw heat maps of u.v / (|u|^2 + |v|^2 - u.v) over vector lengths.

    One panel is drawn for each of ``num`` angles evenly spaced in
    [0, pi/2], where u.v = |u| |v| cos(angle). Both lengths range over
    ``pixels`` values from 0.001 to ``max_value``. All panels share the colour
    range of the angle-0 values; a colour bar is attached to the last panel.

    Args:
        num: Number of angle panels (at least 1).

    Returns:
        The created matplotlib figure.
    """
    values_range = np.linspace(0.001, max_value, num=pixels)
    # squeeze=False always returns a 2-D array of axes, so num == 1 works too
    # (plt.subplots would otherwise return a bare, non-indexable Axes).
    fig, axes_grid = plt.subplots(1, num, sharey=True, figsize=(num * 1.5, 2), squeeze=False)
    ax = axes_grid[0]
    u_lengths, v_lengths = np.meshgrid(
        values_range, values_range
    )

    # Angle-independent terms, computed once instead of once per panel.
    length_products = u_lengths * v_lengths
    squared_lengths = u_lengths ** 2 + v_lengths ** 2

    # Colour limits come from the angle-0 values, where cos(angle) = 1.
    measure = length_products / (squared_lengths - length_products)
    v_min = measure.min()
    v_max = measure.max()

    for i, angle in enumerate(np.linspace(0.0, math.pi / 2, num=num)):
        dots = length_products * np.cos(angle)
        measure = dots / (squared_lengths - dots)
        im = ax[i].imshow(
            measure,
            cmap=cm.jet,
            vmin=v_min,
            vmax=v_max,
            extent=[0.001, max_value, 0.001, max_value],
            origin="lower",
        )
        ax[i].set_title(r"$\angle = {:.3f}$".format(angle))
        ax[i].set_xlabel(r"$|u|$")
        if i == 0:
            ax[i].set_ylabel(r"$|v|$")

        if i == num - 1:
            cb = fig.colorbar(im, ax=ax[i])
            cb.set_label(r"$\texttt{Tanimoto measure}$")

    fig.tight_layout()
    return fig

# Render eight angle panels and save the figure as a PDF at 200 dpi.
fig = visualize_tanimoto(8)
fig.savefig("../experiments/tanimoto.pdf", bbox_inches='tight', dpi=200)

In [None]:
# Experiment directories whose results are reported as the final configuration.
final_folders = [
    "cluto-t7-10k_0.99_100_tanimoto_0_real",
    "complex9_0.99_100_tanimoto_0_real",
    "letter_0.99_25_tanimoto_0_real",
]

# Build one record per final experiment directory from its STAT and OUT files.
final_data = []
for file in tqdm.tqdm(files):
    if file.parent.name not in final_folders:
        continue

    # Directory names have six "_"-separated parts; the fifth is not used.
    dataset, eps, min_pts, algo, _, filtering = file.parent.name.split("_")

    with open(file) as f:
        # Tabs are stripped before the STAT file is parsed as YAML.
        sample = yaml.safe_load(f.read().replace("\t", ""))

    sample.update({
        "dataset": dataset if "cluto" not in dataset else "cluto",
        "algorithm": algo,
        "filtering": filtering,
        "model_fitting": sample["-model_fitting"],
        "data_reading": sample["-data_reading"],
        "davis_bouldin": float(sample["davis_bouldin"]),
    })

    # "\\#" is a literal backslash plus "#" (LaTeX-escaped hash); the bare "\#"
    # form is an invalid Python escape sequence.
    sample["\\#Multi Borders"] = get_num_border_points_with_multiple_assignments(
        file.parent / "OUT"
    )

    # Drop the keys that were copied under new names or are not reported.
    for obsolete_key in ("-model_fitting", "-data_reading", "runtimes_miliseconds", "input_file"):
        del sample[obsolete_key]
    final_data.append(sample)


final_frame = pd.DataFrame(final_data)
final_frame = final_frame.rename(
    columns={
        'purity': 'Purity',
        "davis_bouldin": "Davis-Bouldin",
        "rand": "RAND",
        "silhouette": "Silhouette"
    }
)
# Missing statistics are reported as 0.
final_frame = final_frame.fillna(0)

In [None]:
# Display the frame holding the final-configuration results.
final_frame

### Final results

In [None]:
def print_final_results():
    """Print the final-configuration results as a LaTeX table.

    Rows of ``final_frame`` are sorted by "dataset". The table holds the
    dataset, minPts, Eps, the metric columns, the model-fitting time and the
    average number of similarity calculations. Every column except the first
    is formatted with four decimals. The number of rows is printed before the
    table.
    """
    selected_columns = (
        ["dataset", "minPts", "Eps"]
        + metrics_columns
        + ["model_fitting", "avg_num_calculations_of_similarity"]
    )
    # .copy() so the formatting below writes to an independent frame, not a slice.
    table = final_frame.sort_values(by="dataset")[selected_columns].copy()
    print(len(table))
    # The first column ("dataset") is left as is.
    for column in table.columns[1:]:
        table[column] = table[column].apply(
            lambda x: "{:.4f}".format(x)
        )
    print(table.to_latex(escape=False, index=False))
# Print the final-results table.
print_final_results()

In [None]:
# Plot the output of the first final folder and save the figure as a PDF.
fig = visualize(f"../experiments/{final_folders[0]}/OUT")
fig.savefig("../experiments/final_cluto.pdf", bbox_inches='tight')

In [None]:
# Plot the output of the second final folder and save the figure as a PDF.
fig = visualize(f"../experiments/{final_folders[1]}/OUT")
fig.savefig("../experiments/final_complex9.pdf", bbox_inches='tight')