# Performance analysis of the KNN algorithm

In [None]:
# std
import os
import sys
import inspect
import time
import pathlib
import glob
from math import sqrt
from math import log2
# packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline

# packages
from matplotlib.colors import ListedColormap

# Select the right import path: put the parent of the directory that holds
# the currently executing file at the front of sys.path.
# NOTE(review): inside a notebook the file name reported for the current frame
# depends on the kernel -- confirm this resolves to the project directory.
currentdir = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(currentdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from common.regression_plotfunctions import *

from KNNRegressor import KNNRegressor

In [None]:
# Directory names of the datasets that belong to this project.
dataset_dir_names = [
    "Moneyball",
    "Metro",
    "Superconductivity",
]

In [None]:
# Locations are derived from the working directory: the project root is its
# parent, and figures are written to an "out" folder next to this notebook.
cwd = pathlib.Path.cwd()
project_path = cwd.parent
data_subdir = "out/runtimes"
plot_dir = cwd / "out"

# One directory per dataset, directly below the project root.
Moneyball = project_path / "Moneyball"
Metro = project_path / "Metro"
Superconductivity = project_path / "Superconductivity"

# Per-dataset lookup table: the dataset directory ("path") and the folder
# holding its runtime CSV files ("data_path").
project = {
    label: {"path": root, "data_path": root / data_subdir}
    for label, root in (
        ("Moneyball", Moneyball),
        ("Metro", Metro),
        ("Superconductivity", Superconductivity),
    )
}
project

In [None]:
# Read the runtime CSV files of the Superconductivity and Metro datasets,
# stack them into one frame indexed by (implementation, subindex), add a
# total_time column and store the frame under project[name]["data"].
for name in project:
    if name in ("Superconductivity", "Metro"):
        # NOTE(review): the keys are assigned by position, so the labels rely
        # on glob() yielding the sklearn file before the myKNN file -- confirm
        # against the file names in out/runtimes.
        frames = [
            pd.read_csv(csv_file)
            for csv_file in project[name]["data_path"].glob("*.csv")
        ]
        data = pd.concat(
            frames,
            keys=["sklearn", "myKNN"],
            names=["implementation", "subindex"],
        )
        data["total_time"] = data["train_time"] + data["inference_time"]
        project[name]["data"] = data
        print(name)
        display(data)

In [None]:
# Name of a timing column in the runtime tables; the plot helpers below take
# their own ``y`` argument and default to the same column.
y = "inference_time"

def plot_myknn(y="inference_time"):
    """Plot one timing column of the myKNN runs against N on log-log axes.

    Draws dashed O(N), O(N^2) and O(log(N)) guide curves, overlays the
    measurements from the notebook-level ``results`` frame with one hue per
    ``chunk_size``, and saves the figure as
    ``<dataset_name>_<implementation>_<y>.png`` inside ``plot_dir``.

    The function reads the notebook globals ``results``, ``implementation``,
    ``dataset_name`` and ``plot_dir`` (a ``pathlib.Path``); they must be set
    before it is called.

    Parameters
    ----------
    y : str
        Name of the ``results`` column shown on the y-axis.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # Use every N once, in ascending order, for the guide curves. The raw
    # column repeats N (one row per configuration), which would make the
    # curves double back on themselves and draw stray chords.
    N = np.unique(results["N"].to_numpy())
    D = results["D"].unique()

    # The scale factors only shift the guide curves into the plotted range.
    ax.plot(N, N * 1e-4, ls="--", color="red", label="O(N)")
    ax.plot(N, 1e-6 * np.power(N, 2), ls="--", color="blue", label="O(N^2)")
    ax.plot(N, np.log10(N) * 1e-1, ls="--", color="green", label="O(log(N))")
    sns.lineplot(x="N", y=y, hue="chunk_size", data=results, ax=ax)

    ax.grid()
    ax.set_xscale("log")
    ax.set_yscale("log")

    title = y.replace("_", " ").capitalize()
    fig.suptitle(f"{title} [{implementation}]", fontsize=36)
    ax.set_title(f"D={D}")

    # savefig does not create missing directories.
    plot_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(plot_dir / f"{dataset_name}_{implementation}_{y}.png")

def plot_sklearnknn(y="inference_time"):
    """Plot one timing column of the sklearn runs against N on log-log axes.

    Draws dashed O(N) and O(log(N)) guide curves (plus O(N^2) unless ``y`` is
    ``"train_time"``), overlays the measurements from the notebook-level
    ``results`` frame with one hue per ``algorithm``, and saves the figure as
    ``<dataset_name>_<implementation>_<y>.png`` inside ``plot_dir``.

    The function reads the notebook globals ``results``, ``implementation``,
    ``dataset_name`` and ``plot_dir`` (a ``pathlib.Path``); they must be set
    before it is called.

    Parameters
    ----------
    y : str
        Name of the ``results`` column shown on the y-axis.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # Use every N once, in ascending order, for the guide curves. The raw
    # column repeats N (one row per configuration), which would make the
    # curves double back on themselves and draw stray chords.
    N = np.unique(results["N"].to_numpy())
    D = results["D"].unique()

    # The scale factors only shift the guide curves into the plotted range.
    ax.plot(N, N * 1e-5, ls="--", color="red", label="O(N)")
    if y != "train_time":
        ax.plot(N, 1e-8 * np.power(N, 2), ls="--", color="blue", label="O(N^2)")
    ax.plot(N, np.log10(N) * 1e-2, ls="--", color="green", label="O(log(N))")
    sns.lineplot(x="N", y=y, hue="algorithm", data=results, ax=ax)

    ax.grid()
    ax.set_xscale("log")
    ax.set_yscale("log")

    title = y.replace("_", " ").capitalize()
    fig.suptitle(f"{title} [{implementation}]", fontsize=36)
    ax.set_title(f"D={D}")

    # savefig does not create missing directories.
    plot_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(plot_dir / f"{dataset_name}_{implementation}_{y}.png")

# Metro

In [None]:
# Switch the notebook to the Metro measurements; ``dataset_name`` is also
# used by the plot helpers when naming the saved figures.
dataset_name = "Metro"
data = project[dataset_name]["data"]
data

## sklearn

In [None]:
# Take the sklearn rows out of the stacked frame; the plot helpers read
# ``results`` and ``implementation`` from the notebook namespace.
implementation = "sklearn"
results = data.xs(key=implementation)
results.head(n=5)

In [None]:
# One figure per timing column; show and clear each figure before drawing
# the next. The last figure is left for the cell output.
for metric in ("train_time", "inference_time"):
    plot_sklearnknn(metric)
    plt.show()
    plt.clf()
plot_sklearnknn("total_time")

## myKNN

In [None]:
# Take the myKNN rows out of the stacked frame; the plot helpers read
# ``results`` and ``implementation`` from the notebook namespace.
implementation = "myKNN"
results = data.xs(key=implementation)
results

In [None]:
# One figure per timing column; show and clear each figure before drawing
# the next. The last figure is left for the cell output.
for metric in ("train_time", "inference_time"):
    plot_myknn(metric)
    plt.show()
    plt.clf()
plot_myknn("total_time")

# Superconductivity

In [None]:
# Switch the notebook to the Superconductivity measurements; ``dataset_name``
# is also used by the plot helpers when naming the saved figures.
dataset_name = "Superconductivity"
data = project[dataset_name]["data"]
data

## sklearn

In [None]:
# Take the sklearn rows out of the stacked frame; the plot helpers read
# ``results`` and ``implementation`` from the notebook namespace.
implementation = "sklearn"
results = data.xs(key=implementation)
results.head(n=5)

In [None]:
# One figure per timing column; show and clear each figure before drawing
# the next. The last figure is left for the cell output.
for metric in ("train_time", "inference_time"):
    plot_sklearnknn(metric)
    plt.show()
    plt.clf()
plot_sklearnknn("total_time")

## myKNN

In [None]:
# Take the myKNN rows out of the stacked frame; the plot helpers read
# ``results`` and ``implementation`` from the notebook namespace.
implementation = "myKNN"
results = data.xs(key=implementation)
results

In [None]:
# One figure per timing column; show and clear each figure before drawing
# the next. The last figure is left for the cell output.
for metric in ("train_time", "inference_time"):
    plot_myknn(metric)
    plt.show()
    plt.clf()
plot_myknn("total_time")