In [None]:
# std
import os
import sys
import inspect
import time
import pathlib
from math import sqrt
from math import log2
# packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline

# packages
from matplotlib.colors import ListedColormap

# put the parent directory on sys.path so that the project-local "common" package can be imported
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from common.DataParser import parse_superconductivity
from common.model_trainer_reg import *
#from common.regression_plotfunctions import *

FORMAT = "png"

In [None]:
def load_glob(glob, concat=True):
    """Read every CSV file yielded by ``glob`` with ``pd.read_csv``.

    Parameters
    ----------
    glob : iterable of path-like
        Paths of the CSV files to read, e.g. the generator returned by
        ``pathlib.Path.glob``. The iterable is materialised once, so a
        one-shot generator can feed both the frames and the file names.
    concat : bool, default True
        If True, return one DataFrame with all files concatenated.
        If False, return the individual frames together with their paths.

    Returns
    -------
    pandas.DataFrame
        When ``concat`` is True.
    tuple (list of pandas.DataFrame, list of path-like)
        When ``concat`` is False; both lists are in the same order.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``concat`` is True and ``glob``
        yields no files.
    """
    # A generator can only be consumed once; iterating it a second time for
    # the names would always produce an empty list.
    files = list(glob)
    frames = [pd.read_csv(file) for file in files]
    if concat:
        return pd.concat(frames)
    return frames, files

In [None]:
# Hyper-parameter grids, keyed first by model name and then by implementation
# ("my" / "sklearn"). Each leaf maps a parameter name to the list of values
# that make up the grid for that parameter.
all_params = dict(
    SGD=dict(
        my=dict(alpha=[0.0001, 0.005], max_iter=[1000]),
        sklearn=dict(alpha=[0.0001, 0.001, 0.01]),
    ),
    KNN=dict(
        my=dict(
            n_neighbors=[5, 10],
            p=[1, 2],
            chunk_size=[1, 4],
        ),
        sklearn=dict(
            n_neighbors=[5, 10],
            p=[1, 2],
            weights=["uniform"],
            algorithm=["brute", "kd_tree", "ball_tree"],
        ),
    ),
    RF=dict(
        sklearn=dict(n_estimators=[10, 100]),
    ),
    DT=dict(
        sklearn=dict(
            criterion=["mse"],
            max_features=["auto", "sqrt", "log2"],
        ),
    ),
)

In [None]:
# Get the data: cross-validation CSVs live in ./out/<model>/, plots go to ./out/plots.
cwd = pathlib.Path(os.getcwd())
data_subdir = cwd / "out"
plot_dir = cwd / "out" / "plots"
# Later cells save figures into plot_dir; create it up front so saving cannot
# fail because the directory is missing.
plot_dir.mkdir(parents=True, exist_ok=True)

# Layout of the container filled below. The "KNN" entry here is only a
# placeholder showing the shape of an entry; the loop overwrites it.
project = {
    "sub_keys": ["path", "data"],
    "KNN": {"path": "a/path", "data": pd.DataFrame()}
}

for name in ["KNN", "SGD", "RF", "DT"]:
    data = data_subdir / name
    try:
        param_dict = all_params[name]
        data_dict = {}
        # Results of the "my" implementation are optional: when they cannot be
        # loaded (load_glob raises if there are no files) fall back to sklearn only.
        try:
            data_dict["my"] = load_glob(data.glob("my_CV_*.csv"), concat=True)
        except Exception as err:
            # Only report the problem when a "my" grid is configured for this
            # model, i.e. when results were actually expected.
            if "my" in param_dict:
                print(f"No usable 'my' results for {name} ({err}); continuing with sklearn only.")
        # The sklearn results are mandatory; a failure here is handled below.
        data_dict["sklearn"] = load_glob(data.glob("sklearn_CV_*.csv"), concat=True)
        project[name] = {
            "path": data,
            "data": data_dict,
            "params": param_dict
            }
    except Exception as e:
        # Keep an entry for the model, but store the error text instead of data.
        project[name] = {
            "path": data,
            "data": f"couldn't load because of error:\n{e}",
            "params": f"couldn't load because of error:\n{e}"
            }
        print(f"Could not load data for {name} as expected. You should look into that...")

In [None]:
def print_styling(figsize=(10,8)):
    """Set the global matplotlib/seaborn defaults used for all figures.

    Parameters
    ----------
    figsize : tuple of (width, height), default (10, 8)
        Default figure size stored in ``rcParams["figure.figsize"]``.

    Notes
    -----
    Only modifies global rc settings; figures that already exist are not
    resized.
    """
    # The "seaborn" style sheet was renamed to "seaborn-v0_8" in newer
    # matplotlib releases; use whichever name this installation provides.
    style_name = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
    plt.style.use(style_name)
    sns.set_palette("dark")

    # Apply the figure size after the style sheet: a style sheet may define
    # its own figure.figsize and would otherwise override the argument.
    plt.rc('figure', figsize=figsize)

    SMALL_SIZE = 15
    MEDIUM_SIZE = 18
    BIGGER_SIZE = 26

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('lines', linewidth=2)

    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('axes', titlesize=MEDIUM_SIZE)    # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels

    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [None]:
# Scratch check of the substring test `"time" in name` on two sample names.
test, test2 = "time_a", "inference_time"

for t in (test, test2):
    # str.find returns -1 when the substring is absent
    if t.find("time") >= 0:
        print(t)

In [None]:

def plot_CV_with_Std(df, y = "R2_score", ax=None, regressor = "sklearn", titel = "R2-Score", SaveName = False, plot_std=True):
    """Plot the mean of column ``y`` for every value of ``k`` with an optional std band.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table; must contain the columns ``"k"`` and ``y``.
    y : str, default "R2_score"
        Column whose per-``k`` mean (and standard deviation) is plotted.
    ax : matplotlib axes, optional
        Axes to draw on; defaults to the current axes.
    regressor : str, default "sklearn"
        Legend label of the plotted line.
    titel : str, default "R2-Score"
        Figure title (set via ``plt.suptitle``).
    SaveName : path-like or False, default False
        If truthy, the current figure is saved under this name.
    plot_std : bool, default True
        Whether to shade the band ``mean - std`` .. ``mean + std``.

    Notes
    -----
    The x axis covers ``1 .. max(k)``; values of ``k`` without rows yield NaN
    and therefore a gap in the line. The standard deviation is the population
    one (``ddof=0``).
    """
    if ax is None:
        ax = plt.gca()
    # Make `ax` the current axes so that pyplot calls made by the caller
    # afterwards (e.g. plt.savefig) refer to the same figure.
    plt.sca(ax)

    # int() so that a float-typed "k" column still gives a usable range
    kmax = int(df["k"].max())
    list_k = np.arange(1, kmax + 1)

    per_k = df.groupby("k")[y]
    mean_list = per_k.mean().reindex(list_k).to_numpy()
    std_list = per_k.std(ddof=0).reindex(list_k).to_numpy()

    ax.plot(list_k, mean_list, '-', label=regressor)
    if plot_std:
        ax.fill_between(list_k, mean_list - std_list, mean_list + std_list, alpha=0.2)
    ax.grid(True)
    ax.legend()
    ax.set_xlabel("k-splits")
    plt.suptitle(titel)
    # columns with "time" in their name get a time axis label
    ax.set_ylabel("time [s]" if "time" in y else y)
    if SaveName:
        plt.savefig(SaveName)

def get_best_params(df, params, metric="R2_score"):
    """Return the maximum of ``metric`` and the row in which it occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table containing the column ``metric``.
    params : list-like of column labels or None
        Columns to return from the best row; ``None`` returns the whole row.
    metric : str, default "R2_score"
        Column that is maximised.

    Returns
    -------
    tuple
        ``(best_value, selection)`` where ``selection`` is
        ``df.loc[best_index, params]`` (or the full row when ``params`` is None).
    """
    best_value = df[metric].max()
    best_index = df[metric].idxmax()
    # Identity check: `params != None` is evaluated element-wise for
    # array-like `params` and cannot be used as a condition.
    if params is not None:
        return best_value, df.loc[best_index, params]
    return best_value, df.loc[best_index, :]

def _summarise_param_grid(df, metric, params):
    """Return mean/std of ``metric`` for every parameter combination in the ``params`` grid."""
    # Local import: itertools is not imported explicitly by the notebook.
    import itertools

    keys = list(params) if params else []
    value_lists = [params[key] for key in keys]

    results = []
    for combo in itertools.product(*value_lists):
        pset = dict(zip(keys, combo))
        mask = np.ones(len(df), dtype=bool)
        for key, val in pset.items():
            column = df[key]
            # Match on the native value (numeric columns) or on its string
            # form (columns stored as text).
            mask &= ((column == val) | (column.astype(str) == str(val))).to_numpy()
        subset = df.loc[mask, metric]
        results.append({"mean": subset.mean(), "std": subset.std(ddof=0), "params": pset})
    return results


def _select_by_mean(results, is_better):
    """Return the entry whose mean wins under ``is_better``, ignoring NaN means."""
    chosen = None
    for res in results:
        if np.isnan(res["mean"]):
            # combination without matching rows
            continue
        if chosen is None or is_better(res["mean"], chosen["mean"]):
            chosen = res
    # All combinations empty: fall back to the first entry.
    return chosen if chosen is not None else results[0]


def get_best_params_cv(df, metric="R2_score", params=None):
    """Find the parameter combination with the highest mean of ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cross-validation results with one column per parameter in ``params``
        and a column ``metric``.
    metric : str, default "R2_score"
        Column that is averaged per parameter combination.
    params : dict or None
        Grid mapping parameter name -> list of values. ``None`` or an empty
        dict is treated as a single combination covering all rows.

    Returns
    -------
    tuple
        ``(best, results)``: ``results`` holds one dict
        ``{"mean", "std", "params"}`` per combination (``std`` is the
        population standard deviation), ``best`` is the entry with the
        largest mean; ties keep the first combination.

    Notes
    -----
    "Best" always means the largest mean, so for a time metric this is the
    slowest combination.
    """
    results = _summarise_param_grid(df, metric, params)
    best = _select_by_mean(results, lambda new, old: new > old)
    return best, results


def get_worst_params_cv(df, metric="R2_score", params=None):
    """Find the parameter combination with the lowest mean of ``metric``.

    Takes the same arguments as ``get_best_params_cv`` and returns
    ``(worst, results)`` where ``worst`` is the entry with the smallest mean;
    ties keep the first combination.
    """
    results = _summarise_param_grid(df, metric, params)
    worst = _select_by_mean(results, lambda new, old: new < old)
    return worst, results

def print_best(best, imp="sklearn"):
    """Print a "Best [<imp>]:" header line followed by ``best`` on its own line."""
    header = "Best [{}]:".format(imp)
    print(header, best, sep="\n")

def print_worst(worst, imp="sklearn"):
    """Print a "worst [<imp>]:" header line followed by ``worst`` on its own line."""
    header = "worst [{}]:".format(imp)
    print(header, worst, sep="\n")

# KNN

In [None]:
# KNN: compare the R2 score of the sklearn and the "my" implementation on one axes.
print_styling((10,8))
ax = plt.gca()
# sklearn results: plot mean R2 per k and pick the grid combination with the highest mean
df = project["KNN"]["data"]["sklearn"]
params = project["KNN"]["params"]["sklearn"]
plot_CV_with_Std(df, regressor="KNN [sklearn]", ax=ax)
best1, means1 = get_best_params_cv(df, params=params)

# "my" results: same evaluation, drawn onto the same axes
df = project["KNN"]["data"]["my"]
params = project["KNN"]["params"]["my"]
best2, means2 = get_best_params_cv(df, params=params)
plot_CV_with_Std(df, regressor="KNN [my]", ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")

## Save
plt.savefig(plot_dir / f"sc_KNN_r2.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# KNN: compare the inference time of the sklearn and the "my" implementation.
metric = "inference_time"
model = "KNN"

# "inference_time" -> "Inference time"
title = metric.capitalize().replace("_", " ")

print_styling((10,8))
ax = plt.gca()
df = project[model]["data"]["sklearn"]
params = project[model]["params"]["sklearn"]
# NOTE: "best" is the combination with the highest mean of the metric and
# "worst" the one with the lowest, so for a time metric "best" is the slowest.
best1, means1 = get_best_params_cv(df, metric=metric, params=params)
worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
# y is hard-coded here; it equals `metric` in this cell
plot_CV_with_Std(df, y="inference_time", regressor=f"{model}[sklearn]", titel=title,ax=ax)

# same evaluation for the "my" implementation, drawn onto the same axes
df = project[model]["data"]["my"]
params = project[model]["params"]["my"]
best2, means2 = get_best_params_cv(df, metric=metric, params=params)
worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y="inference_time", regressor=f"{model} [my]",  titel=title,ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")
## Print worst model
print_worst(worst1, "sklearn")
print_worst(worst2, "my")

## Save
plt.savefig(plot_dir / f"sc_{model}_{metric}.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# KNN: compare the training time of the sklearn and the "my" implementation.
metric = "train_time"
model = "KNN"

# "train_time" -> "Train time"
title = metric.capitalize().replace("_", " ")

print_styling((10,8))
ax = plt.gca()
df = project[model]["data"]["sklearn"]
params = project[model]["params"]["sklearn"]
# NOTE: "best" is the combination with the highest mean of the metric and
# "worst" the one with the lowest, so for a time metric "best" is the slowest.
best1, means1 = get_best_params_cv(df, metric=metric, params=params)
worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
# Plot the metric selected above (the column was previously hard-coded to
# "inference_time", so the train-time figure showed inference times).
plot_CV_with_Std(df, y=metric, regressor=f"{model}[sklearn]", titel=title,ax=ax)

# same evaluation for the "my" implementation, drawn onto the same axes
df = project[model]["data"]["my"]
params = project[model]["params"]["my"]
best2, means2 = get_best_params_cv(df, metric=metric, params=params)
worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [my]",  titel=title,ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")
## Print worst model
print_worst(worst1, "sklearn")
print_worst(worst2, "my")

## Save
plt.savefig(plot_dir / f"sc_{model}_{metric}.{FORMAT}", bbox_inches='tight', pad_inches=0)

# SGD

In [None]:
# SGD: compare the R2 score of the sklearn and the "my" implementation on one axes.
print_styling((10,8))
ax = plt.gca()
# sklearn results: plot mean R2 per k and pick the grid combination with the highest mean
df = project["SGD"]["data"]["sklearn"]
params = project["SGD"]["params"]["sklearn"]
plot_CV_with_Std(df, regressor="SGD [sklearn]", ax=ax)
best1, means1 = get_best_params_cv(df, params=params)

# "my" results: same evaluation, drawn onto the same axes
df = project["SGD"]["data"]["my"]
params = project["SGD"]["params"]["my"]
best2, means2 = get_best_params_cv(df, params=params)
plot_CV_with_Std(df, regressor="SGD [my]", ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")

## Save
plt.savefig(plot_dir / f"sc_SGD_r2.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# SGD: compare the inference time of the sklearn and the "my" implementation.
metric = "inference_time"
model = "SGD"

# "inference_time" -> "Inference time"
title = metric.capitalize().replace("_", " ")

print_styling((10,8))
ax = plt.gca()
df = project[model]["data"]["sklearn"]
params = project[model]["params"]["sklearn"]
# NOTE: "best" is the combination with the highest mean of the metric and
# "worst" the one with the lowest, so for a time metric "best" is the slowest.
best1, means1 = get_best_params_cv(df, metric=metric, params=params)
worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [sklearn]", titel=title,ax=ax)

# same evaluation for the "my" implementation, drawn onto the same axes
df = project[model]["data"]["my"]
params = project[model]["params"]["my"]
best2, means2 = get_best_params_cv(df, metric=metric, params=params)
worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [my]",  titel=title,ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")
## Print worst model
print_worst(worst1, "sklearn")
print_worst(worst2, "my")

## Save
plt.savefig(plot_dir / f"sc_{model}_{metric}.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# SGD: compare the training time of the sklearn and the "my" implementation.
metric = "train_time"
model = "SGD"

# "train_time" -> "Train time"
title = metric.capitalize().replace("_", " ")

print_styling((10,8))
ax = plt.gca()
df = project[model]["data"]["sklearn"]
params = project[model]["params"]["sklearn"]
# NOTE: "best" is the combination with the highest mean of the metric and
# "worst" the one with the lowest, so for a time metric "best" is the slowest.
best1, means1 = get_best_params_cv(df, metric=metric, params=params)
worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [sklearn]", titel=title,ax=ax)

# same evaluation for the "my" implementation, drawn onto the same axes
df = project[model]["data"]["my"]
params = project[model]["params"]["my"]
best2, means2 = get_best_params_cv(df, metric=metric, params=params)
worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [my]",  titel=title,ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")
## Print worst model
print_worst(worst1, "sklearn")
print_worst(worst2, "my")

## Save
plt.savefig(plot_dir / f"sc_{model}_{metric}.{FORMAT}", bbox_inches='tight', pad_inches=0)

# RF + DT

In [None]:
# RF vs. DT (sklearn only): compare the R2 score of both models on one axes.
print_styling()
ax = plt.gca()
# Random forest -> best1 / means1. It was previously stored in best2 and
# overwritten by the DT result, so a stale best1 from an earlier cell was printed.
df = project["RF"]["data"]["sklearn"]
params = project["RF"]["params"]["sklearn"]
best1, means1 = get_best_params_cv(df, params=params)
plot_CV_with_Std(df, regressor="RF [sklearn]", ax=ax)

# Decision tree -> best2 / means2
df = project["DT"]["data"]["sklearn"]
params = project["DT"]["params"]["sklearn"]
best2, means2 = get_best_params_cv(df, params=params)
plot_CV_with_Std(df, regressor="DT [sklearn]", ax=ax)

## Print best model (first RF, then DT)
print_best(best1, "sklearn")
print_best(best2, "sklearn")

## Save
plt.savefig(plot_dir / f"sc_DT+RF_r2.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# NOTE(review): this cell sits in the "RF + DT" section but evaluates
# model = "SGD" with metric = "inference_time", and writes the same output
# file as the SGD inference-time cell further up. Confirm whether another
# model was intended here.
metric = "inference_time"
model = "SGD"

# "inference_time" -> "Inference time"
title = metric.capitalize().replace("_", " ")

print_styling((10,8))
ax = plt.gca()
df = project[model]["data"]["sklearn"]
params = project[model]["params"]["sklearn"]
# NOTE: "best" is the combination with the highest mean of the metric and
# "worst" the one with the lowest, so for a time metric "best" is the slowest.
best1, means1 = get_best_params_cv(df, metric=metric, params=params)
worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [sklearn]", titel=title,ax=ax)

# same evaluation for the "my" implementation, drawn onto the same axes
df = project[model]["data"]["my"]
params = project[model]["params"]["my"]
best2, means2 = get_best_params_cv(df, metric=metric, params=params)
worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
plot_CV_with_Std(df, y=metric, regressor=f"{model} [my]",  titel=title,ax=ax)

## Print best model
print_best(best1, "sklearn")
print_best(best2, "my")
## Print worst model
print_worst(worst1, "sklearn")
print_worst(worst2, "my")

## Save
plt.savefig(plot_dir / f"sc_{model}_{metric}.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# Inference time of every loaded model (sklearn and, where available, "my")
# on one shared, log-scaled axes.
metric = "inference_time"
# "inference_time" -> "Inference time"
title = metric.capitalize().replace("_", " ")

# Style and axes are set up once; all models are drawn onto the same axes.
print_styling((10,8))
ax = plt.gca()

for model in project:
    if model =="sub_keys":
        # bookkeeping entry, not a model
        continue
    print(model)

    entry = project[model]
    # A model whose files could not be loaded stores an error string instead
    # of a dict of DataFrames; skip it instead of crashing.
    if not isinstance(entry["data"], dict) or "sklearn" not in entry["data"]:
        print(f"Skipping {model}: no sklearn results were loaded.")
        continue

    df = entry["data"]["sklearn"]
    params = entry["params"]["sklearn"]
    # NOTE: "best" is the combination with the highest mean of the metric and
    # "worst" the one with the lowest, so for a time metric "best" is the slowest.
    best1, means1 = get_best_params_cv(df, metric=metric, params=params)
    worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
    plot_CV_with_Std(df, y=metric, regressor=f"{model} [sklearn]", titel=title,ax=ax)

    # Results of the "my" implementation exist only for some models.
    switch = False
    if "my" in entry["data"] and "my" in entry["params"]:
        try:
            df = entry["data"]["my"]
            params = entry["params"]["my"]
            best2, means2 = get_best_params_cv(df, metric=metric, params=params)
            worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
            plot_CV_with_Std(df, y=metric, regressor=f"{model} [my]",  titel=title,ax=ax)
            switch = True
        except Exception as err:
            # still best effort, but no longer silent
            print(f"Could not evaluate {model} [my]: {err!r}")

    ## Print best model
    print_best(best1, f"{model}(sklearn)")
    if switch:
        print_best(best2, f"{model}(my)")
    ## Print worst model
    print_worst(worst1, f"{model}(sklearn)")
    if switch:
        print_worst(worst2, f"{model}(my)")
ax.set_yscale("log")
## Save
plt.savefig(plot_dir / f"sc_all_inference_times.{FORMAT}", bbox_inches='tight', pad_inches=0)

In [None]:
# Training time of every loaded model (sklearn and, where available, "my")
# on one shared, log-scaled axes.
metric = "train_time"
# "train_time" -> "Train time"
title = metric.capitalize().replace("_", " ")

# Style and axes are set up once; all models are drawn onto the same axes.
print_styling((10,8))
ax = plt.gca()

for model in project:
    if model =="sub_keys":
        # bookkeeping entry, not a model
        continue
    print(model)

    entry = project[model]
    # A model whose files could not be loaded stores an error string instead
    # of a dict of DataFrames; skip it instead of crashing.
    if not isinstance(entry["data"], dict) or "sklearn" not in entry["data"]:
        print(f"Skipping {model}: no sklearn results were loaded.")
        continue

    df = entry["data"]["sklearn"]
    params = entry["params"]["sklearn"]
    # NOTE: "best" is the combination with the highest mean of the metric and
    # "worst" the one with the lowest, so for a time metric "best" is the slowest.
    best1, means1 = get_best_params_cv(df, metric=metric, params=params)
    worst1, _ = get_worst_params_cv(df, metric=metric, params=params)
    # The std band is drawn for every model except DT.
    no = model != "DT"
    plot_CV_with_Std(df, y=metric, regressor=f"{model} [sklearn]", titel=title, ax=ax, plot_std=no)

    # Results of the "my" implementation exist only for some models.
    switch = False
    if "my" in entry["data"] and "my" in entry["params"]:
        try:
            df = entry["data"]["my"]
            params = entry["params"]["my"]
            best2, means2 = get_best_params_cv(df, metric=metric, params=params)
            worst2, _ = get_worst_params_cv(df, metric=metric, params=params)
            plot_CV_with_Std(df, y=metric, regressor=f"{model} [my]",  titel=title,ax=ax)
            switch = True
        except Exception as err:
            # still best effort, but no longer silent
            print(f"Could not evaluate {model} [my]: {err!r}")

    ## Print best model
    print_best(best1, f"{model}(sklearn)")
    if switch:
        print_best(best2, f"{model}(my)")
    ## Print worst model
    print_worst(worst1, f"{model}(sklearn)")
    if switch:
        print_worst(worst2, f"{model}(my)")
ax.set_yscale("log")
## Save
plt.savefig(plot_dir / f"sc_all_train_times.{FORMAT}", bbox_inches='tight', pad_inches=0)