In [None]:
# std
import os
import sys
import inspect
import time
import pathlib
from math import sqrt
from math import log2
# packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline

# packages
from matplotlib.colors import ListedColormap

# Make the repository root importable so the `common` package resolves.
# The notebook's working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a kernel that call returns
# the cell's compile-time filename, which newer ipykernel versions place in a
# temporary directory, so the parent directory would no longer be the project.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from common.DataParser import parse_superconductivity
from common.model_trainer_reg import *
from common.regression_plotfunctions import *


In [None]:
def load_glob(glob, concat=True):
    """Read every CSV file yielded by ``glob`` with ``pd.read_csv``.

    Parameters
    ----------
    glob : iterable of path-like
        Paths of the CSV files to read, e.g. the generator returned by
        ``pathlib.Path.glob``. The iterable is materialised exactly once, so
        one-shot generators are supported.
    concat : bool, default True
        If true, return a single concatenated DataFrame; otherwise return
        the individual frames together with the paths they were read from.

    Returns
    -------
    pandas.DataFrame
        When ``concat`` is true.
    tuple of (list of pandas.DataFrame, list of path-like)
        When ``concat`` is false; both lists are in the iteration order of
        ``glob``.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``concat`` is true and ``glob`` yields
        no files.
    """
    # A generator can only be consumed once; iterating it a second time to
    # collect the names would always produce an empty list.
    files = list(glob)
    frames = [pd.read_csv(file) for file in files]
    if concat:
        return pd.concat(frames)
    return frames, files

In [None]:
# Get the data: cross-validation result CSVs are expected under ./out/<model>/
cwd = pathlib.Path(os.getcwd())
data_subdir = cwd / "out"
plot_dir = cwd / "out" / "plots"

# `project` maps a model name to {"path": <result directory>, "data": ...}.
# "data" is a dict of DataFrames keyed by implementation ("my", "sklearn"),
# or an error string when the results could not be loaded.
project = {
    "sub_keys": ["path", "data"], 
    "KNN": {"path": "a/path", "data": pd.DataFrame()}
}

for name in ["KNN", "RF", "SGD"]:
    data = data_subdir / name
    try:
        data_dict = {}
        try:
            data_dict["my"] = load_glob(data.glob("my_CV_*.csv"), concat=True)
        except Exception:
            # Results of the own implementation are optional; fall back to
            # the sklearn results only. `Exception` (rather than a bare
            # `except`) lets KeyboardInterrupt/SystemExit propagate.
            pass
        # The sklearn results are required: a failure here is recorded below.
        data_dict["sklearn"] = load_glob(data.glob("sklearn_CV_*.csv"), concat=True)
        project[name] = {
            "path": data,
            "data": data_dict
            }
    except Exception as e:
        project[name] = {
            "path": data,
            "data": f"couldn't load because of error:\n{e}"
            }

project

In [None]:
def print_styling(figsize=(14,8)):
    """Set the matplotlib rc defaults used for the figures in this notebook.

    Parameters
    ----------
    figsize : tuple, default (14, 8)
        Value assigned to the ``figsize`` entry of the ``figure`` rc group.
    """
    small, medium, large = 15, 18, 26

    # One entry per rc group; each dict is forwarded to plt.rc as keywords.
    rc_settings = (
        ('figure', {'figsize': figsize, 'titlesize': large}),  # figure size and figure title
        ('font', {'size': small}),                             # default text size
        ('lines', {'linewidth': 2}),
        ('xtick', {'labelsize': small}),                       # tick labels
        ('ytick', {'labelsize': small}),
        ('axes', {'titlesize': medium, 'labelsize': medium}),  # axes title and x/y labels
        ('legend', {'fontsize': small}),
    )
    for group, options in rc_settings:
        plt.rc(group, **options)

In [None]:

def plot_CV_with_Std(df, y = "R2_score", regressor = "sklearn", titel = "R2-Score", SaveName = False):
    """Plot the per-``k`` mean of ``df[y]`` with a +/- one standard deviation band.

    The curve is drawn with the pyplot state machine, i.e. on the current
    axes, so consecutive calls overlay their curves in the same figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``"k"`` and the column named by ``y``.
    y : str, default "R2_score"
        Column whose mean and standard deviation are plotted; also used as
        the y-axis label.
    regressor : str, default "sklearn"
        Legend label of the curve.
    titel : str, default "R2-Score"
        Plot title.
    SaveName : str or path-like or False, default False
        If truthy, the figure is written to this location with
        ``plt.savefig``.
    """
    # Aggregate only over the k values that actually occur. Assuming a
    # contiguous range 1..max(k) produces empty slices (NaN plus a
    # RuntimeWarning) for every missing k and fails for a non-integer column.
    per_k = df.groupby("k")[y]
    means = per_k.mean()
    list_k = means.index.to_numpy()
    mean_list = means.to_numpy()
    # ddof=0 keeps the population standard deviation that np.std computes.
    std_list = per_k.std(ddof=0).to_numpy()

    plt.plot(list_k, mean_list, '-', label = regressor)
    plt.fill_between(list_k, mean_list - std_list, mean_list + std_list, alpha=0.2)
    # Explicit True: a bare plt.grid() toggles, which would switch the grid
    # off again when a second curve is drawn on the same axes.
    plt.grid(True)
    plt.legend()
    plt.xlabel("k-splits")
    plt.title(titel)
    plt.ylabel(y)
    if SaveName:
        plt.savefig(SaveName)

In [None]:
# Random-forest cross-validation results of the sklearn implementation.
rf_results = project["RF"]["data"]
if isinstance(rf_results, str):
    # The loading cell stores an error message instead of a dict when the
    # CSV files could not be read; surface it instead of failing on the
    # string index below with an unrelated TypeError.
    raise RuntimeError(rf_results)
df = rf_results["sklearn"]
display(df)
print_styling()
plot_CV_with_Std(df)