In [None]:
from niapy.algorithms.basic import (
    BatAlgorithm,
    ParticleSwarmAlgorithm,
    ParticleSwarmOptimization,
)
from tools.algorithms.fa import FireflyAlgorithm
from niapy.problems.schwefel import Schwefel
from sklearn.preprocessing import StandardScaler

import sklearn
from scipy import spatial, stats
import torch
from torch import nn
from PIL import Image
from matplotlib import pyplot as plt
import os
import json
import pygad
import numpy as np
import pandas as pd
import seaborn
from tools.ml_tools import get_data_loaders, nn_test, nn_train, NNType, LSTMClassifier, LinearClassifier
from util.optimization_data import SingleRunData
from util.pop_diversity_metrics import PopDiversityMetric
from tools.optimization_tools import optimization_runner
from tools.metaheuristic_similarity_analyzer import MetaheuristicSimilarityAnalyzer

from util.constants import (
    RNG_SEED,
    BATCH_SIZE,
    DATASET_PATH,
    EPOCHS,
    POP_SIZE,
    MAX_EVALS,
    NUM_RUNS,
    OPTIMIZATION_PROBLEM,
    META_GA_CROSSOVER_PROBABILITY,
    META_GA_CROSSOVER_TYPE,
    META_GA_GENERATIONS,
    META_GA_K_TOURNAMENT,
    META_GA_KEEP_ELITISM,
    META_GA_MUTATION_NUM_GENES,
    META_GA_MUTATION_TYPE,
    META_GA_PARENT_SELECTION_TYPE,
    META_GA_PERCENT_PARENTS_MATING,
    META_GA_SOLUTIONS_PER_POP,
    GENE_SPACES,
    TARGET_GENE_SPACES,
    POP_DIVERSITY_METRICS,
    INDIV_DIVERSITY_METRICS,
    N_PCA_COMPONENTS,
    LSTM_NUM_LAYERS,
    LSTM_HIDDEN_DIM,
    LSTM_DROPOUT,
    VAL_SIZE,
    TEST_SIZE,
)

# Archived experiment that the analysis cells below read from.
BASE_PATH = "./archive/target_performance_similarity/01-29_15.27.22_WVCPSO_Schwefel"
# Root directory that holds the "<i>_subset" folders.
_DATASET_PATH = BASE_PATH + "/dataset"
# NOTE: this rebinds the DATASET_PATH imported from util.constants to the first subset.
DATASET_PATH = _DATASET_PATH + "/0_subset"
# Location of the pickled MetaheuristicSimilarityAnalyzer object.
MSA_PATH = BASE_PATH + "/msa_obj"

# Only algorithm directories with these names are included in the comparison plots.
algorithms_to_plot = ["FA", "WVCPSO", "PSO", "BA"]

execute_training = True
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print("CPUs: ", os.cpu_count())

### Fitness values by subset

In [None]:
# Best-fitness summary per subset. For every "<i>_subset" directory the best
# fitness of each run is collected per algorithm and summarised (mean / min /
# std). The two samples are then checked with Shapiro-Wilk and Levene and
# compared with a Wilcoxon signed-rank test. Rows are printed as "a & b & c".
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

mean_fitness_1 = []
min_fitness_1 = []
std_fitness_1 = []

mean_fitness_2 = []
min_fitness_2 = []
std_fitness_2 = []

wilcoxon = []

al1 = ""
al2 = ""

# Subsets are addressed by index ("0_subset", "1_subset", ...); only the number
# of entries in dataset_path is taken from the directory listing.
for subset_idx in range(len(subsets)):
    best_al1 = []
    best_al2 = []
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)
    # os.listdir gives no ordering guarantee, so sort to make the mapping of
    # algorithms onto slot 1 / slot 2 reproducible. A dedicated index name keeps
    # the subset counter from being overwritten by the inner loop.
    for alg_idx, algorithm in enumerate(sorted(os.listdir(subset_path))):
        best_fitness_values = []
        algorithm_path = os.path.join(subset_path, algorithm)
        for problem in sorted(os.listdir(algorithm_path)):
            runs = os.listdir(os.path.join(algorithm_path, problem))
            # Sorted run files give both algorithms the same run order, so the
            # i-th values of both samples are paired in the Wilcoxon test.
            runs.sort()
            for run in runs:
                run_path = os.path.join(algorithm_path, problem, run)
                srd = SingleRunData.import_from_json(run_path)
                best_fitness_values.append(srd.best_fitness)
        if alg_idx == 0:
            al1 = algorithm
            mean_fitness_1.append(round(np.mean(best_fitness_values), 2))
            min_fitness_1.append(round(np.amin(best_fitness_values), 2))
            std_fitness_1.append(round(np.std(best_fitness_values), 2))
            best_al1 = best_fitness_values
        else:
            # Every algorithm after the first lands in slot 2.
            al2 = algorithm
            mean_fitness_2.append(round(np.mean(best_fitness_values), 2))
            min_fitness_2.append(round(np.amin(best_fitness_values), 2))
            std_fitness_2.append(round(np.std(best_fitness_values), 2))
            best_al2 = best_fitness_values

    # Normality of both samples first, then equality of variances.
    shapiro_1 = stats.shapiro(best_al1)
    shapiro_2 = stats.shapiro(best_al2)
    if shapiro_1[1] > 0.05 and shapiro_2[1] > 0.05:
        stats_levene = stats.levene(best_al1, best_al2)
        if stats_levene[1] > 0.05:
            print("obe normalno, varianci enaki")
        else:
            print("obe normalno, različni")
    else:
        print("ne")

    # Keep only the p-value of the signed-rank test.
    data = stats.wilcoxon(best_al1, best_al2)
    wilcoxon.append(data[1])

# Loop targets are deliberately not called min / mean / std: in a notebook they
# would rebind the builtin `min` for every cell executed afterwards.
for group_idx, (name, mins, means, stds) in enumerate((
    (al1, min_fitness_1, mean_fitness_1, std_fitness_1),
    (al2, min_fitness_2, mean_fitness_2, std_fitness_2),
)):
    if group_idx == 0:
        print(name)
    else:
        print("\n", name)
    for min_val, mean_val, std_val in zip(mins, means, stds):
        print(f"{min_val} & {mean_val} & {std_val}")

    # Aggregates over all subsets: min, mean, max and std of each column.
    print(f"\n & {round(np.min(mins), 2)} & {round(np.min(means), 2)} & {round(np.min(stds), 2)}")
    print(f" & {round(np.mean(mins), 2)} & {round(np.mean(means), 2)} & {round(np.mean(stds), 2)}")
    print(f" & {round(np.max(mins), 2)} & {round(np.max(means), 2)} & {round(np.max(stds), 2)}")
    print(f" & {round(np.std(mins), 2)} & {round(np.std(means), 2)} & {round(np.std(stds), 2)}")

# One verdict per subset at the 5 % significance level.
for w in wilcoxon:
    if w < 0.05:
        print("Različna")
    else:
        print("Enaka")

In [None]:
# LaTeX-ready best-fitness table per subset: same statistics as the plain
# summary, with the smallest value of every column wrapped in \textbf{...}.
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

mean_fitness_1 = []
min_fitness_1 = []
std_fitness_1 = []

mean_fitness_2 = []
min_fitness_2 = []
std_fitness_2 = []

wilcoxon = []

al1 = ""
al2 = ""

# Subsets are addressed by index ("0_subset", "1_subset", ...).
for subset_idx in range(len(subsets)):
    best_al1 = []
    best_al2 = []
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)
    # Sorted listing makes the slot 1 / slot 2 assignment reproducible, since
    # os.listdir order is arbitrary. The inner index has its own name so it
    # does not overwrite the subset counter.
    for alg_idx, algorithm in enumerate(sorted(os.listdir(subset_path))):
        best_fitness_values = []
        algorithm_path = os.path.join(subset_path, algorithm)
        for problem in sorted(os.listdir(algorithm_path)):
            runs = os.listdir(os.path.join(algorithm_path, problem))
            runs.sort()
            for run in runs:
                run_path = os.path.join(algorithm_path, problem, run)
                srd = SingleRunData.import_from_json(run_path)
                best_fitness_values.append(srd.best_fitness)
        if alg_idx == 0:
            al1 = algorithm
            mean_fitness_1.append(round(np.mean(best_fitness_values), 2))
            min_fitness_1.append(round(np.amin(best_fitness_values), 2))
            std_fitness_1.append(round(np.std(best_fitness_values), 2))
            best_al1 = best_fitness_values
        else:
            # Every algorithm after the first lands in slot 2.
            al2 = algorithm
            mean_fitness_2.append(round(np.mean(best_fitness_values), 2))
            min_fitness_2.append(round(np.amin(best_fitness_values), 2))
            std_fitness_2.append(round(np.std(best_fitness_values), 2))
            best_al2 = best_fitness_values


# Loop targets avoid the names min / mean / std so the builtin `min` is not
# rebound for the rest of the kernel session.
for group_idx, (name, mins, means, stds) in enumerate((
    (al1, min_fitness_1, mean_fitness_1, std_fitness_1),
    (al2, min_fitness_2, mean_fitness_2, std_fitness_2),
)):
    if group_idx == 0:
        print(name)
    else:
        print("\n", name)

    # Column minima are loop-invariant, so compute them once per group.
    best_min = np.min(mins)
    best_mean = np.min(means)
    best_std = np.min(stds)
    for min_val, mean_val, std_val in zip(mins, means, stds):
        cells = [
            ("\\textbf{" + f"{value}" + "}") if value == best else f"{value}"
            for value, best in ((min_val, best_min), (mean_val, best_mean), (std_val, best_std))
        ]
        print(" & ".join(cells))

    # Aggregates over all subsets: min, mean, max and std of each column.
    print(f"\n & {round(np.min(mins), 2)} & {round(np.min(means), 2)} & {round(np.min(stds), 2)}")
    print(f" & {round(np.mean(mins), 2)} & {round(np.mean(means), 2)} & {round(np.mean(stds), 2)}")
    print(f" & {round(np.max(mins), 2)} & {round(np.max(means), 2)} & {round(np.max(stds), 2)}")
    print(f" & {round(np.std(mins), 2)} & {round(np.std(means), 2)} & {round(np.std(stds), 2)}")



In [None]:
# Load the stored analyzer, print its info and similarity, list every optimized
# and target solution, then print column-wise min / avg / max / std rows.
analyzer = MetaheuristicSimilarityAnalyzer.import_from_pkl(MSA_PATH)
analyzer.msa_info()
print(analyzer.similarity, "\n")

for position, optimized_solution in enumerate(analyzer.optimized_solutions):
    print(f"{position} {np.round(optimized_solution, 2)}")
print("")
for position, target_solution in enumerate(analyzer.target_solutions):
    print(f"{position} {np.round(target_solution, 2)}")


# Each row: target columns first, then optimized columns, joined with " & ".
target_matrix = np.array(analyzer.target_solutions)
optimized_matrix = np.array(analyzer.optimized_solutions)
for row_label, reduce_fn in (("min: ", np.min), ("avg: ", np.mean), ("max: ", np.max), ("std: ", np.std)):
    target_part = " & ".join(map(str, np.round(reduce_fn(target_matrix, axis=0), 2)))
    optimized_part = " & ".join(map(str, np.round(reduce_fn(optimized_matrix, axis=0), 2)))
    print(row_label, target_part, "&", optimized_part)

In [None]:
# Print one "t1 & t2 & ... & o1 & o2 & ..." row per (target, optimized) pair,
# followed by column-wise min / avg / max / std rows.
analyzer = MetaheuristicSimilarityAnalyzer.import_from_pkl(MSA_PATH)

for target, optimized in zip(analyzer.target_solutions, analyzer.optimized_solutions):
    # Every target value is followed by a separator; optimized values are only
    # separated from each other, so the row has no trailing " & ".
    target_cells = "".join(f"{round(t, 2)} & " for t in target)
    optimized_cells = " & ".join(f"{round(o, 2)}" for o in optimized)
    print(target_cells + optimized_cells)

print("")
target_matrix = np.array(analyzer.target_solutions)
optimized_matrix = np.array(analyzer.optimized_solutions)
for row_label, reduce_fn in (("min: ", np.min), ("avg: ", np.mean), ("max: ", np.max), ("std: ", np.std)):
    target_part = " & ".join(map(str, np.round(reduce_fn(target_matrix, axis=0), 2)))
    optimized_part = " & ".join(map(str, np.round(reduce_fn(optimized_matrix, axis=0), 2)))
    print(row_label, target_part, "&", optimized_part)

### Optimization

In [None]:
# Load a saved pygad instance, draw its gene and new-solution-rate plots, and
# print the last recorded best solution together with its fitness.
# NOTE(review): this path uses a different archive timestamp than BASE_PATH - confirm it is the intended run.
meta_ga_obj_path = "./archive/target_performance_similarity/09-17_15.37.29_WVCPSO_Schwefel/1_FA_Schwefel/meta_ga_obj"
ga_instance_0 = pygad.load(meta_ga_obj_path)
for solutions_kind in ("best", "all"):
    ga_instance_0.plot_genes(solutions=solutions_kind)
ga_instance_0.plot_new_solution_rate()

last_best_solution = ga_instance_0.best_solutions[-1]
last_best_fitness = ga_instance_0.best_solutions_fitness[-1]
print(last_best_solution)
print(last_best_fitness)

In [None]:
# Run every configured algorithm once on the selected problem and store the
# run data under ./dataset.
use_test_setting = True

problem = OPTIMIZATION_PROBLEM
# Always rebind `algorithms`, so that with use_test_setting = False the loop
# below neither raises NameError on a fresh kernel nor silently reuses a list
# left over from an earlier execution.
algorithms = []

if use_test_setting:
    problem = Schwefel(dimension=20)
    # Alternative parameterisations are kept commented out for quick switching.
    algorithms = [
        #FireflyAlgorithm(population_size=POP_SIZE, alpha=0.15, beta0=0.4, gamma=0.04, theta=0.98),
        FireflyAlgorithm(population_size=POP_SIZE, alpha=0.84464886, beta0=0.74171366, gamma=0.60686203, theta=0.97758844),
        #FireflyAlgorithm(population_size=POP_SIZE, alpha=0.02, beta0=0.43, gamma=0.693, theta=0.962),
        #ParticleSwarmAlgorithm(population_size=POP_SIZE, c1=1.14, c2=0.05, w=0.54),
        #ParticleSwarmAlgorithm(population_size=POP_SIZE, c1=2.00417841, c2=0.70674774, w=0.82266951),
        #ParticleSwarmAlgorithm(population_size=POP_SIZE, c1=1.95, c2=0.82, w=0.82),
        #BatAlgorithm(population_size=POP_SIZE, loudness=1.0, pulse_rate=1.0, alpha=0.99, gamma=0.1)
        #tools.algorithms.pso.ParticleSwarmAlgorithm(population_size=POP_SIZE, c1=1.95, c2=0.82, w=0.82),
    ]

if not algorithms:
    # Make the no-op visible instead of finishing the cell silently.
    print("No algorithms configured - nothing to run.")

for algorithm in algorithms:
    optimization_runner(
        algorithm=algorithm,
        problem=problem,
        runs=1,
        dataset_path="./dataset",
        pop_diversity_metrics=POP_DIVERSITY_METRICS,
        indiv_diversity_metrics=INDIV_DIVERSITY_METRICS,
        max_evals=MAX_EVALS,
        run_index_seed=True,
        keep_pop_data=False,
        parallel_processing=True,
    )

### Population diversity metrics comparison

In [None]:
# Plot the standard-scaled population diversity metrics of the first run (by
# sorted file name) for every algorithm / problem pair in DATASET_PATH.
pop_metrics_list = [
    PopDiversityMetric.PDC,
    PopDiversityMetric.FDC,
    PopDiversityMetric.PFSD,
    PopDiversityMetric.PFM,
]

for algorithm in os.listdir(DATASET_PATH):
    for problem in os.listdir(os.path.join(DATASET_PATH, algorithm)):
        runs = os.listdir(os.path.join(DATASET_PATH, algorithm, problem))
        if not runs:
            # Nothing recorded for this pair; runs[0] would raise IndexError.
            continue
        runs.sort()
        run_path = os.path.join(DATASET_PATH, algorithm, problem, runs[0])
        # Import the run once and reuse the object instead of parsing the same
        # JSON file a second time.
        srd = SingleRunData.import_from_json(run_path)
        pop_metrics = srd.get_pop_diversity_metrics_values(metrics=pop_metrics_list, minmax_scale=False, standard_scale=True)
        ax = pop_metrics.plot(figsize=(15,7), fontsize=19, logy=False)
        ax.set_title(label=" ".join([f"Populacijske metrike raznolikosti - {algorithm}", problem]), fontdict={'fontsize':22}, pad=15)
        ax.set_xlabel(xlabel="Iteracija", fontdict={'fontsize':19}, labelpad=10)
        ax.set_ylabel(ylabel="Vrednost", fontdict={'fontsize':19}, labelpad=10)
        ax.legend(fontsize=15)

In [None]:
# Compare population diversity metrics of the first run of every plotted
# algorithm, one log-scale figure per problem.
line_styles = ['-g', ':g', '--g', '-.g', '-b', ':b', '--b', '-.b']
# Alternative palette, currently not applied.
_line_styles = ['-g', '-b', '-r', '-k', ':g', ':b', ':r', ':k']
style = {}
pop_metrics_list = [
    PopDiversityMetric.PDC,
    PopDiversityMetric.FDC,
    PopDiversityMetric.PFSD,
    PopDiversityMetric.PFM,
]
style_idx = 0
for algorithm in os.listdir(DATASET_PATH):
    if algorithm not in algorithms_to_plot:
        continue
    # At most the first four metrics receive a dedicated style.
    for metric in pop_metrics_list[:4]:
        # The palette has 8 entries (two algorithms x four metrics); wrap around
        # instead of raising IndexError when more algorithms are plotted.
        style['_'.join([algorithm, metric.value])] = line_styles[style_idx % len(line_styles)]
        style_idx += 1

# problem -> {"<algorithm>_<metric>": series of values}
metrics_by_problem = {}
for algorithm in os.listdir(DATASET_PATH):
    if algorithm not in algorithms_to_plot:
        continue
    for problem in os.listdir(os.path.join(DATASET_PATH, algorithm)):
        runs = os.listdir(os.path.join(DATASET_PATH, algorithm, problem))
        runs.sort()
        run_path = os.path.join(DATASET_PATH, algorithm, problem, runs[0])
        run = SingleRunData.import_from_json(run_path)
        pop_metrics = run.get_pop_diversity_metrics_values(metrics=pop_metrics_list, minmax_scale=False)

        for metric in pop_metrics_list:
            if metric.value not in pop_metrics:
                continue
            key = '_'.join([algorithm, metric.value])
            values = pop_metrics.get(metric.value).to_list()

            # scale fdc to [0, 1] for easier comparison on logy scale
            if metric == PopDiversityMetric.FDC:
                values = sklearn.preprocessing.minmax_scale(values, feature_range=(0, 1))

            metrics_by_problem.setdefault(problem, {})[key] = values

for problem, metrics in metrics_by_problem.items():
    df_metrics = pd.DataFrame.from_dict(metrics)
    ax = df_metrics.plot(style=style, figsize=(25, 7), logy=True, fontsize=15)
    ax.legend(fontsize=15)
    ax.set_title(label=problem, fontdict={'fontsize':24})
    ax.set_xlabel(xlabel="Iterations", fontdict={'fontsize':20})

#### All subsets

In [None]:
# Population diversity metrics of the first run of every plotted algorithm,
# repeated for each "<i>_subset" directory (one log-scale figure per problem).
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

# Subsets are addressed by index; the counter has its own name so that no inner
# loop overwrites it.
for subset_idx in range(len(subsets)):
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)

    line_styles = ['-g', ':g', '--g', '-.g', '-b', ':b', '--b', '-.b']
    # Alternative palette, currently not applied.
    _line_styles = ['-g', '-b', '-r', '-k', ':g', ':b', ':r', ':k']
    style = {}
    pop_metrics_list = [
        PopDiversityMetric.PDC,
        PopDiversityMetric.FDC,
        PopDiversityMetric.PFSD,
        PopDiversityMetric.PFM,
    ]
    style_idx = 0
    for algorithm in os.listdir(subset_path):
        if algorithm not in algorithms_to_plot:
            continue
        # At most the first four metrics receive a dedicated style.
        for metric in pop_metrics_list[:4]:
            # Wrap around the 8-entry palette instead of raising IndexError when
            # more than two algorithms are plotted.
            style['_'.join([algorithm, metric.value])] = line_styles[style_idx % len(line_styles)]
            style_idx += 1

    # problem -> {"<algorithm>_<metric>": series of values}
    metrics_by_problem = {}
    for algorithm in os.listdir(subset_path):
        if algorithm not in algorithms_to_plot:
            continue
        for problem in os.listdir(os.path.join(subset_path, algorithm)):
            runs = os.listdir(os.path.join(subset_path, algorithm, problem))
            runs.sort()
            run_path = os.path.join(subset_path, algorithm, problem, runs[0])
            run = SingleRunData.import_from_json(run_path)
            pop_metrics = run.get_pop_diversity_metrics_values(metrics=pop_metrics_list, minmax_scale=False)

            for metric in pop_metrics_list:
                if metric.value not in pop_metrics:
                    continue
                key = '_'.join([algorithm, metric.value])
                values = pop_metrics.get(metric.value).to_list()

                # scale fdc to [0, 1] for easier comparison on logy scale
                if metric == PopDiversityMetric.FDC:
                    values = sklearn.preprocessing.minmax_scale(values, feature_range=(0, 1))

                metrics_by_problem.setdefault(problem, {})[key] = values

    for problem, metrics in metrics_by_problem.items():
        df_metrics = pd.DataFrame.from_dict(metrics)
        ax = df_metrics.plot(style=style, figsize=(25, 7), logy=True, fontsize=15)
        ax.legend(fontsize=15)
        ax.set_title(label=subset, fontdict={'fontsize':24})
        ax.set_xlabel(xlabel="Iterations", fontdict={'fontsize':20})

### Average population diversity metrics for all runs by subset

In [None]:
# Population diversity metrics averaged element-wise over all runs, plotted per
# subset (one log-scale figure per problem).
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

# Subsets are addressed by index; the counter has its own name so that no inner
# loop overwrites it.
for subset_idx in range(len(subsets)):
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)

    line_styles = ['-g', ':g', '--g', '-.g', '-b', ':b', '--b', '-.b']
    # Alternative palette, currently not applied.
    _line_styles = ['-g', '-b', '-r', '-k', ':g', ':b', ':r', ':k']
    style = {}
    pop_metrics_list = [
        PopDiversityMetric.PDC,
        PopDiversityMetric.FDC,
        PopDiversityMetric.PFSD,
        PopDiversityMetric.PFM,
    ]
    style_idx = 0
    for algorithm in os.listdir(subset_path):
        if algorithm not in algorithms_to_plot:
            continue
        # At most the first four metrics receive a dedicated style.
        for metric in pop_metrics_list[:4]:
            # Wrap around the 8-entry palette instead of raising IndexError when
            # more than two algorithms are plotted.
            style['_'.join([algorithm, metric.value])] = line_styles[style_idx % len(line_styles)]
            style_idx += 1

    # problem -> {"<algorithm>_<metric>": running average over runs}
    metrics_by_problem = {}
    for algorithm in os.listdir(subset_path):
        if algorithm not in algorithms_to_plot:
            continue
        for problem in os.listdir(os.path.join(subset_path, algorithm)):
            runs = os.listdir(os.path.join(subset_path, algorithm, problem))
            runs.sort()
            for run_file_name in runs:
                run_path = os.path.join(subset_path, algorithm, problem, run_file_name)
                run = SingleRunData.import_from_json(run_path)
                pop_metrics = run.get_pop_diversity_metrics_values(metrics=pop_metrics_list, minmax_scale=False)

                for metric in pop_metrics_list:
                    if metric.value not in pop_metrics:
                        continue
                    key = '_'.join([algorithm, metric.value])

                    # Each run contributes 1/len(runs) of its series, so the
                    # accumulated sum is the mean over runs.
                    contribution = pop_metrics.get(metric.value).to_numpy() / len(runs)
                    problem_metrics = metrics_by_problem.setdefault(problem, {})
                    if key in problem_metrics:
                        problem_metrics[key] = np.add(problem_metrics[key], contribution)
                    else:
                        problem_metrics[key] = contribution

                    # Disabled: scale fdc to [0, 1] for easier comparison on logy scale
                    # if metric == PopDiversityMetric.FDC:
                    #     metrics_by_problem[problem][key] = sklearn.preprocessing.minmax_scale(
                    #         metrics_by_problem[problem][key], feature_range=(0, 1)
                    #     )

    for problem, metrics in metrics_by_problem.items():
        df_metrics = pd.DataFrame.from_dict(metrics)
        ax = df_metrics.plot(style=style, figsize=(25, 7), logy=True, fontsize=15)
        ax.legend(fontsize=15)
        ax.set_title(label=subset, fontdict={'fontsize':24})
        ax.set_xlabel(xlabel="Iterations", fontdict={'fontsize':20})

### Individual diversity metrics comparison

In [None]:
# Box plots of the standard-scaled individual diversity metrics for the first
# run of WVCPSO in DATASET_PATH, one panel per metric.
metrics_by_problem = {}
for algorithm in os.listdir(DATASET_PATH):
    # Only WVCPSO is shown here (algorithms_to_plot is not used in this cell).
    if algorithm != "WVCPSO":
        continue
    algorithm_dir = os.path.join(DATASET_PATH, algorithm)
    for problem in os.listdir(algorithm_dir):
        run_files = sorted(os.listdir(os.path.join(algorithm_dir, problem)))
        first_run = SingleRunData.import_from_json(os.path.join(algorithm_dir, problem, run_files[0]))
        indiv_metrics = first_run.get_indiv_diversity_metrics_values(minmax_scale=False, standard_scale=True)
        for metric in INDIV_DIVERSITY_METRICS:
            column_key = '_'.join([algorithm, metric.value])
            metrics_by_problem.setdefault(problem, {})[column_key] = indiv_metrics.get(metric.value).to_list()


for problem, metrics in metrics_by_problem.items():
    df_metrics = pd.DataFrame.from_dict(metrics)

    fig, axes = plt.subplots(1, len(INDIV_DIVERSITY_METRICS))
    fig.subplots_adjust(wspace=0.7, top=0.825, bottom=0)
    # The title is fixed text rather than the problem name.
    fig.suptitle("Individualne metrike raznolikosti - Schwefel", fontsize=22)

    for panel, metric in enumerate(INDIV_DIVERSITY_METRICS):
        # Select this metric's columns and relabel the WVCPSO column as "PSO".
        df_metric = df_metrics.filter(regex=metric.value)
        df_metric.columns = df_metric.columns.str.replace("WVCPSO_" + metric.value, 'PSO')
        ax = df_metric.plot(ax=axes[panel], kind="box", figsize=(15, 5), logy=False, fontsize=19)
        ax.margins(x=0)
        ax.set_title(label=metric.name, fontdict={'fontsize':19}, pad=10)

In [None]:
# Box plots of the standard-scaled individual diversity metrics for the first
# run of every plotted algorithm, repeated for each "<i>_subset" directory.
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

# Subsets are addressed by index; the counter has its own name so the panel
# index of the plotting loop below does not overwrite it.
for subset_idx in range(len(subsets)):
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)

    # problem -> {"<algorithm>_<metric>": list of values}
    metrics_by_problem = {}
    for algorithm in os.listdir(subset_path):
        if algorithm not in algorithms_to_plot:
            continue
        for problem in os.listdir(os.path.join(subset_path, algorithm)):
            runs = os.listdir(os.path.join(subset_path, algorithm, problem))
            runs.sort()
            run_path = os.path.join(subset_path, algorithm, problem, runs[0])
            run = SingleRunData.import_from_json(run_path)
            indiv_metrics = run.get_indiv_diversity_metrics_values(minmax_scale=False, standard_scale=True)
            for metric in INDIV_DIVERSITY_METRICS:
                key = '_'.join([algorithm, metric.value])
                metrics_by_problem.setdefault(problem, {})[key] = indiv_metrics.get(metric.value).to_list()


    for problem, metrics in metrics_by_problem.items():
        df_metrics = pd.DataFrame.from_dict(metrics)

        fig, axes = plt.subplots(1, len(INDIV_DIVERSITY_METRICS))
        fig.suptitle(f"{problem} {subset}", fontsize=23)
        fig.subplots_adjust(wspace=0.4)

        for panel_idx, metric in enumerate(INDIV_DIVERSITY_METRICS):
            # Select this metric's columns and strip the metric suffix, leaving
            # the algorithm name as the box label.
            df_metric = df_metrics.filter(regex=metric.value)
            df_metric.columns = df_metric.columns.str.replace('_'+metric.value, '')
            ax = df_metric.plot(ax=axes[panel_idx], kind="box", figsize=(25, 6), logy=False, fontsize=15)
            ax.set_title(label=metric.name, fontdict={'fontsize':20})

### Average of individual diversity metrics for all runs by subset

In [None]:
# Individual diversity metrics averaged element-wise over all runs, shown as
# log-scale box plots per subset (one panel per metric).
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

# Subsets are addressed by index; the counter has its own name so the panel
# index of the plotting loop below does not overwrite it.
for subset_idx in range(len(subsets)):
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)

    # problem -> {"<algorithm>_<metric>": running average over runs}
    metrics_by_problem = {}
    for algorithm in os.listdir(subset_path):
        if algorithm not in algorithms_to_plot:
            continue
        for problem in os.listdir(os.path.join(subset_path, algorithm)):
            runs = os.listdir(os.path.join(subset_path, algorithm, problem))
            runs.sort()
            for run_file_name in runs:
                # Load the run of the current iteration. Loading runs[0] here
                # would average the first run with itself len(runs) times.
                run_path = os.path.join(subset_path, algorithm, problem, run_file_name)
                run = SingleRunData.import_from_json(run_path)
                indiv_metrics = run.get_indiv_diversity_metrics_values(minmax_scale=False)
                for metric in INDIV_DIVERSITY_METRICS:
                    key = '_'.join([algorithm, metric.value])
                    # Each run contributes 1/len(runs) of its values, so the
                    # accumulated sum is the mean over runs.
                    contribution = indiv_metrics.get(metric.value).to_numpy() / len(runs)
                    problem_metrics = metrics_by_problem.setdefault(problem, {})
                    if key in problem_metrics:
                        problem_metrics[key] = np.add(problem_metrics[key], contribution)
                    else:
                        problem_metrics[key] = contribution


    for problem, metrics in metrics_by_problem.items():
        df_metrics = pd.DataFrame.from_dict(metrics)

        fig, axes = plt.subplots(1, len(INDIV_DIVERSITY_METRICS))
        fig.suptitle(f"{problem} {subset}", fontsize=23)
        fig.subplots_adjust(wspace=0.4)

        for panel_idx, metric in enumerate(INDIV_DIVERSITY_METRICS):
            # Select this metric's columns and strip the metric suffix, leaving
            # the algorithm name as the box label.
            df_metric = df_metrics.filter(regex=metric.value)
            df_metric.columns = df_metric.columns.str.replace('_'+metric.value, '')
            ax = df_metric.plot(ax=axes[panel_idx], kind="box", figsize=(25, 6), logy=True, fontsize=15)
            ax.set_title(label=metric.name, fontdict={'fontsize':20})

### 1-SMAPE & feature vector comparison (normalized and pairwise normalized)

In [None]:
# Per subset, three panels:
#   [0] running mean of the mean 1-SMAPE similarity between paired runs,
#   [1] cosine similarity of feature vectors (pairwise mean vs. mean vectors),
#   [2] Spearman correlation of feature vectors (pairwise mean vs. mean vectors),
# each as a function of how many runs are included.
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

# Subsets are addressed by index; the counter has its own name so the algorithm
# index further down does not overwrite it.
for subset_idx in range(len(subsets)):
    subset = f"{subset_idx}_subset"
    subset_path = os.path.join(dataset_path, subset)
    #------------------------------------------------------------------------------------
    #   Diversity metrics 1-SMAPE
    #------------------------------------------------------------------------------------
    al1 = "WVCPSO"
    al2 = "FA"
    problem = "Schwefel"

    first_runs = os.listdir(os.path.join(subset_path, al1, problem))
    second_runs = os.listdir(os.path.join(subset_path, al2, problem))

    # Sorted file names pair the i-th run of both algorithms.
    first_runs.sort()
    second_runs.sort()

    mean_similarities = []
    max_similarities = []
    mean_similarity_values = []
    max_similarity_values = []

    for fr, sr in zip(first_runs, second_runs):
        first_run_path = os.path.join(subset_path, al1, problem, fr)
        second_run_path = os.path.join(subset_path, al2, problem, sr)
        f_srd = SingleRunData.import_from_json(first_run_path)
        s_srd = SingleRunData.import_from_json(second_run_path)

        similarity = f_srd.get_diversity_metrics_similarity(s_srd, get_raw_values=True)
        # Running averages over the run pairs processed so far.
        mean_similarity_values.append(np.mean(similarity))
        mean_similarities.append(np.mean(mean_similarity_values))
        max_similarity_values.append(np.max(similarity))
        max_similarities.append(np.mean(max_similarity_values))

    fig, ax = plt.subplots(1, 3, figsize=(25, 5))
    ax[0].plot(np.arange(0, len(mean_similarities)), mean_similarities)


    #------------------------------------------------------------------------------------
    #   Pairwise feature vector comparison
    #------------------------------------------------------------------------------------

    feature_vectors_1 = []
    feature_vectors_2 = []
    # The first listed algorithm fills feature_vectors_1, all others
    # feature_vectors_2. The inner loop uses its own name so the `problem`
    # constant above stays intact.
    for alg_idx, algorithm in enumerate(os.listdir(subset_path)):
        for problem_name in os.listdir(os.path.join(subset_path, algorithm)):
            runs = os.listdir(os.path.join(subset_path, algorithm, problem_name))
            runs.sort()
            for run in runs:
                run_path = os.path.join(subset_path, algorithm, problem_name, run)
                srd = SingleRunData.import_from_json(run_path)

                feature_vector = srd.get_feature_vector()

                if alg_idx == 0:
                    feature_vectors_1.append(feature_vector)
                else:
                    feature_vectors_2.append(feature_vector)

    # Pairwise values do not depend on how many vectors are included, so they
    # are computed once and averaged over growing prefixes below.
    pairwise_cosine = []
    pairwise_spearman = []
    for feature_vector1, feature_vector2 in zip(feature_vectors_1, feature_vectors_2):
        pairwise_cosine.append(1 - spatial.distance.cosine(feature_vector1, feature_vector2))
        r, p = stats.spearmanr(feature_vector1, feature_vector2)
        pairwise_spearman.append(r)

    # similarity plot for variable number of vectors
    # NOTE(review): the range stops at len - 1, so the point that uses all runs
    # is not plotted - confirm whether that is intended.
    mean_similarities = []
    mean_pairwise_similarities = []
    mean_spearman = []
    mean_pairwise_spearman = []
    for i in range(2, len(feature_vectors_1)):
        mean_vector_1 = np.mean(feature_vectors_1[:i], axis=0)
        mean_vector_2 = np.mean(feature_vectors_2[:i], axis=0)

        # cosine
        mean_similarities.append(1 - spatial.distance.cosine(mean_vector_1, mean_vector_2))
        mean_pairwise_similarities.append(np.mean(pairwise_cosine[:i]))

        # Spearman
        r, p = stats.spearmanr(mean_vector_1, mean_vector_2)
        mean_spearman.append(r)
        mean_pairwise_spearman.append(np.mean(pairwise_spearman[:i]))

    ax[1].plot(np.arange(0, len(mean_pairwise_similarities)), mean_pairwise_similarities, label="pairwise cosine")
    ax[1].plot(np.arange(0, len(mean_similarities)), mean_similarities, label="mean cosine")
    ax[2].plot(np.arange(0, len(mean_pairwise_spearman)), mean_pairwise_spearman, label="pairwise Spearman")
    ax[2].plot(np.arange(0, len(mean_spearman)), mean_spearman, label="mean Spearman")
    ax[1].legend()
    ax[2].legend()

    fig.suptitle(subset)
    plt.show()



### 1-SMAPE values plot

In [None]:
# Violin plot of the raw 1-SMAPE similarity values between paired WVCPSO and FA
# runs, one figure per subset and one violin per diversity metric.
dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

# x axis labels: population metrics first, then individual metrics. They do not
# depend on the subset, so they are built once.
labels = [metric.value for metric in POP_DIVERSITY_METRICS]
labels += [metric.value for metric in INDIV_DIVERSITY_METRICS]

for idx in range(len(subsets)):
    subset = f"{idx}_subset"
    al1 = "WVCPSO"
    al2 = "FA"
    problem = "Schwefel"

    first_runs = os.listdir(os.path.join(dataset_path, subset, al1, problem))
    second_runs = os.listdir(os.path.join(dataset_path, subset, al2, problem))

    # Sorted file names pair the i-th run of both algorithms.
    first_runs.sort()
    second_runs.sort()

    # No `sum = 0.0` accumulator here: it was never read and rebound the
    # builtin `sum` for the rest of the kernel session.
    similarity_values = []
    for fr, sr in zip(first_runs, second_runs):
        first_run_path = os.path.join(dataset_path, subset, al1, problem, fr)
        second_run_path = os.path.join(dataset_path, subset, al2, problem, sr)
        f_srd = SingleRunData.import_from_json(first_run_path)
        s_srd = SingleRunData.import_from_json(second_run_path)

        similarity = f_srd.get_diversity_metrics_similarity(s_srd, get_raw_values=True)
        similarity_values.append(similarity)

    # One row per run pair; violinplot draws one violin per column.
    similarity_values = np.array(similarity_values)
    plt.figure(figsize=(15, 5))
    plt.violinplot(similarity_values, showmeans=True, showextrema=True, showmedians=True)
    plt.xticks(ticks=np.arange(1, len(labels) + 1), labels=labels, fontsize=19)
    plt.yticks(fontsize=19)
    plt.ylabel("vrednost", fontsize=19, labelpad=10)
    plt.xlabel("metrika", fontsize=19, labelpad=10)
    plt.title(f"primerjava algoritmov PSO in FA po metriki 1-SMAPE subset {subset}", fontsize=22, pad=15)
    plt.show()

### Feature vectors comparison

#### Single subset

In [None]:
# Compare the feature vectors of two algorithms on one dataset subset
# (DATASET_PATH): running similarity curves, mean-vector bar charts and a few
# individual run vectors.
al1 = "WVCPSO"
al2 = "FA"
problem = "Schwefel"


def _plot_mean_vector_bars(vector_1, vector_2, label_1, label_2):
    """Draw two feature vectors as side-by-side bars, one bar pair per feature.

    Args:
        vector_1: 1-D array of feature values for the first algorithm.
        vector_2: 1-D array of feature values for the second algorithm.
        label_1: Legend/title name of the first algorithm.
        label_2: Legend/title name of the second algorithm.
    """
    index = np.arange(len(vector_1))
    bar_width = 0.4
    plt.figure(figsize=(15, 5))
    ax = plt.gca()
    plt.bar(index, vector_1, bar_width, label=label_1)
    plt.bar(index + bar_width, vector_2, bar_width, label=label_2)
    plt.xlabel("značilnica", fontsize=19, labelpad=15)
    plt.ylabel("vrednost", fontsize=19)
    # Put each tick between the two bars that belong to the same feature.
    plt.xticks(index + bar_width / 2, index, fontsize=19, rotation=45)
    plt.yticks(fontsize=19)
    # Trim the horizontal margins to just outside the first and last bar.
    plt.xlim(ax.patches[0].get_x() * 2.25, ax.patches[-1].get_x() + ax.patches[-1].get_width() * 1.5)
    plt.grid(axis="y", color='gray', linestyle='--', linewidth=0.5)
    ax.set_axisbelow(True)
    plt.title(f"Primerjava povprečnih vektorjev značilnic {label_1} - {label_2}", fontsize=22, pad=15)
    plt.legend(prop={'size': 15})
    plt.show()


first_runs = sorted(os.listdir(os.path.join(DATASET_PATH, al1, problem)))
second_runs = sorted(os.listdir(os.path.join(DATASET_PATH, al2, problem)))

feature_vectors_1 = []
feature_vectors_2 = []

# Runs are paired by their position in the sorted directory listings; zip()
# silently drops unpaired runs if the two directories differ in size.
for fr, sr in zip(first_runs, second_runs):
    f_srd = SingleRunData.import_from_json(os.path.join(DATASET_PATH, al1, problem, fr))
    s_srd = SingleRunData.import_from_json(os.path.join(DATASET_PATH, al2, problem, sr))
    feature_vectors_1.append(f_srd.get_feature_vector(standard_scale=True, minmax_scale=False))
    feature_vectors_2.append(s_srd.get_feature_vector(standard_scale=True, minmax_scale=False))

# Cosine similarity of every run pair, computed once and reused below.
similarities = [
    1 - spatial.distance.cosine(feature_vector1, feature_vector2)
    for feature_vector1, feature_vector2 in zip(feature_vectors_1, feature_vectors_2)
]

# Similarity as a function of how many runs are taken into account. The upper
# bound is inclusive so that the last point covers all runs and matches the
# full-sample values printed further down.
run_counts = list(range(2, len(feature_vectors_1) + 1))
mean_similarities = [
    1 - spatial.distance.cosine(
        np.mean(feature_vectors_1[:i], axis=0),
        np.mean(feature_vectors_2[:i], axis=0),
    )
    for i in run_counts
]
mean_pairwise_similarities = [np.mean(similarities[:i]) for i in run_counts]

plt.plot(run_counts, mean_pairwise_similarities, label="mean of pairwise similarities")
plt.plot(run_counts, mean_similarities, label="similarity of mean vectors")
plt.legend()
plt.show()

# Mean feature vectors over all runs, for visual comparison.
mean_vector_1 = np.mean(feature_vectors_1, axis=0)
mean_vector_2 = np.mean(feature_vectors_2, axis=0)

r, p = stats.spearmanr(mean_vector_1, mean_vector_2)
print("cosine similarity of average vectors: ", 1 - spatial.distance.cosine(mean_vector_1, mean_vector_2), "\n")
print(f"spearman test for average vectors: r = {r}, p = {p}")

_plot_mean_vector_bars(mean_vector_1, mean_vector_2, al1, al2)

# Same chart after standardising: the scaler is fitted on the first mean
# vector only and then applied unchanged to the second one.
scaler = StandardScaler()
scaled_mean_vector_1 = scaler.fit_transform(mean_vector_1.reshape((-1, 1))).reshape((-1))
scaled_mean_vector_2 = scaler.transform(mean_vector_2.reshape((-1, 1))).reshape((-1))

_plot_mean_vector_bars(scaled_mean_vector_1, scaled_mean_vector_2, al1, al2)

print("pairwise similarity", np.mean(similarities))

# Overlay the raw feature vectors of the first few run pairs (at most four).
for vector_idx in range(min(4, len(feature_vectors_1))):
    plt.plot(np.arange(0, len(feature_vectors_1[vector_idx])), feature_vectors_1[vector_idx])
    plt.plot(np.arange(0, len(feature_vectors_2[vector_idx])), feature_vectors_2[vector_idx])
    plt.show()

### FDC comparison

In [None]:
# Collect the FDC diversity metric of the first run (by sorted file name) of
# every plotted algorithm, grouped as {problem: {algorithm: fdc}}.
fdc_values = {}
for algorithm in os.listdir(DATASET_PATH):
    if algorithm not in algorithms_to_plot:
        continue
    # `problem_name` keeps the notebook-level `problem` variable untouched.
    for problem_name in os.listdir(os.path.join(DATASET_PATH, algorithm)):
        runs = sorted(os.listdir(os.path.join(DATASET_PATH, algorithm, problem_name)))
        if not runs:
            # Nothing recorded for this algorithm/problem pair.
            continue
        run_path = os.path.join(DATASET_PATH, algorithm, problem_name, runs[0])
        run = SingleRunData.import_from_json(run_path)
        metrics = run.get_pop_diversity_metrics_values(metrics=[PopDiversityMetric.FDC], minmax_scale=False)
        if len(metrics) == 0:
            continue
        fdc_values.setdefault(problem_name, {})[algorithm] = metrics.get("FDC")

# One line chart per problem, one line per algorithm.
for problem_name, fdc_by_algorithm in fdc_values.items():
    fdc_frame = pd.DataFrame.from_dict(fdc_by_algorithm)
    ax = fdc_frame.plot(title=problem_name, figsize=(25, 7), logy=False, fontsize=15)
    ax.legend(fontsize=15)
    ax.set_title(label=problem_name, fontdict={'fontsize':24})
    ax.set_xlabel(xlabel="Iterations", fontdict={'fontsize':20})

plt.show()

### Best fitness value convergence comparison

In [None]:
# Collect the best-fitness convergence curve of the first run (by sorted file
# name) of every plotted algorithm, grouped as {problem: {algorithm: curve}}.
convergences = {}
for algorithm in os.listdir(DATASET_PATH):
    if algorithm not in algorithms_to_plot:
        continue
    # `problem_name` keeps the notebook-level `problem` variable untouched.
    for problem_name in os.listdir(os.path.join(DATASET_PATH, algorithm)):
        runs = sorted(os.listdir(os.path.join(DATASET_PATH, algorithm, problem_name)))
        if not runs:
            # Nothing recorded for this algorithm/problem pair.
            continue
        run_path = os.path.join(DATASET_PATH, algorithm, problem_name, runs[0])
        run = SingleRunData.import_from_json(run_path)
        print(f"best fitness {algorithm} - {problem_name}: {run.best_fitness}")
        convergences.setdefault(problem_name, {})[algorithm] = run.get_best_fitness_values(normalize=False)

# One line chart per problem, one line per algorithm.
for problem_name, convergence_by_algorithm in convergences.items():
    convergence_frame = pd.DataFrame.from_dict(convergence_by_algorithm)
    ax = convergence_frame.plot(title=problem_name, figsize=(25, 7), logy=False, fontsize=15)
    ax.legend(fontsize=15)
    ax.set_title(label=problem_name, fontdict={'fontsize':24})
    ax.set_xlabel(xlabel="Iterations", fontdict={'fontsize':20})

plt.show()

### NN training and test

### LSTM model

In [None]:
# Data loaders for the LSTM classifier, built from a single subset directory.
train_data_loader, val_data_loader, test_data_loader, actual_labels = get_data_loaders(
    dataset_path=DATASET_PATH,
    batch_size=BATCH_SIZE,
    val_size=VAL_SIZE,
    test_size=TEST_SIZE,
    n_pca_components=N_PCA_COMPONENTS,
    problems=[OPTIMIZATION_PROBLEM.name()],
    dataset_subsets=False,
    random_state=RNG_SEED,
)

# Peek at one training batch only to read the input dimensions off its shape.
pop_features, indiv_features, target = next(iter(train_data_loader))
lstm_model = LSTMClassifier(
    input_dim=np.shape(pop_features)[2],
    aux_input_dim=np.shape(indiv_features)[1],
    num_labels=len(actual_labels),
    hidden_dim=LSTM_HIDDEN_DIM,
    num_layers=LSTM_NUM_LAYERS,
    dropout=LSTM_DROPOUT,
)

lstm_model_filename = "./lstm_model.pt"
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-5)

if not execute_training:
    # Replace the freshly built model with the one stored on disk.
    # NOTE(review): the loaded object is used as a module, so the file must
    # hold a whole pickled model; newer PyTorch releases default torch.load to
    # weights_only=True, which may refuse that - confirm for the installed version.
    lstm_model = torch.load(lstm_model_filename, map_location=device).to(device)
    # Show the loss curve image if one exists in the working directory.
    if os.path.exists('loss_plot.png'):
        loss_plot = np.asarray(Image.open('loss_plot.png'))
        plt.axis("off")
        plt.imshow(loss_plot)
else:
    lstm_model.to(device)
    nn_train(
        model=lstm_model,
        train_data_loader=train_data_loader,
        val_data_loader=val_data_loader,
        epochs=EPOCHS,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        model_filename=lstm_model_filename,
        verbal=True,
    )

In [None]:
# Evaluate the LSTM model (trained or loaded in the previous cell) on the test
# loader built there; `actual_labels` are the class labels returned by
# get_data_loaders and a classification report is requested.
nn_test(
    model=lstm_model,
    test_data_loader=test_data_loader,
    device=device,
    labels=actual_labels,
    show_classification_report=True
)

### Linear model

In [None]:
# Data loaders for the linear classifier (feature-vector inputs).
# NOTE(review): dataset_subsets=True is combined with DATASET_PATH, which
# points at a single subset directory, whereas the LSTM cell passes the same
# path with dataset_subsets=False - confirm which directory level is expected.
train_data_loader, val_data_loader, test_data_loader, actual_labels = get_data_loaders(
    dataset_path=DATASET_PATH,
    batch_size=100,
    val_size=VAL_SIZE,
    test_size=TEST_SIZE,
    n_pca_components=N_PCA_COMPONENTS,
    problems=[OPTIMIZATION_PROBLEM.name()],
    dataset_subsets=True,
    nn_type=NNType.LINEAR,
    random_state=RNG_SEED,
)

# Peek at one training batch only to read the input width off its shape.
feature_vector, target = next(iter(train_data_loader))

linear_model = LinearClassifier(
    input_dim=np.shape(feature_vector)[1],
    num_labels=len(actual_labels),
)

linear_model_filename = "./linear_model.pt"
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(linear_model.parameters(), lr=1e-5)

if not execute_training:
    # Replace the freshly built model with the one stored on disk.
    # NOTE(review): the loaded object is used as a module, so the file must
    # hold a whole pickled model; newer PyTorch releases default torch.load to
    # weights_only=True, which may refuse that - confirm for the installed version.
    linear_model = torch.load(linear_model_filename, map_location=device).to(device)
    # Show the loss curve image if one exists in the working directory.
    if os.path.exists('loss_plot.png'):
        loss_plot = np.asarray(Image.open('loss_plot.png'))
        plt.axis("off")
        plt.imshow(loss_plot)
else:
    linear_model.to(device)
    nn_train(
        model=linear_model,
        train_data_loader=train_data_loader,
        val_data_loader=val_data_loader,
        epochs=200,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        model_filename=linear_model_filename,
        nn_type=NNType.LINEAR,
        verbal=True,
    )


In [None]:
# Evaluate the linear model (trained or loaded in the previous cell) on the
# test loader built there; nn_type must match the NNType.LINEAR loaders, and a
# classification report is requested.
nn_test(
    model=linear_model,
    test_data_loader=test_data_loader,
    device=device,
    nn_type=NNType.LINEAR,
    labels=actual_labels,
    show_classification_report=True
)

### Similarity metrics calculator

In [None]:
# Per-subset similarity of two algorithms (1-SMAPE, cosine similarity and
# Spearman rho of the mean feature vectors), printed as LaTeX table rows next
# to the stored SVM/KNN test accuracies and plotted as grouped bars.
with open('standardized_ml/fa_pso_random.json', 'r') as file:
    ml_accuracy = json.load(file)

dataset_path = _DATASET_PATH

subsets = os.listdir(dataset_path)

al1 = "FA"
al2 = "WVCPSO"
problem = "Schwefel"


def _load_feature_vectors(run_dir):
    """Return the feature vector of every run stored in `run_dir`.

    Runs are read in sorted file-name order, each from a freshly imported
    SingleRunData object.
    """
    return [
        SingleRunData.import_from_json(os.path.join(run_dir, run_file)).get_feature_vector(standard_scale=True)
        for run_file in sorted(os.listdir(run_dir))
    ]


mean_similarity_values = []
mean_cosine_similarity_values = []
mean_spearman_r = []

# Subset directories are expected to be named "<index>_subset".
for subset_idx in range(len(subsets)):
    subset = f"{subset_idx}_subset"
    first_run_dir = os.path.join(dataset_path, subset, al1, problem)
    second_run_dir = os.path.join(dataset_path, subset, al2, problem)

    first_runs = sorted(os.listdir(first_run_dir))
    second_runs = sorted(os.listdir(second_run_dir))

    # *******************************************************************************
    # 1-SMAPE
    # *******************************************************************************
    similarity_values = []
    mean_similarity = []
    # Runs are paired by position in the sorted listings.
    for fr, sr in zip(first_runs, second_runs):
        f_srd = SingleRunData.import_from_json(os.path.join(first_run_dir, fr))
        s_srd = SingleRunData.import_from_json(os.path.join(second_run_dir, sr))

        similarity = f_srd.get_diversity_metrics_similarity(s_srd, get_raw_values=True)
        # Per-pair means are rounded before averaging (kept as in the original
        # so the reported numbers do not change).
        mean_similarity.append(round(np.mean(similarity), 2))
        similarity_values.append(similarity)

    mean_similarity_values.append(round(np.mean(mean_similarity), 2))

    # *******************************************************************************
    # Cosine and Spearman
    # *******************************************************************************
    # Feature vectors are grouped explicitly by al1/al2 rather than by the
    # position of the algorithm in an unordered directory listing, so the same
    # two algorithms are compared as in the 1-SMAPE section above. Runs are
    # re-imported here so the vectors come from untouched objects.
    feature_vectors_1 = _load_feature_vectors(first_run_dir)
    feature_vectors_2 = _load_feature_vectors(second_run_dir)

    mean_vector_1 = np.mean(feature_vectors_1, axis=0)
    mean_vector_2 = np.mean(feature_vectors_2, axis=0)

    mean_cosine_similarity_values.append(1 - spatial.distance.cosine(mean_vector_1, mean_vector_2))
    r, p = stats.spearmanr(mean_vector_1, mean_vector_2)
    mean_spearman_r.append(r)

# Table columns, in print order. zip() below stops at the shortest column, so
# the stored accuracy lists must have one entry per subset.
table_columns = [
    mean_similarity_values,
    mean_cosine_similarity_values,
    mean_spearman_r,
    ml_accuracy["svm_test"],
    ml_accuracy["knn_test"],
]

# Plain rows: one line per subset.
for row in zip(*table_columns):
    print(" & ".join(f"{round(value, 2)}" for value in row))

print("")

# Same rows with the per-column maximum (after rounding) set in bold.
column_maxima = [round(np.max(column), 2) for column in table_columns]
for row in zip(*table_columns):
    cells = []
    for value, column_max in zip(row, column_maxima):
        rounded = round(value, 2)
        if rounded == column_max:
            cells.append("\\textbf{" + f"{rounded}" + "}")
        else:
            cells.append(f"{rounded}")
    print(" & ".join(cells))


print("")
# Summary rows (min / mean / max / std); spacing is kept exactly as before.
print(f" & {round(np.min(mean_similarity_values), 2)} & {round(np.min(mean_cosine_similarity_values), 2)} & {round(np.min(mean_spearman_r), 2)} & {round(np.min(ml_accuracy['svm_test']), 2)}  & {round(np.min(ml_accuracy['knn_test']), 2)} ")
print(f" & {round(np.mean(mean_similarity_values), 2)} & {round(np.mean(mean_cosine_similarity_values), 2)} & {round(np.mean(mean_spearman_r), 2)} & {round(np.mean(ml_accuracy['svm_test']), 2)} & {round(np.mean(ml_accuracy['knn_test']), 2)}")
print(f" & {round(np.max(mean_similarity_values), 2)} & {round(np.max(mean_cosine_similarity_values), 2)} & {round(np.max(mean_spearman_r), 2)} & {round(np.max(ml_accuracy['svm_test']), 2)}  & {round(np.max(ml_accuracy['knn_test']), 2)} ")
print(f" & {round(np.std(mean_similarity_values), 2)} & {round(np.std(mean_cosine_similarity_values), 2)} & {round(np.std(mean_spearman_r), 2)} & {round(np.std(ml_accuracy['svm_test']), 2)}  & {round(np.std(ml_accuracy['knn_test']), 2)} ")

# The same statistics once more, one value per line and one group per column.
for column_idx, column in enumerate(table_columns):
    print(f"{round(np.min(column), 2)} ")
    print(f"{round(np.mean(column), 2)}")
    print(f"{round(np.max(column), 2)} ")
    print(f"{round(np.std(column), 2)} ")
    if column_idx < len(table_columns) - 1:
        print()

# Grouped bars: three similarity measures per subset.
index = np.arange(len(mean_similarity_values))
bar_width = 0.2
fig = plt.figure(figsize=(25, 7))
ax = plt.gca()
plt.bar(index, mean_similarity_values, width=bar_width, label="1-SMAPE")
plt.bar(index+bar_width, mean_cosine_similarity_values, width=bar_width, label="cosim")
plt.bar(index+(bar_width*2), mean_spearman_r, width=bar_width, label="rho")
plt.legend(prop={'size': 15})
# Ticks sit under the middle bar of each group.
plt.xticks(index + bar_width*2 / 2, index, fontsize=19, rotation=45)
plt.xlim(ax.patches[0].get_x() * 2.25, ax.patches[-1].get_x() + ax.patches[-1].get_width() * 1.5)
plt.yticks(fontsize=19)
plt.show()

In [None]:
# Same grouped bar chart as the previous cell, with the subsets ordered by
# ascending 1-SMAPE similarity.
sorting_idx = np.argsort(mean_similarity_values)
tmp_mean_similarity_values = np.array(mean_similarity_values)[sorting_idx]
tmp_mean_cosine_similarity_values = np.array(mean_cosine_similarity_values)[sorting_idx]
tmp_mean_spearman_r = np.array(mean_spearman_r)[sorting_idx]

index = np.arange(len(mean_similarity_values))
bar_width = 0.2
fig = plt.figure(figsize=(25, 7))
ax = plt.gca()
plt.bar(index, tmp_mean_similarity_values, width=bar_width, label="1-SMAPE")
plt.bar(index+bar_width, tmp_mean_cosine_similarity_values, width=bar_width, label="cosim")
plt.bar(index+(bar_width*2), tmp_mean_spearman_r, width=bar_width, label="rho")
plt.legend(prop={'size': 15})
# Label every group with the subset it came from (sorting_idx) instead of its
# rank, so the bars stay traceable to the unsorted chart; ticks sit under the
# middle bar of each group.
plt.xticks(index + bar_width, sorting_idx, fontsize=19, rotation=45)
plt.xlim(ax.patches[0].get_x() * 2.25, ax.patches[-1].get_x() + ax.patches[-1].get_width() * 1.5)
plt.yticks(fontsize=19)
plt.show()

### Similarity metrics correlation

In [None]:
# Correlation between the similarity measures and the classifier accuracies:
# first np.corrcoef (Pearson), then Spearman, both as lower-triangle heatmaps.
corr_data_labels = ["1-SMAPE", "cosim", "rho", "SVM test", "KNN test"]

# One row per measure, one column per subset.
corr_data = np.array([
    mean_similarity_values,
    mean_cosine_similarity_values,
    mean_spearman_r,
    np.array(ml_accuracy["svm_test"]),
    np.array(ml_accuracy["knn_test"]),
])

# Exact zeros are nudged to a tiny positive value.
corr_data = np.where(corr_data == 0, 1e-8, corr_data)

# Hide the strict upper triangle; the matrices are symmetric.
n_measures = len(corr_data)
mask = np.triu(np.ones((n_measures, n_measures)), 1)

heatmap_options = {
    "yticklabels": corr_data_labels,
    "xticklabels": corr_data_labels,
    "annot": True,
    "fmt": ".2f",
    "mask": mask,
}

pearson_matrix = np.corrcoef(corr_data)
seaborn.heatmap(np.round(pearson_matrix, 2), **heatmap_options)

plt.show()

# spearmanr treats columns as variables, hence the transpose.
corr_data = np.transpose(corr_data)
spearman_matrix = stats.spearmanr(corr_data)[0]

seaborn.heatmap(np.round(spearman_matrix, 2), **heatmap_options)