In [1]:
import os
# Restrict CUDA to the device with index 1. This is set in the first cell,
# ahead of every other import in the notebook.
# NOTE(review): the device index is hard-coded — confirm it matches the machine
# the notebook is run on.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, PercentFormatter
import seaborn as sns
import matplotlib.lines as mlines
from scipy.stats import wilcoxon, ranksums, mannwhitneyu
from statsmodels.stats.multitest import multipletests


from utils import get_df

In [4]:
# Algorithm labels retained for the analysis. After the `algo` column has been
# relabelled with `filter`, rows whose label is not listed here are dropped; this
# removes rows for which `filter` returned None as well as the mcpg variant
# labels (e.g. "mcpg_only"), none of which appear in this list.
ALGO_LIST = [
    "mcpg_me",
    "dcg_me",
    "pga_me",
    "me",
    "memes",
    "ppga",
]

In [6]:
def filter(df_row):
    """Relabel or reject one results row.

    Args:
        df_row: Mapping-like row (e.g. a pandas Series) with an ``algo`` entry.
            ``batch_size`` is read for the algorithms listed below, and
            ``proportion_mutation_ga`` / ``greedy`` are read for ``mcpg_me``.

    Returns:
        ``None`` when the row's batch size is not the one selected for its
        algorithm; a variant label for specific ``mcpg_me`` settings; otherwise
        the row's ``algo`` value unchanged.
    """
    # Batch size kept for each algorithm; any other value rejects the row.
    selected_batch_size = {
        "pga_me": 1024,
        "me": 8192,
        "ppga": 6000,
        "memes": 8192,
        "dcg_me": 2048,
        "mcpg_me": 4096,
    }
    # (proportion_mutation_ga, greedy, label) for the relabelled mcpg_me variants.
    # Settings not listed here (e.g. 0.5 / 0) keep the plain "mcpg_me" label.
    mcpg_variants = (
        (0, 0, "mcpg_only"),
        (0, 0.5, "mcpg_only_05"),
        (0, 1, "mcpg_only_1"),
        (0.5, 0.5, "mcpg_me_05"),
        (0.5, 1, "mcpg_me_1"),
    )

    algo = df_row["algo"]

    for name, batch_size in selected_batch_size.items():
        # `batch_size` is only looked up once the algorithm name matches.
        if algo == name and df_row["batch_size"] != batch_size:
            return None

    if algo == "mcpg_me":
        for proportion, greedy, label in mcpg_variants:
            if df_row["proportion_mutation_ga"] == proportion and df_row["greedy"] == greedy:
                return label

    return algo

In [14]:
# Location handed to get_df as its first argument.
results_dir = Path("fig1/output/")
# Second argument of get_df.
# NOTE(review): presumably selects results for this episode length (the
# environment names printed by this cell end in "_250") — confirm in utils.get_df.
EPISODE_LENGTH = 250
df = get_df(results_dir, EPISODE_LENGTH)
# Relabel every row with `filter`: None for rejected batch sizes, a variant
# label for some mcpg_me settings, otherwise the original algorithm name.
df['algo'] = df.apply(filter, axis=1)
# Keep only the labels listed in ALGO_LIST; rows labelled None or with a label
# that is not in ALGO_LIST are dropped here.
df = df[df["algo"].isin(ALGO_LIST)]

ant_omni_250
ant_uni_250
anttrap_omni_250
hopper_uni_250
walker2d_uni_250


### Compare algorithms at a fixed number of samples (pairwise p-values and performance differences)

In [23]:
def compare_wrt_samples(df, algo_list, env_list, metric_list, sample_reference=1_008_000):
    """Compare algorithms pairwise at a fixed evaluation budget.

    For every (env, algo, run), the row with the largest ``iteration`` among the
    rows whose ``num_evaluations`` is at most ``sample_reference`` is kept. On
    those rows, every unordered pair of algorithms (formed in ``algo_list``
    order) is compared per environment and per metric.

    Args:
        df: Results frame with the columns ``algo``, ``env``, ``run``,
            ``iteration``, ``num_evaluations`` and every name in ``metric_list``.
        algo_list: Algorithms to compare.
        env_list: Environments to include.
        metric_list: Metric columns to compare.
        sample_reference: Inclusive upper bound on ``num_evaluations``.

    Returns:
        A tuple ``(p_value_df, performance_diff, mean_performance_diff)``:

        * ``p_value_df`` -- Holm-corrected Mann-Whitney U p-values (scipy default
          settings), indexed by ``(env, algo_1, algo_2)``. The correction is
          applied jointly over all metrics, environments and pairs. Every metric
          column is renamed to ``"p-value"``, so the column name repeats when
          several metrics are passed.
        * ``performance_diff`` -- one row per (metric, env, pair) whose medians
          differ; ``algo_1`` is the algorithm with the larger median and
          ``percentage_diff`` is the gap relative to the smaller median.
        * ``mean_performance_diff`` -- ``percentage_diff`` averaged over
          environments for each ``(metric, algo_1, algo_2)``.

    Raises:
        KeyError: If an algorithm of ``algo_list`` has no row for an environment
            of ``env_list`` after filtering.
    """
    # Unordered pairs, in list order: (a0, a1), (a0, a2), ..., (a1, a2), ...
    pairs = [(algo_1, algo_2) for i, algo_1 in enumerate(algo_list) for algo_2 in algo_list[i + 1:]]

    df = df[df["algo"].isin(algo_list) & df["env"].isin(env_list)]
    df = df[df["num_evaluations"] <= sample_reference]

    # idxmax returns index *labels*. Resetting the index guarantees the labels
    # are unique, so .loc selects exactly one row per run even if the incoming
    # frame carries repeated labels (e.g. after concatenating per-run files).
    df = df.reset_index(drop=True)
    last_rows = df.groupby(["env", "algo", "run"])["iteration"].idxmax()
    df = df.loc[last_rows]

    diff_records = []
    p_value_records = []
    for metric in metric_list:
        for env in env_list:
            env_df = df[df["env"] == env]
            medians = env_df.groupby("algo")[metric].median()
            samples = {algo: env_df.loc[env_df["algo"] == algo, metric] for algo in algo_list}

            for algo_1, algo_2 in pairs:
                median_1 = medians[algo_1]
                median_2 = medians[algo_2]

                # Report the pair as (better, worse). Equal (or NaN) medians
                # produce no row.
                if median_1 > median_2:
                    better, worse, high, low = algo_1, algo_2, median_1, median_2
                elif median_2 > median_1:
                    better, worse, high, low = algo_2, algo_1, median_2, median_1
                else:
                    better = None

                if better is not None:
                    diff_records.append({
                        "metric": metric,
                        "env": env,
                        "algo_1": better,
                        "algo_2": worse,
                        # abs() keeps the sign meaningful when the baseline
                        # median is negative; unchanged for positive baselines.
                        "percentage_diff": (high - low) / abs(low) * 100,
                    })

                # Only the pairs that are reported are tested (no self-comparisons,
                # no mirrored duplicates).
                stat = mannwhitneyu(samples[algo_1], samples[algo_2])
                p_value_records.append({
                    "metric": metric,
                    "env": env,
                    "algo_1": algo_1,
                    "algo_2": algo_2,
                    "p_value": stat.pvalue,
                })

    performance_diff = pd.DataFrame(
        diff_records, columns=["metric", "env", "algo_1", "algo_2", "percentage_diff"]
    )

    # Mean percentage difference across all environments.
    mean_performance_diff = (
        performance_diff.groupby(["metric", "algo_1", "algo_2"])["percentage_diff"]
        .mean()
        .reset_index()
        .rename(columns={"percentage_diff": "mean_percentage_diff"})
    )

    p_value_df = pd.DataFrame(
        p_value_records, columns=["metric", "env", "algo_1", "algo_2", "p_value"]
    )
    # Holm correction over every test performed above.
    p_value_df["p_value_corrected"] = multipletests(p_value_df["p_value"], method="holm")[1]
    p_value_df = p_value_df.pivot(index=["env", "algo_1", "algo_2"], columns="metric", values="p_value_corrected")
    p_value_df.columns.name = None
    p_value_df = p_value_df.rename(columns={metric: "p-value" for metric in metric_list})

    return p_value_df, performance_diff, mean_performance_diff



In [24]:
metric_list = ["time"]
env_list = ["hopper_uni_250", "walker2d_uni_250"]
algo_list = ["mcpg_me", "dcg_me"]
p_value_df, performance_diff, mean_performance_diff = compare_wrt_samples(df, algo_list, env_list, metric_list)
print(p_value_df)
print(performance_diff)
print(mean_performance_diff)



                                      p-value
env              algo_1  algo_2              
hopper_uni_250   mcpg_me dcg_me  3.302298e-08
walker2d_uni_250 mcpg_me dcg_me  3.302298e-08
  metric               env  algo_1   algo_2  percentage_diff
0   time    hopper_uni_250  dcg_me  mcpg_me       427.593184
1   time  walker2d_uni_250  dcg_me  mcpg_me       385.124350
  metric  algo_1   algo_2  mean_percentage_diff
0   time  dcg_me  mcpg_me            406.358767


### Compare algorithms at a fixed runtime budget (pairwise p-values and performance differences)

In [21]:
def compare_wrt_runtime(df, algo_list, env_list, metric_list, ratio=1, sample_reference=1_008_000):
    """Compare algorithms pairwise at a fixed runtime budget.

    The budget of an environment is ``ratio`` times the largest ``time`` found
    among the rows of that environment whose ``num_evaluations`` is at most
    ``sample_reference``. This reference is taken over every algorithm present
    in ``df``, not only those in ``algo_list``. For every (env, algo, run), the
    row with the largest ``time`` within the budget is kept. On those rows,
    every unordered pair of algorithms (formed in ``algo_list`` order) is
    compared per environment and per metric.

    Args:
        df: Results frame with the columns ``algo``, ``env``, ``run``, ``time``,
            ``num_evaluations`` and every name in ``metric_list``.
        algo_list: Algorithms to compare.
        env_list: Environments to include.
        metric_list: Metric columns to compare.
        ratio: Fraction of the per-environment reference time used as budget.
        sample_reference: Inclusive upper bound on ``num_evaluations`` used to
            determine the reference time (same default as ``compare_wrt_samples``).

    Returns:
        A tuple ``(p_value_df, performance_diff, mean_performance_diff)``:

        * ``p_value_df`` -- Holm-corrected Mann-Whitney U p-values (scipy default
          settings), indexed by ``(env, algo_1, algo_2)``. The correction is
          applied jointly over all metrics, environments and pairs. Every metric
          column is renamed to ``"p-value"``, so the column name repeats when
          several metrics are passed.
        * ``performance_diff`` -- one row per (metric, env, pair) whose medians
          differ; ``algo_1`` is the algorithm with the larger median and
          ``percentage_diff`` is the gap relative to the smaller median.
        * ``mean_performance_diff`` -- ``percentage_diff`` averaged over
          environments for each ``(metric, algo_1, algo_2)``.

    Raises:
        KeyError: If an algorithm of ``algo_list`` has no row for an environment
            of ``env_list`` after filtering.
    """
    # Unordered pairs, in list order: (a0, a1), (a0, a2), ..., (a1, a2), ...
    pairs = [(algo_1, algo_2) for i, algo_1 in enumerate(algo_list) for algo_2 in algo_list[i + 1:]]

    # Reference time per environment, computed before restricting to algo_list.
    within_sample_budget = df[df["num_evaluations"] <= sample_reference]
    max_time_per_env = within_sample_budget.groupby("env")["time"].max()

    # Vectorised per-row budget lookup (replaces a row-wise apply). astype(float)
    # keeps the result numeric even if `env` is stored as a categorical column.
    # Rows of an environment without reference time get NaN and are dropped.
    time_budget = df["env"].map(max_time_per_env).astype(float) * ratio
    df = df[df["time"] <= time_budget]
    df = df[df["algo"].isin(algo_list) & df["env"].isin(env_list)]

    # idxmax returns index *labels*. Resetting the index guarantees the labels
    # are unique, so .loc selects exactly one row per run even if the incoming
    # frame carries repeated labels (e.g. after concatenating per-run files).
    df = df.reset_index(drop=True)
    last_rows = df.groupby(["env", "algo", "run"])["time"].idxmax()
    df = df.loc[last_rows]

    diff_records = []
    p_value_records = []
    for metric in metric_list:
        for env in env_list:
            env_df = df[df["env"] == env]
            medians = env_df.groupby("algo")[metric].median()
            samples = {algo: env_df.loc[env_df["algo"] == algo, metric] for algo in algo_list}

            for algo_1, algo_2 in pairs:
                median_1 = medians[algo_1]
                median_2 = medians[algo_2]

                # Report the pair as (better, worse). Equal (or NaN) medians
                # produce no row.
                if median_1 > median_2:
                    better, worse, high, low = algo_1, algo_2, median_1, median_2
                elif median_2 > median_1:
                    better, worse, high, low = algo_2, algo_1, median_2, median_1
                else:
                    better = None

                if better is not None:
                    diff_records.append({
                        "metric": metric,
                        "env": env,
                        "algo_1": better,
                        "algo_2": worse,
                        # abs() keeps the sign meaningful when the baseline
                        # median is negative; unchanged for positive baselines.
                        "percentage_diff": (high - low) / abs(low) * 100,
                    })

                # Only the pairs that are reported are tested (no self-comparisons,
                # no mirrored duplicates).
                stat = mannwhitneyu(samples[algo_1], samples[algo_2])
                p_value_records.append({
                    "metric": metric,
                    "env": env,
                    "algo_1": algo_1,
                    "algo_2": algo_2,
                    "p_value": stat.pvalue,
                })

    performance_diff = pd.DataFrame(
        diff_records, columns=["metric", "env", "algo_1", "algo_2", "percentage_diff"]
    )

    # Mean percentage difference across all environments.
    mean_performance_diff = (
        performance_diff.groupby(["metric", "algo_1", "algo_2"])["percentage_diff"]
        .mean()
        .reset_index()
        .rename(columns={"percentage_diff": "mean_percentage_diff"})
    )

    p_value_df = pd.DataFrame(
        p_value_records, columns=["metric", "env", "algo_1", "algo_2", "p_value"]
    )
    # Holm correction over every test performed above.
    p_value_df["p_value_corrected"] = multipletests(p_value_df["p_value"], method="holm")[1]
    p_value_df = p_value_df.pivot(index=["env", "algo_1", "algo_2"], columns="metric", values="p_value_corrected")
    p_value_df.columns.name = None
    p_value_df = p_value_df.rename(columns={metric: "p-value" for metric in metric_list})

    return p_value_df, performance_diff, mean_performance_diff






In [22]:
metric_list = ["qd_score"]
env_list = ["ant_omni_250", "anttrap_omni_250"]
algo_list = ["mcpg_me", "dcg_me","pga_me"]
p_value_df, performance_diff, mean_performance_diff = compare_wrt_runtime(df, algo_list, env_list, metric_list, ratio=0.05)
print(p_value_df)
print(performance_diff)
print(mean_performance_diff)

                                      p-value
env              algo_1  algo_2              
ant_omni_250     dcg_me  pga_me  1.825329e-06
                 mcpg_me dcg_me  7.231514e-08
                         pga_me  7.231514e-08
anttrap_omni_250 dcg_me  pga_me  2.355659e-06
                 mcpg_me dcg_me  7.231514e-08
                         pga_me  7.231514e-08
     metric               env   algo_1  algo_2  percentage_diff
0  qd_score      ant_omni_250  mcpg_me  dcg_me       415.616035
1  qd_score      ant_omni_250  mcpg_me  pga_me       491.209650
2  qd_score      ant_omni_250   dcg_me  pga_me        14.660837
3  qd_score  anttrap_omni_250  mcpg_me  dcg_me       299.752862
4  qd_score  anttrap_omni_250  mcpg_me  pga_me       355.304934
5  qd_score  anttrap_omni_250   dcg_me  pga_me        13.896604
     metric   algo_1  algo_2  mean_percentage_diff
0  qd_score   dcg_me  pga_me             14.278720
1  qd_score  mcpg_me  dcg_me            357.684449
2  qd_score  mcpg_me  pga_me   