# SOA Replication and Changes

In [167]:
import os

import duckdb
import numpy as np
import pandas as pd
from deltalake import DeltaTable

In [52]:
DELTA_PATH = "../results/"

In [154]:
soa = pd.read_csv("soa_processed.csv")

In [None]:
experiments = {f: None for f in os.listdir(DELTA_PATH)}

In [46]:
def list_delta_table_paths(base_path):
    """
    Traverse the base_path and return a list of paths that contain Delta Tables.
    A directory is assumed to be a Delta Table if it contains a _delta_log folder.
    """
    delta_paths = []
    for root, dirs, files in os.walk(base_path):
        if '_delta_log' in dirs:
            delta_paths.append(root)
            dirs[:] = []  
    return delta_paths

def read_all_delta_tables(base_path):
    """
    Read and return all Delta Tables found under the base_path.
    Returns a dictionary where keys are table paths and values are DeltaTable objects.
    """
    delta_paths = list_delta_table_paths(base_path)
    tables = {}
    for path in delta_paths:
        try:
            dt = DeltaTable(path)
            tables[path] = dt
            # print(f"Loaded Delta Table: {path}")
        except Exception as e:
            print(f"Failed to read Delta Table at {path}: {e}")
    return tables

def get_experiment(base_path):
    delta_tables = read_all_delta_tables(base_path)
    df = pd.DataFrame()
    for _, table in delta_tables.items():
        df = pd.concat([df, table.to_pandas()], ignore_index=True)

    return df

In [54]:
for experiment in experiments.keys():
    print(f"Processing experiment: {experiment}")
    experiments[experiment] = get_experiment(os.path.join(DELTA_PATH, experiment))

Processing experiment: ordered_crossover
Processing experiment: variable_mutate_rate
Processing experiment: soa_replication
Processing experiment: replacement_operator__ordered_crossover
Processing experiment: variable_mutate_rate__replacement_operator
Processing experiment: replacement_operator
Processing experiment: variable_mutate_rate__ordered_crossover
Processing experiment: all_changes


In [243]:
def process_experiments(df: pd.DataFrame, experiment_name: str) -> pd.DataFrame:
    """
    Process the experiments dictionary to create a DataFrame with all results.
    """
    df = (
        df
        .assign(
            instance_symmetry=lambda x: x["instance_name"].str.split(".").str[1].str.upper(),
            instance_name=lambda x: x["instance_name"].str.split(".").str[0],
        )
        .groupby([
            "experiment_id", "experiment_name", "instance_name", "instance_symmetry", 
            "k_factor", "repetitions", "model_name", "has_closed_cycle", "k_size"
        ])
        .agg(
            min_path_length=("path_length", "min"),
            max_path_length=("path_length", "max"),
            avg_path_length=("path_length", "mean"),
            std_path_length=("path_length", "std")
        )
        .reset_index()
        .loc[lambda x: x["model_name"] == "GeneticAlgorithmNearestNeighborsEnsemble"]
        .pivot(
            index=[
                "experiment_name", "instance_name", "instance_symmetry", 
                "k_factor", "repetitions", "has_closed_cycle", "k_size"
            ],
            columns="model_name",
            values=["min_path_length", "avg_path_length", "std_path_length"]
        )
        .reset_index()
    )
    df.columns = ['_'.join([str(i) for i in col if i]) if isinstance(col, tuple) else col for col in df.columns.values]
    df = df.rename(columns={
        "max_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"max_our_{experiment_name}",
        "min_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"min_our_{experiment_name}",
        "avg_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"avg_our_{experiment_name}",
        "std_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"std_our_{experiment_name}"
    })
    return df

In [246]:
processed_experiments = {experiment: process_experiments(df, experiment) for experiment, df in experiments.items()}

In [251]:
processed_experiments["soa_replication"]

Unnamed: 0,experiment_name,instance_name,instance_symmetry,k_factor,repetitions,has_closed_cycle,k_size,min_our_soa_replication,avg_our_soa_replication,std_our_soa_replication
0,soa_replication,a280,TSP,0.25,10,False,70,607.0,607.0,0.000000
1,soa_replication,a280,TSP,0.25,10,True,70,619.0,619.0,0.000000
2,soa_replication,a280,TSP,0.50,10,False,140,1282.0,1282.0,0.000000
3,soa_replication,a280,TSP,0.50,10,True,140,1371.0,1371.0,0.000000
4,soa_replication,a280,TSP,0.75,10,False,210,1966.0,1966.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
565,soa_replication,ulysses22,TSP,0.25,10,True,5,182.0,182.0,0.000000
566,soa_replication,ulysses22,TSP,0.50,10,False,11,1438.0,1438.0,0.000000
567,soa_replication,ulysses22,TSP,0.50,10,True,11,1428.0,1430.1,1.449138
568,soa_replication,ulysses22,TSP,0.75,10,False,16,2658.0,2658.0,0.000000


## GA SOA vs Our SOA Replication

In [252]:
soa_replication_analysis = (
    processed_experiments["soa_replication"]
    .merge(
        soa[
            ["instance_name", "k_factor", "has_closed_cycle", "ga_w_nn"]
        ],
        on=["instance_name", "k_factor", "has_closed_cycle"],
        how="left"
    )
    .assign(
        abs_gap=lambda x: x["min_our_soa_replication"] - x["ga_w_nn"],
        percentage_gap=lambda x: np.where(
            x["ga_w_nn"] != 0,
            (x["abs_gap"] / x["ga_w_nn"]),
            np.nan
        )
    )
)

In [253]:
soa_replication_analysis

Unnamed: 0,experiment_name,instance_name,instance_symmetry,k_factor,repetitions,has_closed_cycle,k_size,min_our_soa_replication,avg_our_soa_replication,std_our_soa_replication,ga_w_nn,abs_gap,percentage_gap
0,soa_replication,a280,TSP,0.25,10,False,70,607.0,607.0,0.000000,606.0,1.0,0.001650
1,soa_replication,a280,TSP,0.25,10,True,70,619.0,619.0,0.000000,686.0,-67.0,-0.097668
2,soa_replication,a280,TSP,0.50,10,False,140,1282.0,1282.0,0.000000,1234.0,48.0,0.038898
3,soa_replication,a280,TSP,0.50,10,True,140,1371.0,1371.0,0.000000,1358.0,13.0,0.009573
4,soa_replication,a280,TSP,0.75,10,False,210,1966.0,1966.0,0.000000,1894.0,72.0,0.038015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,soa_replication,ulysses22,TSP,0.25,10,True,5,182.0,182.0,0.000000,747.0,-565.0,-0.756359
566,soa_replication,ulysses22,TSP,0.50,10,False,11,1438.0,1438.0,0.000000,1473.0,-35.0,-0.023761
567,soa_replication,ulysses22,TSP,0.50,10,True,11,1428.0,1430.1,1.449138,1902.0,-474.0,-0.249211
568,soa_replication,ulysses22,TSP,0.75,10,False,16,2658.0,2658.0,0.000000,2618.0,40.0,0.015279


In [254]:
soa_replication_analysis.groupby(["instance_name"]).agg(
    avg_abs_gap=("abs_gap", "mean"),
    avg_percentage_gap=("percentage_gap", "mean")
).sort_values(by="avg_percentage_gap", ascending=False).reset_index()

Unnamed: 0,instance_name,avg_abs_gap,avg_percentage_gap
0,swiss42,255.833333,1.365903
1,pr136,6654.000000,0.134713
2,kroB200,1896.000000,0.133044
3,gr96,2665.000000,0.115510
4,bier127,2840.666667,0.096011
...,...,...,...
90,si535,,
91,ts225,,
92,tsp225,,
93,u574,,


In [255]:
soa_replication_analysis.groupby(["k_factor"]).agg(
    avg_abs_gap=("abs_gap", "mean"),
    avg_percentage_gap=("percentage_gap", "mean")
).sort_values(by="avg_percentage_gap", ascending=False).reset_index()

Unnamed: 0,k_factor,avg_abs_gap,avg_percentage_gap
0,0.75,975.636364,0.062472
1,0.5,-26.39,0.019158
2,0.25,-456.11,-0.111535


In [212]:
soa_replication_analysis.groupby(["has_closed_cycle"]).agg(
    avg_abs_gap=("abs_gap", "mean"),
    avg_percentage_gap=("percentage_gap", "mean")
).sort_values(by="avg_percentage_gap", ascending=False).reset_index()

Unnamed: 0,has_closed_cycle,avg_abs_gap,avg_percentage_gap
0,False,-36.6,-0.005697
1,True,361.261745,-0.014754


In [213]:
soa_replication_analysis[["abs_gap", "percentage_gap"]].mean()

abs_gap           161.665552
percentage_gap     -0.010211
dtype: float64

## SOA Replication x Changes

In [214]:
df = processed_experiments["soa_replication"].copy()
for experiment_name in processed_experiments.keys():
    if experiment_name != "soa_replication":
        df = df.merge(
            processed_experiments[experiment_name][
                ["instance_name", "k_factor", "has_closed_cycle", f"our_{experiment_name}"]
            ],
            on=["instance_name", "k_factor", "has_closed_cycle"],
            how="left",
            suffixes=("", f"_{experiment_name}")
        )

In [215]:
df.columns

Index(['experiment_name', 'instance_name', 'instance_symmetry', 'k_factor',
       'repetitions', 'has_closed_cycle', 'k_size', 'our_soa_replication',
       'our_ordered_crossover', 'our_variable_mutate_rate',
       'our_replacement_operator__ordered_crossover',
       'our_variable_mutate_rate__replacement_operator',
       'our_replacement_operator',
       'our_variable_mutate_rate__ordered_crossover', 'our_all_changes'],
      dtype='object', name='model_name')

In [216]:
changes_p_gap = (
    df
    .assign(
        ordered_crossover_p_gap=lambda x: (
            (x["our_ordered_crossover"] - x["our_soa_replication"]) / x["our_soa_replication"]
        ),
        variable_mutate_rate_p_gap=lambda x: (
            (x["our_variable_mutate_rate"] - x["our_soa_replication"]) / x["our_soa_replication"]
        ),
        replacement_operator_p_gap=lambda x: (
            (x["our_replacement_operator"] - x["our_soa_replication"]) / x["our_soa_replication"]
        ),
        replacement_operator__ordered_crossover_p_gap=lambda x: (
            (x["our_replacement_operator__ordered_crossover"] - x["our_soa_replication"]) / x["our_soa_replication"]
        ),
        variable_mutate_rate__replacement_operator_p_gap=lambda x: (
            (x["our_variable_mutate_rate__replacement_operator"] - x["our_soa_replication"]) / x["our_soa_replication"]
        ),
        variable_mutate_rate__ordered_crossover_p_gap=lambda x: (
            (x["our_variable_mutate_rate__ordered_crossover"] - x["our_soa_replication"]) / x["our_soa_replication"]
        ),
        all_changes_p_gap= lambda x: (
            (x["our_all_changes"] - x["our_soa_replication"]) / x["our_soa_replication"]
        )
    )
    [[
        "instance_name", "instance_symmetry", "k_factor", "has_closed_cycle",
        "k_size", "ordered_crossover_p_gap",
        "variable_mutate_rate_p_gap", "replacement_operator_p_gap",
        "replacement_operator__ordered_crossover_p_gap",
        "variable_mutate_rate__replacement_operator_p_gap",
        "variable_mutate_rate__ordered_crossover_p_gap", "all_changes_p_gap"
    ]]
)

In [217]:
changes_p_gap.groupby(["instance_name"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replacement_operator_p_gap=("replacement_operator_p_gap", "mean"),
    avg_replacement_operator__ordered_crossover_p_gap=("replacement_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replacement_operator_p_gap=("variable_mutate_rate__replacement_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,instance_name,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replacement_operator_p_gap,avg_replacement_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replacement_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,a280,-0.001195,0.0,0.000000,-0.001195,0.000000,-0.001195,-0.001195
1,ali535,0.000219,0.0,0.000219,0.000219,0.000219,0.000219,0.000219
2,att48,-0.005234,0.0,0.004798,-0.002063,0.004798,-0.002063,-0.002063
3,att532,-0.001478,0.0,-0.002327,-0.001478,-0.002327,-0.001478,-0.001478
4,bayg29,-0.000833,0.0,0.000000,-0.000833,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
90,u159,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
91,u574,-0.000362,0.0,0.000000,-0.000362,0.000000,-0.000362,-0.000362
92,u724,-0.000954,0.0,0.000000,-0.000954,0.000000,-0.000954,-0.000954
93,ulysses16,0.003429,0.0,0.000400,-0.004431,0.000400,0.006113,0.003429


In [218]:
changes_p_gap.groupby(["instance_symmetry"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replacement_operator_p_gap=("replacement_operator_p_gap", "mean"),
    avg_replacement_operator__ordered_crossover_p_gap=("replacement_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replacement_operator_p_gap=("variable_mutate_rate__replacement_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,instance_symmetry,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replacement_operator_p_gap,avg_replacement_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replacement_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,ATSP,-0.006218,0.0,-0.008088,-0.006994,-0.008088,-0.00583,-0.006463
1,TSP,-0.004677,0.0,-0.002843,-0.004743,-0.002843,-0.002756,-0.002917


In [219]:
changes_p_gap.groupby(["k_factor"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replacement_operator_p_gap=("replacement_operator_p_gap", "mean"),
    avg_replacement_operator__ordered_crossover_p_gap=("replacement_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replacement_operator_p_gap=("variable_mutate_rate__replacement_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,k_factor,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replacement_operator_p_gap,avg_replacement_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replacement_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,0.25,-0.002547,0.0,-0.003867,-0.003101,-0.003867,-0.00237,-0.002196
1,0.5,-0.002745,0.0,-0.001373,-0.002766,-0.001373,-0.002471,-0.002915
2,0.75,-0.009625,0.0,-0.006393,-0.009672,-0.006393,-0.005236,-0.005724


In [220]:
changes_p_gap.groupby(["has_closed_cycle"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replacement_operator_p_gap=("replacement_operator_p_gap", "mean"),
    avg_replacement_operator__ordered_crossover_p_gap=("replacement_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replacement_operator_p_gap=("variable_mutate_rate__replacement_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,has_closed_cycle,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replacement_operator_p_gap,avg_replacement_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replacement_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,False,-0.00318,0.0,-0.002796,-0.003767,-0.002796,-0.003314,-0.00396
1,True,-0.006782,0.0,-0.004959,-0.006607,-0.004959,-0.003411,-0.003273


In [224]:
changes_p_gap[[
    "ordered_crossover_p_gap", "variable_mutate_rate_p_gap",
    "replacement_operator_p_gap", "replacement_operator__ordered_crossover_p_gap",
    "variable_mutate_rate__replacement_operator_p_gap",
    "variable_mutate_rate__ordered_crossover_p_gap", "all_changes_p_gap"
]].mean().sort_values(ascending=True)

model_name
replacement_operator__ordered_crossover_p_gap      -0.005187
ordered_crossover_p_gap                            -0.004981
replacement_operator_p_gap                         -0.003878
variable_mutate_rate__replacement_operator_p_gap   -0.003878
all_changes_p_gap                                  -0.003616
variable_mutate_rate__ordered_crossover_p_gap      -0.003362
variable_mutate_rate_p_gap                          0.000000
dtype: float64

## SOA x Ours

In [None]:
# benchmark = processed_experiments["soa_replication"].copy()
# for experiment_name in processed_experiments.keys():
#     if experiment_name != "soa_replication":
#         benchmark = benchmark.merge(
#             processed_experiments[experiment_name][
#                 ["instance_name", "k_factor", "has_closed_cycle", f"our_{experiment_name}"]
#             ],
#             on=["instance_name", "k_factor", "has_closed_cycle"],
#             how="left",
#             suffixes=("", f"_{experiment_name}")
#         )
# benchmark = benchmark.merge(
#     soa[
#         ["instance_name", "k_factor", "has_closed_cycle", "gvns", 
#             "hh_rand", "hh_greedy", "ga_w_nn"]
#     ],
#     on=["instance_name", "k_factor", "has_closed_cycle"],
#     how="left",
#     suffixes=("_replication", "_soa")
# )

In [172]:
# (
#     benchmark
#     .assign(
#         state_of_the_art=lambda x: np.nanmin(
#             x[["gvns", "hh_rand", "hh_greedy", "ga_w_nn"]].values, axis=1
#         ),
#         abs_gap=lambda x: x["our_soa_replication"] - x["state_of_the_art"],
#     )
# )

  state_of_the_art=lambda x: np.nanmin(


Unnamed: 0,experiment_name,instance_name,instance_symmetry,k_factor,repetitions,has_closed_cycle,k_size,our_soa_replication,our_ordered_crossover,our_variable_mutate_rate,...,our_variable_mutate_rate__replacement_operator,our_replacement_operator,our_variable_mutate_rate__ordered_crossover,our_all_changes,gvns,hh_rand,hh_greedy,ga_w_nn,state_of_the_art,our_soa_difference
0,soa_replication,a280,TSP,0.25,10,False,70,607,607,607,...,607,607,607,607,,,,606.0,606.0,1.0
1,soa_replication,a280,TSP,0.25,10,True,70,619,619,619,...,619,619,619,619,687.0,670.0,683.0,686.0,670.0,-67.0
2,soa_replication,a280,TSP,0.50,10,False,140,1282,1282,1282,...,1282,1282,1282,1282,,,,1234.0,1234.0,48.0
3,soa_replication,a280,TSP,0.50,10,True,140,1371,1365,1371,...,1371,1371,1365,1365,1376.0,1314.0,1362.0,1358.0,1314.0,13.0
4,soa_replication,a280,TSP,0.75,10,False,210,1966,1966,1966,...,1966,1966,1966,1966,,,,1894.0,1894.0,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,soa_replication,ulysses22,TSP,0.25,10,True,5,182,182,182,...,182,182,182,182,747.0,747.0,747.0,747.0,747.0,-565.0
566,soa_replication,ulysses22,TSP,0.50,10,False,11,1438,1438,1438,...,1438,1438,1351,1438,,,,1473.0,1473.0,-35.0
567,soa_replication,ulysses22,TSP,0.50,10,True,11,1428,1428,1428,...,1428,1428,1428,1428,1902.0,1902.0,1902.0,1902.0,1902.0,-474.0
568,soa_replication,ulysses22,TSP,0.75,10,False,16,2658,2241,2658,...,2520,2520,2241,2241,,,,2618.0,2618.0,40.0
