# SOA Replication and Changes

In [13]:
import os

import duckdb
import numpy as np
import pandas as pd
from deltalake import DeltaTable

In [14]:
DELTA_PATH = "../results/"

In [None]:
soa = pd.read_csv(f"{DELTA_PATH}/soa_processed.csv")

In [39]:
experiments = {f: None for f in os.listdir(DELTA_PATH) if f.startswith("change_") or f.startswith("soa_")}

In [40]:
def list_delta_table_paths(base_path):
    """
    Traverse the base_path and return a list of paths that contain Delta Tables.
    A directory is assumed to be a Delta Table if it contains a _delta_log folder.
    """
    delta_paths = []
    for root, dirs, files in os.walk(base_path):
        if '_delta_log' in dirs:
            delta_paths.append(root)
            dirs[:] = []  
    return delta_paths

def read_all_delta_tables(base_path):
    """
    Read and return all Delta Tables found under the base_path.
    Returns a dictionary where keys are table paths and values are DeltaTable objects.
    """
    delta_paths = list_delta_table_paths(base_path)
    tables = {}
    for path in delta_paths:
        try:
            dt = DeltaTable(path)
            tables[path] = dt
            # print(f"Loaded Delta Table: {path}")
        except Exception as e:
            print(f"Failed to read Delta Table at {path}: {e}")
    return tables

def get_experiment(base_path):
    delta_tables = read_all_delta_tables(base_path)
    df = pd.DataFrame()
    for _, table in delta_tables.items():
        df = pd.concat([df, table.to_pandas()], ignore_index=True)

    return df

In [41]:
for experiment in experiments.keys():
    print(f"Processing experiment: {experiment}")
    experiments[experiment] = get_experiment(os.path.join(DELTA_PATH, experiment))

Processing experiment: change__replace_operator
Processing experiment: change__ordered_crossover
Processing experiment: change__variable_mutate_rate__replace_operator
Processing experiment: change__all_changes
Processing experiment: soa_replication
Processing experiment: change__variable_mutate_rate__ordered_crossover
Processing experiment: change__replace_operator__ordered_crossover
Processing experiment: change__variable_mutate_rate


In [42]:
def process_experiments(df: pd.DataFrame, experiment_name: str) -> pd.DataFrame:
    """
    Process the experiments dictionary to create a DataFrame with all results.
    """
    df = (
        df
        .assign(
            instance_symmetry=lambda x: x["instance_name"].str.split(".").str[1].str.upper(),
            instance_name=lambda x: x["instance_name"].str.split(".").str[0],
        )
        .groupby([
            "experiment_id", "experiment_name", "instance_name", "instance_symmetry", 
            "k_factor", "repetitions", "model_name", "has_closed_cycle", "k_size"
        ])
        .agg(
            min_path_length=("path_length", "min"),
            max_path_length=("path_length", "max"),
            avg_path_length=("path_length", "mean"),
            std_path_length=("path_length", "std")
        )
        .reset_index()
        .loc[lambda x: x["model_name"] == "GeneticAlgorithmNearestNeighborsEnsemble"]
        .pivot(
            index=[
                "experiment_name", "instance_name", "instance_symmetry", 
                "k_factor", "repetitions", "has_closed_cycle", "k_size"
            ],
            columns="model_name",
            values=["min_path_length", "avg_path_length", "std_path_length"]
        )
        .reset_index()
    )
    df.columns = ['_'.join([str(i) for i in col if i]) if isinstance(col, tuple) else col for col in df.columns.values]
    df = df.rename(columns={
        "max_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"max_our_{experiment_name}",
        "min_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"min_our_{experiment_name}",
        "avg_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"avg_our_{experiment_name}",
        "std_path_length_GeneticAlgorithmNearestNeighborsEnsemble": f"std_our_{experiment_name}"
    })
    return df

In [43]:
processed_experiments = {experiment: process_experiments(df, experiment) for experiment, df in experiments.items()}

In [44]:
processed_experiments["soa_replication"]

Unnamed: 0,experiment_name,instance_name,instance_symmetry,k_factor,repetitions,has_closed_cycle,k_size,min_our_soa_replication,avg_our_soa_replication,std_our_soa_replication
0,soa_replication,a280,TSP,0.25,10,False,70,591.0,591.0,0.000000
1,soa_replication,a280,TSP,0.25,10,True,70,638.0,638.0,0.000000
2,soa_replication,a280,TSP,0.50,10,False,140,1270.0,1270.0,0.000000
3,soa_replication,a280,TSP,0.50,10,True,140,1367.0,1367.0,0.000000
4,soa_replication,a280,TSP,0.75,10,False,210,1966.0,1966.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
565,soa_replication,ulysses22,TSP,0.25,10,True,5,663.0,663.0,0.000000
566,soa_replication,ulysses22,TSP,0.50,10,False,11,1438.0,1438.0,0.000000
567,soa_replication,ulysses22,TSP,0.50,10,True,11,1923.0,2049.1,121.411559
568,soa_replication,ulysses22,TSP,0.75,10,False,16,2780.0,2780.0,0.000000


## GA SOA vs Our SOA Replication

In [45]:
soa_replication_analysis = (
    processed_experiments["soa_replication"]
    .merge(
        soa[
            ["instance_name", "k_factor", "has_closed_cycle", "ga_w_nn"]
        ],
        on=["instance_name", "k_factor", "has_closed_cycle"],
        how="left"
    )
    .assign(
        abs_gap=lambda x: x["min_our_soa_replication"] - x["ga_w_nn"],
        percentage_gap=lambda x: np.where(
            x["ga_w_nn"] != 0,
            (x["abs_gap"] / x["ga_w_nn"]),
            np.nan
        )
    )
)

In [46]:
soa_replication_analysis

Unnamed: 0,experiment_name,instance_name,instance_symmetry,k_factor,repetitions,has_closed_cycle,k_size,min_our_soa_replication,avg_our_soa_replication,std_our_soa_replication,ga_w_nn,abs_gap,percentage_gap
0,soa_replication,a280,TSP,0.25,10,False,70,591.0,591.0,0.000000,606.0,-15.0,-0.024752
1,soa_replication,a280,TSP,0.25,10,True,70,638.0,638.0,0.000000,686.0,-48.0,-0.069971
2,soa_replication,a280,TSP,0.50,10,False,140,1270.0,1270.0,0.000000,1234.0,36.0,0.029173
3,soa_replication,a280,TSP,0.50,10,True,140,1367.0,1367.0,0.000000,1358.0,9.0,0.006627
4,soa_replication,a280,TSP,0.75,10,False,210,1966.0,1966.0,0.000000,1894.0,72.0,0.038015
...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,soa_replication,ulysses22,TSP,0.25,10,True,5,663.0,663.0,0.000000,747.0,-84.0,-0.112450
566,soa_replication,ulysses22,TSP,0.50,10,False,11,1438.0,1438.0,0.000000,1473.0,-35.0,-0.023761
567,soa_replication,ulysses22,TSP,0.50,10,True,11,1923.0,2049.1,121.411559,1902.0,21.0,0.011041
568,soa_replication,ulysses22,TSP,0.75,10,False,16,2780.0,2780.0,0.000000,2618.0,162.0,0.061879


In [47]:
soa_replication_analysis.groupby(["instance_name"]).agg(
    avg_abs_gap=("abs_gap", "mean"),
    avg_percentage_gap=("percentage_gap", "mean")
).sort_values(by="avg_percentage_gap", ascending=False).reset_index()

Unnamed: 0,instance_name,avg_abs_gap,avg_percentage_gap
0,swiss42,279.166667,1.505611
1,pr136,6918.500000,0.146642
2,burma14,110.333333,0.136879
3,kroB200,1855.333333,0.131760
4,gr96,2760.833333,0.128669
...,...,...,...
90,si535,,
91,ts225,,
92,tsp225,,
93,u574,,


In [48]:
soa_replication_analysis.groupby(["k_factor"]).agg(
    avg_abs_gap=("abs_gap", "mean"),
    avg_percentage_gap=("percentage_gap", "mean")
).sort_values(by="avg_percentage_gap", ascending=False).reset_index()

Unnamed: 0,k_factor,avg_abs_gap,avg_percentage_gap
0,0.75,1010.292929,0.080519
1,0.5,44.85,0.043687
2,0.25,-384.3,-0.058681


In [49]:
soa_replication_analysis.groupby(["has_closed_cycle"]).agg(
    avg_abs_gap=("abs_gap", "mean"),
    avg_percentage_gap=("percentage_gap", "mean")
).sort_values(by="avg_percentage_gap", ascending=False).reset_index()

Unnamed: 0,has_closed_cycle,avg_abs_gap,avg_percentage_gap
0,True,548.563758,0.052683
1,False,-104.413333,-0.009185


In [50]:
soa_replication_analysis[["abs_gap", "percentage_gap"]].mean()

abs_gap           220.983278
percentage_gap      0.021645
dtype: float64

## SOA Replication x Changes

In [51]:
processed_experiments["change__replace_operator"]

Unnamed: 0,experiment_name,instance_name,instance_symmetry,k_factor,repetitions,has_closed_cycle,k_size,min_our_change__replace_operator,avg_our_change__replace_operator,std_our_change__replace_operator
0,change__replace_operator,a280,TSP,0.25,30,False,70,591.0,591.000000,0.000000
1,change__replace_operator,a280,TSP,0.25,30,True,70,636.0,637.666667,0.758098
2,change__replace_operator,a280,TSP,0.50,30,False,140,1260.0,1268.833333,2.520035
3,change__replace_operator,a280,TSP,0.50,30,True,140,1362.0,1366.800000,0.924755
4,change__replace_operator,a280,TSP,0.75,30,False,210,1946.0,1961.666667,8.172002
...,...,...,...,...,...,...,...,...,...,...
565,change__replace_operator,ulysses22,TSP,0.25,30,True,5,637.0,637.866667,4.746929
566,change__replace_operator,ulysses22,TSP,0.50,30,False,11,1330.0,1434.400000,19.718012
567,change__replace_operator,ulysses22,TSP,0.50,30,True,11,1919.0,2120.666667,90.904775
568,change__replace_operator,ulysses22,TSP,0.75,30,False,16,2221.0,2742.866667,111.357063


In [52]:
df = processed_experiments["soa_replication"].copy()
for experiment_name in processed_experiments.keys():
    if experiment_name != "soa_replication":
        df = df.merge(
            processed_experiments[experiment_name][
                [
                    "instance_name", "k_factor", "has_closed_cycle", 
                    f"min_our_{experiment_name}", f"avg_our_{experiment_name}", f"std_our_{experiment_name}"
                ]
            ],
            on=["instance_name", "k_factor", "has_closed_cycle"],
            how="left",
            suffixes=("", f"_{experiment_name}")
        )

In [53]:
df.columns

Index(['experiment_name', 'instance_name', 'instance_symmetry', 'k_factor',
       'repetitions', 'has_closed_cycle', 'k_size', 'min_our_soa_replication',
       'avg_our_soa_replication', 'std_our_soa_replication',
       'min_our_change__replace_operator', 'avg_our_change__replace_operator',
       'std_our_change__replace_operator', 'min_our_change__ordered_crossover',
       'avg_our_change__ordered_crossover',
       'std_our_change__ordered_crossover',
       'min_our_change__variable_mutate_rate__replace_operator',
       'avg_our_change__variable_mutate_rate__replace_operator',
       'std_our_change__variable_mutate_rate__replace_operator',
       'min_our_change__all_changes', 'avg_our_change__all_changes',
       'std_our_change__all_changes',
       'min_our_change__variable_mutate_rate__ordered_crossover',
       'avg_our_change__variable_mutate_rate__ordered_crossover',
       'std_our_change__variable_mutate_rate__ordered_crossover',
       'min_our_change__replace_opera

In [56]:
changes_p_gap = (
    df
    .assign(
        ordered_crossover_p_gap=lambda x: (
            (x["min_our_change__ordered_crossover"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        ),
        variable_mutate_rate_p_gap=lambda x: (
            (x["min_our_change__variable_mutate_rate"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        ),
        replace_operator_p_gap=lambda x: (
            (x["min_our_change__replace_operator"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        ),
        replace_operator__ordered_crossover_p_gap=lambda x: (
            (x["min_our_change__replace_operator__ordered_crossover"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        ),
        variable_mutate_rate__replace_operator_p_gap=lambda x: (
            (x["min_our_change__variable_mutate_rate__replace_operator"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        ),
        variable_mutate_rate__ordered_crossover_p_gap=lambda x: (
            (x["min_our_change__variable_mutate_rate__ordered_crossover"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        ),
        all_changes_p_gap= lambda x: (
            (x["min_our_change__all_changes"] - x["min_our_soa_replication"]) / x["min_our_soa_replication"]
        )
    )
    [[
        "instance_name", "instance_symmetry", "k_factor", "has_closed_cycle",
        "k_size", "ordered_crossover_p_gap",
        "variable_mutate_rate_p_gap", "replace_operator_p_gap",
        "replace_operator__ordered_crossover_p_gap",
        "variable_mutate_rate__replace_operator_p_gap",
        "variable_mutate_rate__ordered_crossover_p_gap", "all_changes_p_gap"
    ]]
)

In [58]:
changes_p_gap.groupby(["instance_name"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replace_operator_p_gap=("replace_operator_p_gap", "mean"),
    avg_replace_operator__ordered_crossover_p_gap=("replace_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replace_operator_p_gap=("variable_mutate_rate__replace_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,instance_name,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replace_operator_p_gap,avg_replace_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replace_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,a280,-0.000261,-0.001873,-0.004377,-0.000261,-0.004377,-0.001655,-0.001788
1,ali535,-0.002867,-0.003185,-0.001855,-0.001484,-0.001855,-0.000507,-0.001484
2,att48,-0.015289,-0.008880,-0.002943,-0.015289,-0.002943,-0.010408,-0.016104
3,att532,-0.010660,0.000000,0.000000,-0.009121,0.000000,-0.010660,-0.009121
4,bayg29,-0.001088,-0.002297,-0.007833,-0.001088,-0.007833,-0.001001,-0.006862
...,...,...,...,...,...,...,...,...
90,u159,-0.004881,0.000000,0.000000,-0.004881,0.000000,-0.004881,-0.004881
91,u574,-0.000509,0.000000,-0.000441,-0.000509,-0.000441,-0.000509,-0.000509
92,u724,-0.000291,0.000000,0.000000,-0.000291,0.000000,-0.000291,-0.000291
93,ulysses16,0.003061,0.000000,0.000000,0.003061,0.000000,0.000000,0.003684


In [60]:
changes_p_gap.groupby(["instance_symmetry"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replace_operator_p_gap=("replace_operator_p_gap", "mean"),
    avg_replace_operator__ordered_crossover_p_gap=("replace_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replace_operator_p_gap=("variable_mutate_rate__replace_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,instance_symmetry,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replace_operator_p_gap,avg_replace_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replace_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,ATSP,-0.017863,-0.00337,-0.027599,-0.019779,-0.027599,-0.016661,-0.020136
1,TSP,-0.007887,-0.001405,-0.004189,-0.008011,-0.004189,-0.007827,-0.008044


In [62]:
changes_p_gap.groupby(["k_factor"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replace_operator_p_gap=("replace_operator_p_gap", "mean"),
    avg_replace_operator__ordered_crossover_p_gap=("replace_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replace_operator_p_gap=("variable_mutate_rate__replace_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,k_factor,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replace_operator_p_gap,avg_replace_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replace_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,0.25,-0.0086,-0.001577,-0.011093,-0.009592,-0.011093,-0.008846,-0.009723
1,0.5,-0.008106,-0.001839,-0.006585,-0.008624,-0.006585,-0.007554,-0.008968
2,0.75,-0.012843,-0.001959,-0.00876,-0.012771,-0.00876,-0.0123,-0.012588


In [64]:
changes_p_gap.groupby(["has_closed_cycle"]).agg(
    avg_ordered_crossover_p_gap=("ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate_p_gap=("variable_mutate_rate_p_gap", "mean"),
    avg_replace_operator_p_gap=("replace_operator_p_gap", "mean"),
    avg_replace_operator__ordered_crossover_p_gap=("replace_operator__ordered_crossover_p_gap", "mean"),
    avg_variable_mutate_rate__replace_operator_p_gap=("variable_mutate_rate__replace_operator_p_gap", "mean"),
    avg_variable_mutate_rate__ordered_crossover_p_gap=("variable_mutate_rate__ordered_crossover_p_gap", "mean"),
    avg_all_changes_p_gap=("all_changes_p_gap", "mean")
).reset_index()

Unnamed: 0,has_closed_cycle,avg_ordered_crossover_p_gap,avg_variable_mutate_rate_p_gap,avg_replace_operator_p_gap,avg_replace_operator__ordered_crossover_p_gap,avg_variable_mutate_rate__replace_operator_p_gap,avg_variable_mutate_rate__ordered_crossover_p_gap,avg_all_changes_p_gap
0,False,-0.009516,-0.001281,-0.007609,-0.009608,-0.007609,-0.008053,-0.009381
1,True,-0.010192,-0.002304,-0.010001,-0.011055,-0.010001,-0.011085,-0.011477


In [224]:
changes_p_gap[[
    "ordered_crossover_p_gap", "variable_mutate_rate_p_gap",
    "replacement_operator_p_gap", "replacement_operator__ordered_crossover_p_gap",
    "variable_mutate_rate__replacement_operator_p_gap",
    "variable_mutate_rate__ordered_crossover_p_gap", "all_changes_p_gap"
]].mean().sort_values(ascending=True)

model_name
replacement_operator__ordered_crossover_p_gap      -0.005187
ordered_crossover_p_gap                            -0.004981
replacement_operator_p_gap                         -0.003878
variable_mutate_rate__replacement_operator_p_gap   -0.003878
all_changes_p_gap                                  -0.003616
variable_mutate_rate__ordered_crossover_p_gap      -0.003362
variable_mutate_rate_p_gap                          0.000000
dtype: float64