In [169]:
import json
import pandas as pd
import warnings
import seaborn as sns
from pyprojroot import here
import plotly.express as px
import numpy as np
import math

methods = ["JustCopy", "TimeGAN", "Time-Transformer", "TransFusion", "TTS-GAN", "TimeVQVAE"]
datasets = ["D2", "D3", "D4", "D5", "D6", "D7"]

rows = []

for method in methods:
    for dataset in datasets:
        file_candidates = list(here('result').glob(f'numeric_{method}_{dataset}_*.json'))
        if len(file_candidates) != 1:
            warnings.warn(f"Ignoring {method} {dataset}: Expected one result file for {method} {dataset}, instead matched {file_candidates}.")
            continue
        file_path = file_candidates[0]
        
        with open(file_path) as f:
            data = json.load(f)
        
        rows.append(((method, dataset), data))

df_all = pd.DataFrame.from_dict(dict(rows), orient="index")
df_all.index = pd.MultiIndex.from_tuples(df_all.index, names=["Method", "Dataset"])

# load timings
def timings_path(method):
    paths = list(here("models").glob(f"*{method}*/timings.csv"))
    assert len(paths) == 1
    return paths[0]

timings = pd.DataFrame()
for method in methods:
    df = pd.read_csv(timings_path(method))
    df["Method"] = method
    timings = pd.concat([timings, df], ignore_index=True)

_timings = timings.melt(id_vars="Method", var_name="Dataset", value_name="Time").set_index(["Method", "Dataset"]).sort_index(level="Method")

measures_order = ["PS", "DS", "C-FID","MDD", "ACD",	"SD", "KD", "ED", "DTW", "Time"]
ranking_order = ["JustCopy", "TransFusion", "TimeVQVAE", "Time-Transformer", "TimeGAN", "TTS-GAN"]

def sortdf(df):
    return df.sort_index(level=0, key=lambda x: pd.Categorical(x, categories=ranking_order, ordered=True))

df_all = pd.concat([df_all, _timings], axis = 1).loc[ranking_order][measures_order]

In [170]:
import re

cm = sns.diverging_palette(h_neg=130, h_pos=0, as_cmap=True)


def style_results_table(df, separate_models=False):
    sci_columns = {"ED", "DTW", "C-FID"}
    decimal_columns = set([col[0] if isinstance(col, tuple) else col for col in df.columns]) - sci_columns

    styled = df.style

    # === Formatting === #

    import math
    import pandas as pd

    def decimal_fmt(x, sig=3):
        if pd.isna(x):
            return ""
        if x == 0:
            return "0"
        digits = sig - int(math.floor(math.log10(abs(x)))) - 1
        rounded = round(x, max(digits, 0))
        s = f"{rounded:.{max(digits,0)}f}"
        s = re.sub('\.0', '', s)
        return s

    def sci_fmt_unicode(x):
        if pd.isna(x):
            return ""
        if x == 0:
            return "0"
        exp = math.floor(math.log10(abs(x)))
        coeff = x / 10**exp
        if exp in {0,1,-1}:
            return f"{coeff:.3g}"
        superscripts = str.maketrans("0123456789-", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻")
        return f"{coeff:.3g}×10{str(exp).translate(superscripts)}"

    fmt = {}

    for col in df.columns:
        colname = col[0] if isinstance(col, tuple) else col
        if colname in sci_columns:
            fmt[col] = sci_fmt_unicode
        elif colname in decimal_columns:
            fmt[col] = decimal_fmt

    styled = styled.format(fmt)

    # === Background Gradients === #

    ranges = {
        "DS": (0.012, 0.476),
        "PS": (0.113, 0.279),
        "C-FID": (0, 1),
        "MDD": (0.0231, 1.05),
        "ACD": (0, 1),
        "SD": (0, 1),
        "KD": (0, 2.16),
        "ED": (0, 4),
        "DTW": (0, 14),
        "Time": (0, 961),
    }

    for col in df.columns:
        colname = col[0] if isinstance(col, tuple) else col
        if colname in ranges:
            vmin, vmax = ranges[colname]
            styled = styled.background_gradient(
                cmap=cm,
                vmin=vmin,
                vmax=vmax,
                subset=[col]   # important: now accepts MultiIndex column keys
            )


    if separate_models:
        level0 = df.index.get_level_values(0)
        boundaries = np.where(level0[:-1] != level0[1:])[0]

        def highlight_boundaries(row):
            i = df.index.get_loc(row.name)  # positional index
            styles = [''] * row.size
            if (i - 1) in boundaries:
                styles = ['border-top: 2px solid white'] * row.size
            return styles

        # --- apply to styler object ---
        styled = styled.apply(highlight_boundaries, axis=1)

    return styled



def group_measures(df):
    category_map = {
    "Utility": ["PS"],
    "Fidelity": ["DS", "C-FID", "MDD", "ACD", "SD", "KD", "ED", "DTW"],
    "Training Efficiency": ["Time"]
    }

    # Create a list of tuples (top-level, column name)
    new_cols = []
    for cat, cols in category_map.items():
        for col in cols:
            new_cols.append((cat, col))

    # Reorder df columns to match the new structure
    df = df[[col for _, col in new_cols]]

    # Apply MultiIndex to columns
    df.columns = pd.MultiIndex.from_tuples(new_cols)
    return df

df_no_copy = df_all.drop(index="JustCopy", level="Method")
tab_full_results = style_results_table(df_all, separate_models=True)
tab_full_results

Unnamed: 0_level_0,Unnamed: 1_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
JustCopy,D2,377.0,101.0,-1.67×10⁻¹²,317.0,0.0,0.0,0.0,0,0,0
JustCopy,D3,367.0,101.0,-2.12×10⁻¹²,282.0,0.0,0.0,0.0,0,0,0
JustCopy,D4,571.0,138.0,-2.43×10⁻¹³,264.0,0.0,0.0,0.0,0,0,0
JustCopy,D5,0.25,167.0,-2.42×10⁻¹⁵,607.0,0.0,0.0,0.0,0,0,0
JustCopy,D6,0.248,992.0,-3.29×10⁻¹⁵,711.0,0.0,0.0,0.0,0,0,0
JustCopy,D7,493.0,116.0,4.34×10⁻¹⁵,579.0,0.0,0.0,0.0,0,0,0
TransFusion,D2,398.0,0.147,1.03,0.794,336.0,0.533,2.63,1.07,2.77,43
TransFusion,D3,378.0,0.119,5.48×10⁻³,0.405,0.113,0.166,0.811,2.68,6.83,222
TransFusion,D4,549.0,314.0,1.04×10⁻²,0.292,402.0,0.118,0.243,2.81,8.95,454
TransFusion,D5,0.251,0.11,1.03×10⁻²,0.224,633.0,0.106,0.421,1.01,6.28,201


In [171]:
with open("full_results_table.tex", "w") as f:
    f.write(tab_full_results.to_latex())

In [172]:
def merge_datasets_mean(df, datasets_to_merge=[], new_name="D2-D5"):
    # Select rows to merge
    to_merge = df.loc[pd.IndexSlice[:, datasets_to_merge], :]
    
    # Compute mean grouped by Method
    merged = to_merge.groupby(level=0).mean()
    merged.index = pd.MultiIndex.from_product([merged.index, [new_name]], names=df.index.names)
    
    # Drop original 
    df_dropped = df.drop(datasets_to_merge, level=1)
    
    # Concatenate
    df_new = pd.concat([df_dropped, merged]).sort_index(level=0, key=lambda x: pd.Categorical(x, categories=ranking_order, ordered=True))

    df_new.index = df_new.index.rename(['Method','Seq. Length'])
    
    return df_new

def long_vs_short(df):
    return merge_datasets_mean(merge_datasets_mean(df, ["D2","D5"], "short (24)"), ["D3", "D6"], "long (128)").drop(level=1, index="D4").drop(level=1, index="D7")

df_long_vs_short = long_vs_short(df_all)
tab_long_vs_short = style_results_table(df_long_vs_short, separate_models=True)
tab_long_vs_short

Unnamed: 0_level_0,Unnamed: 1_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Seq. Length,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
JustCopy,long (128),0.142,100.0,-1.06×10⁻¹²,357.0,0.0,0.0,0.0,0,0,0.0
JustCopy,short (24),0.144,134.0,-8.38×10⁻¹³,305.0,0.0,0.0,0.0,0,0,0.0
TransFusion,long (128),0.144,0.175,4.69×10⁻³,0.302,0.112,0.102,0.473,2.61,1.07,804.0
TransFusion,short (24),0.145,0.129,5.67×10⁻²,0.509,485.0,0.32,1.53,1.04,4.53,122.0
TimeVQVAE,long (128),0.173,0.381,2.88,0.534,0.66,0.294,1.39,2.37,9.92,91.0
TimeVQVAE,short (24),0.203,0.385,2.89,0.556,0.462,0.336,1.53,1.09,4.65,73.5
Time-Transformer,long (128),0.162,0.399,3.54,0.681,0.595,0.475,25.0,2.47,1.04,46.5
Time-Transformer,short (24),0.205,0.412,2.14,0.509,0.257,0.394,1.71,1.13,4.68,39.5
TimeGAN,long (128),0.176,0.362,3.4,0.803,1.24,0.411,1.64,2.63,1.07,1312.0
TimeGAN,short (24),0.162,0.288,7.76×10⁻²,0.412,0.314,0.369,1.69,1.12,4.75,282.0


In [173]:
def long_or_short(df):
    df = long_vs_short(df)
    out = pd.DataFrame(index=df.index.levels[0], columns=df.columns)

    for lvl1 in df.index.levels[0]:
        subset = df.xs(lvl1, level=0)
        min_idx = subset.idxmin()
        out.loc[lvl1] = min_idx


    def color_cells(value):
        if value == "long (128)":
            return 'background-color: lightseagreen'
        if value == "short (24)":
            return 'background-color: lightblue'

    return out.style.applymap(color_cells)

tab_long_or_short = long_or_short(df_all.drop(index="JustCopy").drop(index="TTS-GAN"))
tab_long_or_short


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Time-Transformer,long (128),long (128),short (24),short (24),short (24),short (24),short (24),short (24),short (24),short (24)
TimeGAN,short (24),short (24),short (24),short (24),short (24),short (24),long (128),short (24),short (24),short (24)
TimeVQVAE,long (128),long (128),long (128),long (128),short (24),long (128),long (128),short (24),short (24),short (24)
TransFusion,long (128),short (24),long (128),long (128),short (24),long (128),long (128),short (24),short (24),short (24)


In [174]:
def long_minus_short(df):
    out = long_vs_short(df)

    out = out.groupby(level=0).diff()

    return out.loc[pd.IndexSlice[:, 'short (24)'], :]

style_results_table(
    long_minus_short(
        df_all
            .drop(index="JustCopy", level=0)
            .drop(index="TTS-GAN", level=0)
        ))

Unnamed: 0_level_0,Unnamed: 1_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Seq. Length,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TransFusion,short (24),997,-468,5.2×10⁻²,0.207,-636.0,0.218,15.0,-1.57,-6.22,-682.0
TimeVQVAE,short (24),305,416,6.65×10⁻⁴,220.0,-0.198,411.0,0.143,-1.28,-5.27,-17.5
Time-Transformer,short (24),437,133,-1.4,-0.172,-0.339,-808.0,-0.341,-1.34,-5.71,-70.0
TimeGAN,short (24),-133,-747,-2.63,-0.391,-0.925,-420.0,546.0,-1.51,-5.93,-1031.0


In [175]:
style_results_table(df_all.groupby(level=0).agg(["min","max", "mean"]).loc[ranking_order])

Unnamed: 0_level_0,PS,PS,PS,DS,DS,DS,C-FID,C-FID,C-FID,MDD,MDD,MDD,ACD,ACD,ACD,SD,SD,SD,KD,KD,KD,ED,ED,ED,DTW,DTW,DTW,Time,Time,Time
Unnamed: 0_level_1,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
JustCopy,367,0.25,0.113,992.0,167.0,120.0,-2.12×10⁻¹²,4.34×10⁻¹⁵,-6.73×10⁻¹³,264.0,711.0,231.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0.0
TransFusion,378,0.251,0.114,314.0,0.231,0.115,3.89×10⁻³,1.03,2.41×10⁻²,0.152,0.794,0.344,336.0,0.373,0.122,376.0,0.533,0.176,665.0,2.63,0.718,1.01,2.81,2.1,2.77,1.47,8.13,43,1387,527.0
TimeVQVAE,421,0.364,0.146,0.266,0.499,0.379,4.46×10⁻²,5.24,2.3,0.415,0.601,0.509,0.107,15.0,0.551,723.0,0.557,0.315,0.152,21.0,1.27,1.01,2.75,1.99,2.99,1.37,7.73,73,94,84.8
Time-Transformer,588,0.325,0.143,0.259,0.5,0.396,5×10⁻²,5.23,2.11,0.304,0.747,0.534,0.247,0.957,0.495,0.207,0.582,0.378,0.377,2.88,1.41,9.5,2.67,2.03,3.38,1.45,7.91,13,77,42.5
TimeGAN,379,0.312,0.143,934.0,0.5,0.286,2.01×10⁻²,6.27,2.49,0.3,0.865,0.583,0.128,2.33,0.717,0.197,0.549,0.346,0.498,2.37,1.31,1.06,2.87,2.14,3.05,1.47,8.22,259,1385,961.0
TTS-GAN,386,0.324,0.145,0.164,0.5,0.406,1.19,2.1×10¹¹,3.49×10¹⁰,0.436,1.39,0.78,0.135,8.1,2.1,0.335,11.0,0.596,0.534,4.28,1.84,9.6,4.45×10⁵,7.41×10⁴,2.5,2.67×10⁶,4.44×10⁵,659,762,696.0


In [176]:
style_results_table(df_all.groupby(level=0).quantile([.25,.5,.75]).unstack(level=1).loc[ranking_order])

Unnamed: 0_level_0,PS,PS,PS,DS,DS,DS,C-FID,C-FID,C-FID,MDD,MDD,MDD,ACD,ACD,ACD,SD,SD,SD,KD,KD,KD,ED,ED,ED,DTW,DTW,DTW,Time,Time,Time
Unnamed: 0_level_1,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75,0.25,0.5,0.75
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
JustCopy,406,532,0.2,101.0,109.0,132.0,-1.32×10⁻¹²,-1.23×10⁻¹³,-2.64×10⁻¹⁵,291.0,306.0,469.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
TransFusion,423,524,0.202,649.0,0.115,0.14,6.69×10⁻³,1.04×10⁻²,1.11×10⁻²,0.205,0.258,0.377,460.0,873.0,0.113,0.1,0.112,0.154,0.163,0.332,0.713,1.43,2.53,2.64,6.42,7.89,9.21,206,338.0,756.0
TimeVQVAE,512,622,0.239,0.284,0.371,0.477,6.04×10⁻²,1.35,4.14,0.463,0.493,0.574,0.182,0.524,0.912,0.22,0.336,0.389,1.1,1.39,1.61,1.43,2.3,2.39,6.2,7.41,8.68,77,87.0,92.5
Time-Transformer,672,785,0.211,0.304,0.41,0.499,1.02,1.73,2.46,0.421,0.568,0.626,0.277,0.398,0.648,0.263,0.344,0.504,0.648,13.0,2.24,1.57,2.4,2.47,6.06,7.36,8.77,19,41.5,63.2
TimeGAN,461,920,0.245,0.204,0.22,0.418,6.68×10⁻²,1.2,4.49,0.397,0.592,0.763,0.152,0.314,0.916,0.267,0.294,0.434,0.779,0.966,22.0,1.51,2.54,2.63,6.51,7.89,9.3,527,1224.0,1343.0
TTS-GAN,660,855,0.227,0.383,0.451,0.494,2.22,4.19,1.99,0.589,0.628,0.924,0.689,1.19,1.39,0.42,0.458,0.791,0.812,1.41,2.43,1.36,2.64,3.74,6.62,9.08,1.03,676,693.0,696.0


In [177]:
style_results_table(
    df_all
        .drop(index="TTS-GAN", level="Method")
        .drop(index="JustCopy", level="Method")
        .groupby(level="Dataset").agg(["min", "max", "mean"])
        .sort_values(("DS","mean"))
    )

Unnamed: 0_level_0,PS,PS,PS,DS,DS,DS,C-FID,C-FID,C-FID,MDD,MDD,MDD,ACD,ACD,ACD,SD,SD,SD,KD,KD,KD,ED,ED,ED,DTW,DTW,DTW,Time,Time,Time
Unnamed: 0_level_1,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean,min,max,mean
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2
D4,549.0,0.119,749.0,314.0,0.323,0.203,1.04×10⁻²,1.05,5.24×10⁻²,0.292,0.77,0.511,402.0,0.307,0.146,723.0,0.261,0.162,0.152,0.498,0.317,2.67,2.87,2.77,8.42,9.1,8.8,28,1195,443
D2,379.0,857.0,514.0,934.0,0.324,0.209,2.01×10⁻²,2.66,1.19,0.443,0.794,0.617,336.0,0.247,0.152,0.3,0.582,0.472,21.0,2.63,2.4,1.07,1.31,1.18,2.77,3.38,3.05,13,259,97
D3,378.0,713.0,493.0,0.119,0.298,0.227,5.48×10⁻³,1.85,7.41×10⁻²,0.405,0.865,0.653,0.113,0.49,0.256,0.166,0.533,0.291,0.811,2.88,1.9,2.4,2.68,2.55,6.16,6.83,6.49,16,1373,426
D7,500.0,653.0,596.0,499.0,0.496,0.295,1.13×10⁻²,5.54,2.08,0.152,0.415,0.293,0.373,17.0,0.84,980.0,0.557,0.31,665.0,1.62,0.751,2.26,2.51,2.4,8.51,9.37,9.01,55,1385,596
D5,0.251,0.364,0.307,0.11,0.5,0.398,1.03×10⁻²,4.91,1.99,0.224,0.511,0.376,633.0,0.772,0.389,0.106,0.371,0.237,0.421,16.0,0.833,9.5,1.06,1.01,5.98,6.46,6.26,66,304,161
D6,0.251,0.312,0.278,0.231,0.5,0.432,3.89×10⁻³,6.27,4.19,0.199,0.742,0.507,0.111,2.33,15.0,376.0,0.549,0.35,0.136,1.22,0.872,2.34,2.61,2.48,1.37,1.47,1.44,77,1387,701


In [178]:
style_results_table(
    df_all
        .groupby(level="Method").mean()
        .loc[ranking_order]
    )

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
JustCopy,0.113,120.0,-6.73×10⁻¹³,231.0,0.0,0.0,0.0,0,0,0.0
TransFusion,0.114,0.115,2.41×10⁻²,0.344,0.122,0.176,0.718,2.1,8.13,527.0
TimeVQVAE,0.146,0.379,2.3,0.509,0.551,0.315,1.27,1.99,7.73,84.8
Time-Transformer,0.143,0.396,2.11,0.534,0.495,0.378,1.41,2.03,7.91,42.5
TimeGAN,0.143,0.286,2.49,0.583,0.717,0.346,1.31,2.14,8.22,961.0
TTS-GAN,0.145,0.406,3.49×10¹⁰,0.78,2.1,0.596,1.84,7.41×10⁴,4.44×10⁵,696.0


In [179]:
style_results_table(df_all.groupby("Method").mean()["PS"].to_frame().sort_values("PS"))

Unnamed: 0_level_0,PS
Method,Unnamed: 1_level_1
JustCopy,0.113
TransFusion,0.114
Time-Transformer,0.143
TimeGAN,0.143
TTS-GAN,0.145
TimeVQVAE,0.146


In [180]:
style_results_table(df_all.groupby("Method").mean()["Time"].to_frame().sort_values("Time").sort_values("Time"))

Unnamed: 0_level_0,Time
Method,Unnamed: 1_level_1
JustCopy,0.0
Time-Transformer,42.5
TimeVQVAE,84.8
TransFusion,527.0
TTS-GAN,696.0
TimeGAN,961.0


In [181]:
style_results_table(
    df_all
        .groupby("Method").mean()
        .drop(columns=["Time", "PS"])
        .loc[ranking_order]
        )

Unnamed: 0_level_0,DS,C-FID,MDD,ACD,SD,KD,ED,DTW
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
JustCopy,120.0,-6.73×10⁻¹³,231.0,0.0,0.0,0.0,0,0
TransFusion,0.115,2.41×10⁻²,0.344,0.122,0.176,0.718,2.1,8.13
TimeVQVAE,0.379,2.3,0.509,0.551,0.315,1.27,1.99,7.73
Time-Transformer,0.396,2.11,0.534,0.495,0.378,1.41,2.03,7.91
TimeGAN,0.286,2.49,0.583,0.717,0.346,1.31,2.14,8.22
TTS-GAN,0.406,3.49×10¹⁰,0.78,2.1,0.596,1.84,7.41×10⁴,4.44×10⁵


## Spider Plots

In [182]:
rankings = df_no_copy.unstack(level=1).rank()

ranking_by_metric = rankings.stack().reset_index().groupby("Method").mean()
spider_by_metric = ranking_by_metric.reset_index().melt(id_vars="Method")
fig_spider_by_metric = px.line_polar(spider_by_metric, r="value", theta="variable", color="Method", line_close=True) \
    .update_layout(polar={"radialaxis": {"range": [5.9, 1], "dtick": 1}})
fig_spider_by_metric

In [183]:
ranking_by_dataset = rankings.stack(level=0).reset_index().groupby("Method").mean()
spider_by_dataset = ranking_by_dataset.reset_index().melt(id_vars="Method")
fig_spider_by_dataset = px.line_polar(spider_by_dataset, r="value", theta="Dataset", color="Method", line_close=True) \
    .update_layout(polar={"radialaxis": {"range": [5.9,1], "dtick": 1}})
fig_spider_by_dataset

In [184]:
import plotly.graph_objects as go
import scikit_posthocs as sp

def conover_test(df):
    df = df.unstack(level=1).rank().T.reset_index().melt(id_vars=['level_0', 'Dataset'], var_name='Method', value_name='Rank')
    posthoc = sp.posthoc_conover(df, group_col='Method', val_col='Rank', p_adjust='bonferroni')

    return posthoc

def pretty_conover(df):
    posthoc = conover_test(df)

    p_threshold = 0.05 / len(posthoc.columns)
    print(p_threshold)

    def highlight_below_threshold(val):
        color = 'background-color: turquoise' if val < p_threshold else ''
        return color

    return posthoc.style.applymap(highlight_below_threshold).format("{:.2e}")

def plot_on_number_line(s, title, axis_label, connections=[], textpositions=["top center","top center","top center","top center","top center"], range_max=None, range_min=None):
    colors = [color for _, color in zip(methods, px.colors.qualitative.Plotly)]

    fig = go.Figure(go.Scatter(
        x=s.values,
        y=[0]*len(s),
        mode="markers+text",
        text=s.index,
        textfont=dict(color=colors),
        textposition=textpositions,
        marker=dict(size=12, color=colors)
    ))

    if not range_max:
        range_max = s.max()
    if not range_min:
        range_min = s.min()
    h_margin = 0.1*(range_max-range_min)

    for i, (m1, m2) in enumerate(connections):
        if i%2==0:
            y_offset = 0.1
        else:
            y_offset = -0.1
        gray_shade = 50 + 200 * (i/len(connections))
        line_color = f"rgb(50,50,50)" #colors[i] #f"rgb({gray_shade},{gray_shade},{gray_shade})"
        fig.add_shape(
            type="line",
            x0=s[m1], x1=s[m2],
            y0=y_offset, y1=y_offset,
            line=dict(color=line_color, width=2),
            layer="below"
        )

    # Thin horizontal number line
    fig.add_shape(type="line",
                x0=range_min, x1=range_max,
                y0=0, y1=0,
                line=dict(color="black", width=1),
                layer="below")
    
    # Start line
    fig.add_shape(type="line",
                x0=range_min, x1=range_min,
                y0=-1, y1=1,
                line=dict(color="white", width=1),
                layer="below")

    # Layout tweaks for minimal look
    fig.update_yaxes(visible=False)
    fig.update_xaxes(range=[range_min-h_margin, range_max+h_margin], showgrid=True, zeroline=False)
    fig.update_layout(
        height=200,
        xaxis_title=axis_label,
        yaxis_title="",
        showlegend=False,
        margin=dict(t=60, b=60, l=150, r=150),
        title=title,
    )

    fig.update_layout(width=1000, height=210)
    #fig.show()
    fig.update_layout(
        title=None,
        margin=dict(t=60, b=60, l=50, r=50)
    )
    fig.write_image(f"{title}.pdf")
    return fig


def plot_average_rankings(df, title, connections=[], textpositions=["top center","bottom center","top center","top center","top center"]):
    s = rank(df).mean(axis=1)

    range_max = len(df.index.get_level_values(level="Method").unique())

    return plot_on_number_line(s, title=title, axis_label="Average Rankings", connections=connections, textpositions=textpositions, range_min=1, range_max=range_max)

def rank(df):
    return df.unstack(level=1).rank()



In [185]:
just_ranks = df_all.unstack(level=1).rank().transpose().reset_index(drop=True)
import plotly.express as px

rank_histogram = px.histogram(just_ranks, nbins=6, labels={'value':'rank'})
rank_histogram

In [186]:
connections = [("Time-Transformer", "TimeVQVAE"), ("TimeGAN", "Time-Transformer"), ("TimeGAN", "TTS-GAN")] # according to connover test.
all_measures_ranking = plot_average_rankings(df_no_copy, connections=connections, title="All Measures")
all_measures_ranking

In [187]:
pretty_conover(df_no_copy)

0.01


Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,1.03e-05,0.055,5.45e-07,2.56e-16
Time-Transformer,1.03e-05,1.0,0.29,1.0,0.00073
TimeGAN,0.055,0.29,1.0,0.0573,1.72e-08
TimeVQVAE,5.45e-07,1.0,0.0573,1.0,0.00679
TransFusion,2.56e-16,0.00073,1.72e-08,0.00679,1.0


In [188]:
connections=[("TimeVQVAE", "TimeGAN")]
plot_average_rankings(
    df_all
        .drop(index="JustCopy", level="Method")
        .drop(columns=["PS", "Time"]), 
    title="Fidelity",
    textpositions=["top center","bottom center","top left","top center","top center"],
    connections=connections)

In [189]:
pretty_conover(df_no_copy.drop(columns=["PS", "Time"]))

0.01


Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,0.000617,0.0173,3.57e-06,5.45e-15
Time-Transformer,0.000617,1.0,1.0,1.0,6.01e-05
TimeGAN,0.0173,1.0,1.0,0.396,8.03e-07
TimeVQVAE,3.57e-06,1.0,0.396,1.0,0.00616
TransFusion,5.45e-15,6.01e-05,8.03e-07,0.00616,1.0


In [190]:
connections=[("TTS-GAN", "TimeGAN"), ("TimeVQVAE", "TransFusion")]
plot_average_rankings(
    df_no_copy["PS"].to_frame(), 
    connections=connections, 
    title="Utility", 
    textpositions=["top center","top right","bottom right","top center","top center"]
)

In [191]:
pretty_conover(df_no_copy["PS"].to_frame())

0.01


Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,1.0,1.0,1.0,0.0318
Time-Transformer,1.0,1.0,1.0,1.0,0.0561
TimeGAN,1.0,1.0,1.0,1.0,0.167
TimeVQVAE,1.0,1.0,1.0,1.0,0.0977
TransFusion,0.0318,0.0561,0.167,0.0977,1.0


In [192]:
from scipy.stats import friedmanchisquare

df = df_no_copy.unstack(level=1).T
ranks = df.rank(axis=0, method='average', ascending=False)

stat, p = friedmanchisquare(*[ranks.loc[m].values for m in ranks.index])
p

4.416202563276414e-27

In [193]:
import pandas as pd

def leaderboard(df):
    out = pd.DataFrame(index=df.index.levels[1], columns=df.columns)

    for lvl2 in df.index.levels[1]:
        subset = df.xs(lvl2, level=1)
        min_idx = subset.idxmin()
        out.loc[lvl2] = min_idx


    def color_cells(value):
        if value == "TTS-GAN":
            return 'background-color: yellow'
        if value == "TransFusion":
            return 'background-color: lightgreen'
        if value == "TimeGAN":
            return 'background-color: lightblue'
        if value == "Time-Transformer":
            return 'background-color: pink'
        if value == "TimeVQVAE":
            return 'background-color: turquoise'
        if value == "JustCopy":
            return 'background-color: lightseagreen'

    return out.style.applymap(color_cells)

def worst_board(df):
    out = pd.DataFrame(index=df.index.levels[1], columns=df.columns)

    for lvl2 in df.index.levels[1]:
        subset = df.xs(lvl2, level=1)
        max_idx = subset.idxmax()
        out.loc[lvl2] = max_idx


    def color_cells(value):
        if value == "TTS-GAN":
            return 'background-color: yellow'
        if value == "TransFusion":
            return 'background-color: lightgreen'
        if value == "TimeGAN":
            return 'background-color: lightblue'
        if value == "Time-Transformer":
            return 'background-color: pink'
        if value == "TimeVQVAE":
            return 'background-color: turquoise'
        if value == "JustCopy":
            return 'background-color: lightseagreen'

    return out.style.applymap(color_cells)


leaderboard(df_all)


Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D3,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D4,TransFusion,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D5,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D6,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D7,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy


In [194]:
leaderboard(df_no_copy)

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TransFusion,TimeVQVAE,TimeVQVAE,TTS-GAN,TTS-GAN,Time-Transformer
D3,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer
D4,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer,Time-Transformer,Time-Transformer
D5,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,Time-Transformer,Time-Transformer,Time-Transformer
D6,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer
D7,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer


In [195]:
leaderboard(df_all[df_all.index.get_level_values(level=0).isin(["TTS-GAN", "TimeGAN"])])

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN,TTS-GAN,TimeGAN,TTS-GAN,TTS-GAN,TimeGAN
D3,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN
D4,TTS-GAN,TimeGAN,TimeGAN,TTS-GAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN,TTS-GAN,TTS-GAN
D5,TTS-GAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN,TTS-GAN,TTS-GAN,TimeGAN
D6,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN
D7,TimeGAN,TimeGAN,TTS-GAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN,TimeGAN,TimeGAN,TTS-GAN


In [196]:
leaderboard(df_all[df_all.index.get_level_values(level=0).isin(["Time-Transformer", "TimeGAN"])])

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,Time-Transformer
D3,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer
D4,Time-Transformer,TimeGAN,Time-Transformer,Time-Transformer,TimeGAN,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer
D5,TimeGAN,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer
D6,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer
D7,Time-Transformer,TimeGAN,Time-Transformer,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer


In [197]:
worst_board(df_all)

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,Time-Transformer,Time-Transformer,TTS-GAN,TransFusion,Time-Transformer,Time-Transformer,TTS-GAN,Time-Transformer,Time-Transformer,TTS-GAN
D3,Time-Transformer,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TimeGAN
D4,TimeGAN,TTS-GAN,TTS-GAN,TimeGAN,TTS-GAN,TTS-GAN,TTS-GAN,TimeGAN,TimeGAN,TimeGAN
D5,TimeVQVAE,Time-Transformer,TimeVQVAE,TTS-GAN,TimeVQVAE,TTS-GAN,TimeVQVAE,TimeGAN,TimeGAN,TTS-GAN
D6,TTS-GAN,Time-Transformer,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TransFusion
D7,TTS-GAN,Time-Transformer,TimeGAN,TTS-GAN,TTS-GAN,TimeVQVAE,TimeVQVAE,TTS-GAN,TTS-GAN,TimeGAN


In [198]:
rankings

Unnamed: 0_level_0,PS,PS,PS,PS,PS,PS,DS,DS,DS,DS,...,DTW,DTW,DTW,DTW,Time,Time,Time,Time,Time,Time
Dataset,D2,D3,D4,D5,D6,D7,D2,D3,D4,D5,...,D4,D5,D6,D7,D2,D3,D4,D5,D6,D7
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
TransFusion,3.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,4.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,5.0,4.0
TimeVQVAE,4.0,3.0,2.0,5.0,3.0,3.0,4.0,3.0,4.0,4.0,...,3.0,4.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0
Time-Transformer,5.0,5.0,3.0,4.0,2.0,2.0,5.0,4.0,3.0,5.0,...,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
TimeGAN,1.0,2.0,5.0,3.0,4.0,4.0,1.0,2.0,2.0,2.0,...,5.0,5.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0,5.0
TTS-GAN,2.0,4.0,4.0,2.0,5.0,5.0,3.0,5.0,5.0,3.0,...,2.0,2.0,5.0,5.0,5.0,4.0,4.0,5.0,3.0,3.0


In [199]:
df = df_no_copy.unstack(level=1).rank()
utility_cols = [col for col in df.columns if col[0] in ["PS"]]

fidelity_cols = [col for col in df.columns if col[0] not in ["PS", "Time"]]

timing_cols = [col for col in df.columns if col[0] in ["Time"]]

avg_utility = df[utility_cols].mean(axis=1)
avg_fidelity = df[fidelity_cols].mean(axis=1)
avg_timing = df[timing_cols].mean(axis=1)

rankings_metrics_grouped = pd.DataFrame({
    "Utility": avg_utility,
    "Fidelity": avg_fidelity,
    "Efficiency": avg_timing
})

rankings_metrics_grouped.style.background_gradient(cmap=cm)

Unnamed: 0_level_0,Utility,Fidelity,Efficiency
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TransFusion,1.333333,1.875,3.333333
TimeVQVAE,3.333333,2.75,2.166667
Time-Transformer,3.5,3.041667,1.0
TimeGAN,3.166667,3.270833,4.5
TTS-GAN,3.666667,4.0625,4.0


In [200]:
fig_spider_utility_fidelity_efficiency = px.line_polar(rankings_metrics_grouped.reset_index().melt(id_vars="Method"), r='value', theta='variable', color="Method", line_close=True) \
    .update_layout(polar={"radialaxis": {"range": [5.9,1], "dtick": 1}})
fig_spider_utility_fidelity_efficiency


In [201]:
df = df_no_copy.unstack(level=1).rank()
model_based_cols = [col for col in df.columns if col[0] in ["PS", "DS"]]

feature_based_cols = [col for col in df.columns if col[0] not in ["PS", "Time", "DS", "DTW", "ED"]]

efficiency_based_cols = [col for col in df.columns if col[0] in ["Time"]]

distance_based_cols = [col for col in df.columns if col[0] in ["DTW", "ED"]]

avg_model_based = df[model_based_cols].mean(axis=1)
avg_feature_based = df[feature_based_cols].mean(axis=1)
avg_efficiency_based = df[efficiency_based_cols].mean(axis=1)
avg_distance_based = df[distance_based_cols].mean(axis=1)

rankings_tsgbench_metric_groups = pd.DataFrame({
    "Model Based": avg_model_based,
    "Feature Based": avg_feature_based,
    "Efficiency Based": avg_efficiency_based,
    "Distance Based": avg_distance_based
})

rankings_tsgbench_metric_groups.style.background_gradient(cmap=cm)

Unnamed: 0_level_0,Model Based,Feature Based,Efficiency Based,Distance Based
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TransFusion,1.25,1.466667,3.333333,3.25
TimeVQVAE,3.333333,2.866667,2.166667,2.166667
Time-Transformer,3.916667,3.133333,1.0,2.166667
TimeGAN,2.666667,3.166667,4.5,4.083333
TTS-GAN,3.833333,4.366667,4.0,3.333333


In [202]:
fig_spider_tsgbench_measure_groups = px.line_polar(rankings_tsgbench_metric_groups.reset_index().melt(id_vars="Method"), r='value', theta='variable', color="Method", line_close=True) \
    .update_layout(polar={"radialaxis": {"range": [5.9,1], "dtick": 1}})
fig_spider_tsgbench_measure_groups

In [203]:
timings.set_index("Method").drop(index="JustCopy")

Unnamed: 0_level_0,D2,D3,D4,D5,D6,D7
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TimeGAN,259,1373,1195,304,1252,1385
Time-Transformer,13,16,28,66,77,55
TransFusion,43,222,454,201,1387,856
TTS-GAN,672,696,762,696,690,659
TimeVQVAE,73,94,94,74,88,86


In [214]:
average_timings = timings.set_index("Method").mean(axis=1).to_frame(name="Average Wall Clock Time").drop(index="JustCopy")


fig_time_plot = plot_on_number_line(\
    average_timings["Average Wall Clock Time"], \
    title = "Average Wall Clock Time", \
    axis_label = "minutes", \
    range_min=1, \
    range_max=1000, \
    textpositions=["top center","bottom center","top center","bottom center","top center"])
fig_time_plot

# Visualizations for paper

## Benchmark Results

In [205]:
leaderboard(df_all)

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D3,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D4,TransFusion,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D5,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D6,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy
D7,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy,JustCopy


The overall leaderboard shows the best scoring method on each dataset in each measure. As is clearly visible, the simple "JustCopy" method, that emulates the behavioiur a perfectly overfitted model, scores best in all but one cases.

conclusion notes:
- This shows, that the benchmark does not have a metric to measure a key aspect of a good generator: novelty. (related work: quality of a generator: novelty)

In [206]:
leaderboard(df_no_copy)

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TransFusion,TimeVQVAE,TimeVQVAE,TTS-GAN,TTS-GAN,Time-Transformer
D3,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer
D4,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer,Time-Transformer,Time-Transformer
D5,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,Time-Transformer,Time-Transformer,Time-Transformer
D6,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer
D7,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer


The leaderboard without the cheating JustCopy method TransFusion is the best method in most pairings, but all methods except for TTS-GAN are best in some measure-dataset combination. Notably TransFusion is not best in ED and DTW in most datasets, overtaken by TimeVQVAE and Time-Transformer respectively. One of the clearest results is Time-Transformer being consistently the fastest method. TTS-GAN makes no appearence on the leaderboard.

In [207]:
worst_board(df_all)

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,Time-Transformer,Time-Transformer,TTS-GAN,TransFusion,Time-Transformer,Time-Transformer,TTS-GAN,Time-Transformer,Time-Transformer,TTS-GAN
D3,Time-Transformer,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TimeGAN
D4,TimeGAN,TTS-GAN,TTS-GAN,TimeGAN,TTS-GAN,TTS-GAN,TTS-GAN,TimeGAN,TimeGAN,TimeGAN
D5,TimeVQVAE,Time-Transformer,TimeVQVAE,TTS-GAN,TimeVQVAE,TTS-GAN,TimeVQVAE,TimeGAN,TimeGAN,TTS-GAN
D6,TTS-GAN,Time-Transformer,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TTS-GAN,TransFusion
D7,TTS-GAN,Time-Transformer,TimeGAN,TTS-GAN,TTS-GAN,TimeVQVAE,TimeVQVAE,TTS-GAN,TTS-GAN,TimeGAN


Looking at the worst scores TTS-GAN is very consistently ranks last in all metrics on all datasets, except for PS and Time, where it only scores last on 3 and 2 of 6 datasets respectively.

In [208]:
style_results_table(
    df_all
        .groupby(level="Method").mean()
        .loc[ranking_order]
    )

Unnamed: 0_level_0,PS,DS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
JustCopy,0.113,120.0,-6.73×10⁻¹³,231.0,0.0,0.0,0.0,0,0,0.0
TransFusion,0.114,0.115,2.41×10⁻²,0.344,0.122,0.176,0.718,2.1,8.13,527.0
TimeVQVAE,0.146,0.379,2.3,0.509,0.551,0.315,1.27,1.99,7.73,84.8
Time-Transformer,0.143,0.396,2.11,0.534,0.495,0.378,1.41,2.03,7.91,42.5
TimeGAN,0.143,0.286,2.49,0.583,0.717,0.346,1.31,2.14,8.22,961.0
TTS-GAN,0.145,0.406,3.49×10¹⁰,0.78,2.1,0.596,1.84,7.41×10⁴,4.44×10⁵,696.0


For a better understanding of the margins by which the methods differ, we can look at the mean scores of each methods metrics over all datasets. The table paints a pretty clear picture in two regards. First, JustCopy plays the benchmark, scoring perfect or near perfect scores in all measures. Second, TTS-GANs scores are worse than all other benchmarked methods by, in cases, orders of magnitude in all metrics except for Time.

conclusion notes:
- TTS-GAN off the chart bad
- Just Copy perfect scores (whats going on with MDD and C-FID?)
- ED and DTW show almost no difference between methods (except for TTS-GAN)

In [209]:
all_measures_ranking

To arrive at an overall Ranking of the benchmarked methods, I followed the approach presented by [todo cite angeetal] to compute the average ranking of each method in each measure across all datasets. To statistically validate the rankings, a Friedman test [cite] along with a Conover's test for ranking comparisons was employed. Figure [above] presents the average rankings of each method. Methods that the Conover's test did not find statistically different are connected with a horizontal bar.

TransFusion ranks best, followed by TimeVQVAE, Time-Transformer, TimeGAN in the middle, and last by a margin TTS-GAN. The Conover's test clearly separates TransFusion to rank highest and TTS-GAN to rank lowest. In the middle the Conover's test only separates TimeVQVAE from TimeGAN, with Time-Transformer between them not ranking significantly lower than one or higher than the other.

In [210]:
fig_spider_by_metric

The spider plot shows the average ranking of each method per metric. The average ranking on the metrics measuring the closeness of the distributions (KD, SD, ACD, MDD and C-FID) is consistent with the overall ranking result, TransFusion being clearly first, TTS-GAN clearly last and Time-Transformer, TimeGAN and TimeVQVAE close together in the middle, with TimeVQVAE consistently taking the second rank. The model based metrics (PS, DS) show the same first and last place, but interestingly TimeGAN is consistently second. The average ranking for the distance based measures (ED, DTW) differs strongly from the overall ranking, with TimeVQVAE and Time-Transformer ranking first and second, followed by TransFusion in just the third place, TimeGAN fourth and TTS-GAN last. The average ranking of the methods in the Time metric is again distinctly from the overall ranking, with Time-Transformer consistently ranking first, followed by Time-VQVAE, TransFusion, TTS-GAN and last, TimeGAN.

conclusion notes:
- overall ranking biased towards the thingn measured by the most measures
- distance based measures have higher values on longer sequence lengths
- ED and DTW rankings look more dramatic than the numeric results would warrant

In [211]:
fig_spider_tsgbench_measure_groups

In [212]:
fig_spider_utility_fidelity_efficiency

In [215]:
fig_time_plot

In [216]:
fig_spider_by_dataset

Figure [fig] shows the average rankings of each method by dataset. The training-data with the longer sequence length git 

### <s>Impact of Training Data Sequence Lenght on Benchmark Performance</s>