In [1]:
import json
import pandas as pd
import warnings
import seaborn as sns
from pyprojroot import here
import plotly.express as px

methods = ["JustCopy", "TimeGAN", "Time-Transformer", "TransFusion", "TTS-GAN", "TimeVQVAE"]
datasets = ["D2", "D3", "D4", "D5", "D6", "D7"]

rows = []

for method in methods:
    for dataset in datasets:
        file_candidates = list(here('result').glob(f'numeric_{method}_{dataset}_*.json'))
        if len(file_candidates) != 1:
            warnings.warn(f"Ignoring {method} {dataset}: Expected one result file for {method} {dataset}, instead matched {file_candidates}.")
            continue
        file_path = file_candidates[0]
        
        with open(file_path) as f:
            data = json.load(f)
        
        rows.append(((method, dataset), data))

df_all = pd.DataFrame.from_dict(dict(rows), orient="index")
df_all.index = pd.MultiIndex.from_tuples(df_all.index, names=["Method", "Dataset"])

# load timings
def timings_path(method):
    paths = list(here("models").glob(f"*{method}*/timings.csv"))
    assert len(paths) == 1
    return paths[0]

timings = pd.DataFrame()
for method in methods:
    df = pd.read_csv(timings_path(method))
    df["Method"] = method
    timings = pd.concat([timings, df], ignore_index=True)

_timings = timings.melt(id_vars="Method", var_name="Dataset", value_name="Time").set_index(["Method", "Dataset"]).sort_index(level="Method")

df_all = pd.concat([df_all, _timings], axis = 1)

In [2]:
cm = sns.diverging_palette(h_neg=130, h_pos=0, as_cmap=True)

def style_results_table(df):
    return df.style\
    .background_gradient(cmap=cm, vmin=0, vmax=0.5, subset=["DS", "PS"])\
    .background_gradient(cmap=cm, vmin=0, vmax=1, subset=["C-FID"])\
    .background_gradient(cmap=cm, vmin=0, vmax=2, subset=["MDD"])\
    .background_gradient(cmap=cm, vmin=0, vmax=1, subset=["ACD"])\
    .background_gradient(cmap=cm, vmin=0, vmax=1, subset=["SD"])\
    .background_gradient(cmap=cm, vmin=0, vmax=3, subset=["KD"])\
    .background_gradient(cmap=cm, vmin=0, vmax=3, subset=["ED"])\
    .background_gradient(cmap=cm, vmin=0, vmax=15, subset=["DTW"])\
    .background_gradient(cmap=cm, vmin=0, vmax=1000, subset=["Time"])\
    .format({"DTW": "{:.2e}", "C-FID": "{:.2e}", "ED": "{:.2e}"})

# Note: 
# Some methods were trained on old D7 and or evaluated on old D7
df_no_copy = df_all.drop(index="JustCopy", level="Method")
style_results_table(df_all)

Unnamed: 0_level_0,Unnamed: 1_level_0,DS,PS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Method,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
JustCopy,D2,0.010101,0.03772,-1.67e-12,0.000317,0.0,0.0,0.0,0.0,0.0,0
JustCopy,D3,0.01014,0.036687,-2.12e-12,0.000282,0.0,0.0,0.0,0.0,0.0,0
JustCopy,D4,0.013775,0.05706,-2.43e-13,0.000264,0.0,0.0,0.0,0.0,0.0,0
JustCopy,D5,0.016671,0.250396,-2.42e-15,0.060657,0.0,0.0,0.0,0.0,0.0,0
JustCopy,D6,0.016478,0.248458,0.000134,0.08606,0.004639,0.0045,0.007488,2.55,14.6,0
JustCopy,D7,0.011593,0.049266,4.34e-15,0.005795,0.0,0.0,0.0,0.0,0.0,0
TTS-GAN,D2,0.5,0.136,1.73e+17,1.002635,4.518185,1.487815,3.213125,159000000.0,565000000.0,646
TTS-GAN,D3,0.48908,0.694247,240000000000.0,1.020405,7.752654,1.268523,3.031921,1060000.0,2700000.0,660
TTS-GAN,D4,0.404567,0.273916,58500000000000.0,1.015425,7.849365,0.333326,1.879808,14600000.0,42200000.0,649
TTS-GAN,D5,0.5,0.279841,119000000000.0,1.009141,4.521533,1.397439,1.732348,86400.0,570000.0,649


In [3]:
style_results_table(df_all.swaplevel("Method", "Dataset")\
    .sort_index(level=["Dataset", "Method"]))

Unnamed: 0_level_0,Unnamed: 1_level_0,DS,PS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
D2,JustCopy,0.010101,0.03772,-1.67e-12,0.000317,0.0,0.0,0.0,0.0,0.0,0
D2,TTS-GAN,0.5,0.136,1.73e+17,1.002635,4.518185,1.487815,3.213125,159000000.0,565000000.0,646
D2,Time-Transformer,0.324495,0.085745,0.266,0.629389,0.24659,0.582419,2.577668,1.31,3.38,13
D2,TimeGAN,0.093434,0.037907,0.0201,0.443126,0.174772,0.472857,2.370855,1.18,3.05,259
D2,TimeVQVAE,0.271212,0.042083,0.0861,0.600689,0.151148,0.300332,2.007588,1.16,2.99,73
D2,TransFusion,0.14697,0.039753,0.103,0.793641,0.033649,0.533438,2.629961,1.07,2.77,43
D3,JustCopy,0.01014,0.036687,-2.12e-12,0.000282,0.0,0.0,0.0,0.0,0.0,0
D3,TTS-GAN,0.48908,0.694247,240000000000.0,1.020405,7.752654,1.268523,3.031921,1060000.0,2700000.0,660
D3,Time-Transformer,0.297712,0.071347,0.185,0.747137,0.489822,0.532613,2.88285,2.48,6.3,16
D3,TimeGAN,0.224909,0.039703,0.0542,0.864601,0.144977,0.272758,2.354675,2.64,6.67,1373


## Spider Plots

In [21]:
rankings = df_no_copy.unstack(level=1).rank()

ranking_by_metric = rankings.stack().reset_index().groupby("Method").mean()
spider_by_metric = ranking_by_metric.reset_index().melt(id_vars="Method")
px.line_polar(spider_by_metric, r="value", theta="variable", color="Method", line_close=True) \
    .update_layout(polar={"radialaxis": {"range": [5.9, 1], "dtick": 1}})

In [22]:
ranking_by_dataset = rankings.stack(level=0).reset_index().groupby("Method").mean()
spider_by_dataset = ranking_by_dataset.reset_index().melt(id_vars="Method")
px.line_polar(spider_by_dataset, r="value", theta="Dataset", color="Method", line_close=True) \
    .update_layout(polar={"radialaxis": {"range": [5.9,1], "dtick": 1}})

In [23]:
import plotly.graph_objects as go


def plot_average_rankings(df, title, y_offsets= [1, -1, 1, -1, 1, -1]):
    s = df.unstack(level=1).rank().mean(axis=1)

    colors = [color for _, color in zip(methods, px.colors.qualitative.Plotly)]


    fig = go.Figure(go.Scatter(
        x=s.values,
        y=[0]*len(s),
        mode="markers+text",
        text=s.index,
        textfont=dict(color=colors),
        textposition=["top center" if y>0 else "bottom center" for y in y_offsets],
        marker=dict(size=12, color=colors)
    ))

    range_max = len(df.index.get_level_values(level="Method").unique())
    print(range_max)

    # Thin horizontal number line
    fig.add_shape(type="line",
                x0=1, x1=range_max,
                y0=0, y1=0,
                line=dict(color="black", width=1),
                layer="below")

    # Layout tweaks for minimal look
    fig.update_yaxes(visible=False)
    fig.update_xaxes(range=[0.8, range_max +0.2], showgrid=True, zeroline=False)
    fig.update_layout(
        height=200,
        xaxis_title="Average Rank",
        yaxis_title="",
        showlegend=False,
        margin=dict(t=60, b=60, l=150, r=150),
        title=title,
    )

    fig.show()

plot_average_rankings(df_no_copy, "All Measures", [-1, 1, -1, -1, 1])


5


In [24]:
import scikit_posthocs as sp

def conover_test(df):
    df = df.unstack(level=1).rank().T.reset_index().melt(id_vars=['level_0', 'Dataset'], var_name='Method', value_name='Rank')
    posthoc = sp.posthoc_conover(df, group_col='Method', val_col='Rank', p_adjust='holm')

    def highlight_below_threshold(val):
        color = 'background-color: turquoise' if val < 0.01 else ''
        return color

    return posthoc.style.applymap(highlight_below_threshold).format("{:.2e}")

conover_test(df_no_copy)

Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,5.420000000000001e-21,7e-18,3.41e-26,1.85e-38
Time-Transformer,5.420000000000001e-21,1.0,0.333,0.255,7.61e-06
TimeGAN,7e-18,0.333,1.0,0.039,6.7e-08
TimeVQVAE,3.41e-26,0.255,0.039,1.0,0.00329
TransFusion,1.85e-38,7.61e-06,6.7e-08,0.00329,1.0


In [8]:
plot_average_rankings(df_all.drop(index="JustCopy", level="Method").drop(columns=["PS", "Time"]), "Fidelity")

5


In [25]:
conover_test(df_no_copy.drop(columns=["PS", "Time"]))

Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,5.52e-19,1.38e-20,3.520000000000001e-27,5.36e-39
Time-Transformer,5.52e-19,1.0,0.594,0.0299,1.85e-08
TimeGAN,1.38e-20,0.594,1.0,0.0802,2.59e-07
TimeVQVAE,3.520000000000001e-27,0.0299,0.0802,1.0,0.00177
TransFusion,5.36e-39,1.85e-08,2.59e-07,0.00177,1.0


In [26]:
plot_average_rankings(df_no_copy.drop(columns=["Time"]), "Without Timing")

5


In [27]:
conover_test(df_no_copy.drop(columns=["Time"]))

Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,2.74e-19,1.91e-22,3.42e-27,5.620000000000001e-43
Time-Transformer,2.74e-19,1.0,0.323,0.0494,2.09e-10
TimeGAN,1.91e-22,0.323,1.0,0.311,4.84e-08
TimeVQVAE,3.42e-27,0.0494,0.311,1.0,4.05e-05
TransFusion,5.620000000000001e-43,2.09e-10,4.84e-08,4.05e-05,1.0


In [28]:
plot_average_rankings(df_no_copy["PS"], "Predictive Score", [1, 1, 1, -1, 1])

5


In [29]:
conover_test(df_no_copy["PS"].to_frame())

Unnamed: 0,TTS-GAN,Time-Transformer,TimeGAN,TimeVQVAE,TransFusion
TTS-GAN,1.0,0.379,0.0241,0.379,0.000182
Time-Transformer,0.379,1.0,0.531,1.0,0.0241
TimeGAN,0.0241,0.531,1.0,0.531,0.379
TimeVQVAE,0.379,1.0,0.531,1.0,0.0241
TransFusion,0.000182,0.0241,0.379,0.0241,1.0


In [30]:
from scipy.stats import friedmanchisquare

df = df_no_copy.unstack(level=1).T
ranks = df.rank(axis=0, method='average', ascending=False)

stat, p = friedmanchisquare(*[ranks.loc[m].values for m in ranks.index])
p

6.268300811770057e-19

In [31]:
import pandas as pd

def leaderboard(df):
    out = pd.DataFrame(index=df.index.levels[1], columns=df.columns)

    for lvl2 in df.index.levels[1]:
        subset = df.xs(lvl2, level=1)
        min_idx = subset.idxmin()
        out.loc[lvl2] = min_idx


    def color_cells(value):
        if value == "TTS-GAN":
            return 'background-color: yellow'
        if value == "TransFusion":
            return 'background-color: lightgreen'
        if value == "TimeGAN":
            return 'background-color: lightblue'
        if value == "Time-Transformer":
            return 'background-color: pink'
        if value == "TimeVQVAE":
            return 'background-color: turquoise'

    return out.style.applymap(color_cells)

leaderboard(df_no_copy)

Unnamed: 0_level_0,DS,PS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TransFusion,TimeVQVAE,TimeVQVAE,TransFusion,TransFusion,Time-Transformer
D3,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer
D4,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer,Time-Transformer,Time-Transformer
D5,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,Time-Transformer,Time-Transformer,Time-Transformer
D6,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TransFusion,TimeVQVAE,TimeVQVAE,Time-Transformer
D7,TransFusion,TimeGAN,TransFusion,TimeGAN,TransFusion,TimeVQVAE,TimeVQVAE,TransFusion,TransFusion,Time-Transformer


In [16]:
leaderboard(df_all[df_all.index.get_level_values(level=0).isin(["TTS-GAN", "TimeGAN"])])

Unnamed: 0_level_0,DS,PS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN
D3,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN
D4,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN
D5,TimeGAN,TTS-GAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN
D6,TTS-GAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN
D7,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TTS-GAN


In [17]:
leaderboard(df_all[df_all.index.get_level_values(level=0).isin(["Time-Transformer", "TimeGAN"])])

Unnamed: 0_level_0,DS,PS,C-FID,MDD,ACD,SD,KD,ED,DTW,Time
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
D2,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,TimeGAN,Time-Transformer
D3,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer
D4,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer,TimeGAN,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer
D5,TimeGAN,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer
D6,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,Time-Transformer,TimeGAN,Time-Transformer,Time-Transformer,Time-Transformer
D7,TimeGAN,TimeGAN,TimeGAN,TimeGAN,Time-Transformer,TimeGAN,Time-Transformer,TimeGAN,TimeGAN,Time-Transformer


In [18]:
rankings

Unnamed: 0_level_0,DS,DS,DS,DS,DS,DS,PS,PS,PS,PS,...,DTW,DTW,DTW,DTW,Time,Time,Time,Time,Time,Time
Dataset,D2,D3,D4,D5,D6,D7,D2,D3,D4,D5,...,D4,D5,D6,D7,D2,D3,D4,D5,D6,D7
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
TTS-GAN,5.0,5.0,5.0,5.0,4.0,4.5,5.0,5.0,5.0,2.0,...,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,3.0,3.0
Time-Transformer,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,...,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0
TimeGAN,1.0,2.0,2.0,2.0,4.0,2.0,1.0,2.0,4.0,3.0,...,4.0,4.0,4.0,2.0,4.0,5.0,5.0,4.0,4.0,5.0
TimeVQVAE,3.0,3.0,4.0,3.0,2.0,4.5,3.0,3.0,2.0,5.0,...,2.0,3.0,1.0,4.0,3.0,2.0,2.0,2.0,2.0,2.0
TransFusion,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,3.0,2.0,3.0,1.0,2.0,3.0,3.0,3.0,5.0,4.0


In [32]:
df = df_no_copy.unstack(level=1).rank()
utility_cols = [col for col in df.columns if col[0] in ["PS"]]

fidelity_cols = [col for col in df.columns if col[0] not in ["PS", "Time"]]

timing_cols = [col for col in df.columns if col[0] in ["Time"]]

avg_utility = df[utility_cols].mean(axis=1)
avg_fidelity = df[fidelity_cols].mean(axis=1)
avg_timing = df[timing_cols].mean(axis=1)

new_df = pd.DataFrame({
    "Utility": avg_utility,
    "Fidelity": avg_fidelity,
    "Time": avg_timing
})

new_df.style.background_gradient(cmap=cm)

Unnamed: 0_level_0,Utility,Fidelity,Time
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TTS-GAN,4.5,4.90625,4.0
Time-Transformer,3.333333,2.979167,1.0
TimeGAN,2.5,2.875,4.5
TimeVQVAE,3.333333,2.46875,2.166667
TransFusion,1.333333,1.770833,3.333333


In [35]:
timings.set_index("Method").drop(index="JustCopy")

Unnamed: 0_level_0,D2,D3,D4,D5,D6,D7
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TimeGAN,259,1373,1195,304,1252,1385
Time-Transformer,13,16,28,66,77,55
TransFusion,43,222,454,201,1387,856
TTS-GAN,646,660,649,649,660,645
TimeVQVAE,73,94,94,74,88,86
