In [None]:
import glob
import json

# Data Preparation

In [None]:
# One JSON result file per run directory below ../logs/h5/.
log_pattern = "../logs/h5/*/*.json"

# `experiments` keeps the experiment names in first-seen order and `data`
# maps each name to its loaded results; both are filled by the loading loop.
experiments, data = [], {}
files = glob.glob(log_pattern)

In [None]:
# Quality / rate settings as they appear as whole dash-separated tokens in a
# run directory name, grouped by compression level. Levels are tested in this
# order, which keeps the precedence of the former if/elif chain.
level_tokens = {
    "low": {"10", "1", "0.1"},
    "med": {"50", "3", "4", "0.05"},
    "high": {"85", "6", "8", "0.01"},
}

for file in files:
    dir_name, file_name = file.split("/")[-2:]
    dir_parts = dir_name.split("-")

    # Experiment name = first three dash-separated parts of the directory,
    # extended by markers found in the file name.
    exp_name = "-".join(dir_parts[:3])
    if "cuda" in file_name:
        exp_name += "-cuda"
    if "full_dataset" in file_name:
        exp_name += "-full"

    if exp_name not in experiments:
        experiments.append(exp_name)
        data[exp_name] = {}

    # Match complete tokens rather than substrings: a substring test such as
    # `"-1" in dir_name` also accepts "-15" or "-100", silently filing an
    # unknown setting under "low" instead of reporting it. Only tokens that
    # follow a dash are considered, as in the substring patterns.
    setting_tokens = set(dir_parts[1:])
    level = next(
        (name for name, tokens in level_tokens.items() if tokens & setting_tokens),
        None,
    )
    if level is None:
        raise ValueError(f"{dir_name} does not fit naming scheme.")

    with open(file, "r") as f:
        data[exp_name][level] = json.load(f)

### Dataset sizes
Sizes are in bytes, measured with `du -d 1 -b data-ssd/imagenet-5/ | sort -rh`.


In [None]:
# On-disk dataset sizes. "raw" is a single number; every other entry maps a
# compression level ("high" / "med" / "low") to the size of that variant.
dataset_size = {
    "raw": 703498109,
    # Classical image codecs
    "jpeg": {"high": 295403558, "med": 139170327, "low": 51858450},
    "webp": {"high": 237491660, "med": 106598158, "low": 44723084},
    # Learned codecs
    "bmshj2018_factorized-mse-mp": {"high": 332744016, "med": 73175328, "low": 23734012},
    "bmshj2018_factorized-mse": {"high": 78067476, "med": 19933952, "low": 7126100},
    "bmshj2018_hyperprior-mse": {"high": 68361296, "med": 19188384, "low": 6993100},
    "mbt2018_mean-mse": {"high": 67897956, "med": 19405612, "low": 7007120},
    "mbt2018-mse": {"high": 63074772, "med": 16761428, "low": 5325648},
    "cheng2020_anchor-mse": {"high": 29952300, "med": 11134628, "low": 5736252},
    "cheng2020_attn-mse": {"high": 29875688, "med": 10596044, "low": 5377020},
    "lossyless": {"high": 2715924, "med": 1620720, "low": 1259204},
}

# Analysis

## Compression Factor

In [None]:
# Emit one LaTeX table row per entry: the ratio of the raw dataset size to the
# compressed size at the low, med and high settings, rounded to whole numbers.
for key in dataset_size.keys():
    # "raw" is the reference itself; keys containing "mp" are left out here.
    if key != "raw" and "mp" not in key:
        print(f"{key:30}", end="")
        # The last cell carries the line break.
        for level, cell_end in (("low", " & "), ("med", " & "), ("high", " & \n")):
            print(f"${dataset_size['raw'] / dataset_size[key][level]:.0f}\\times$", end=cell_end)

jpeg                          $14\times$ & $5\times$ & $2\times$ & 
webp                          $16\times$ & $7\times$ & $3\times$ & 
bmshj2018_factorized-mse      $99\times$ & $35\times$ & $9\times$ & 
bmshj2018_hyperprior-mse      $101\times$ & $37\times$ & $10\times$ & 
mbt2018_mean-mse              $100\times$ & $36\times$ & $10\times$ & 
mbt2018-mse                   $132\times$ & $42\times$ & $11\times$ & 
cheng2020_anchor-mse          $123\times$ & $63\times$ & $23\times$ & 
cheng2020_attn-mse            $131\times$ & $66\times$ & $24\times$ & 
lossyless                     $559\times$ & $434\times$ & $259\times$ & 


#### Multiprocessing vs data loader comparison

In [None]:
# Compression factors (raw size / compressed size, two decimals) for the two
# bmshj2018_factorized dataset variants, printed as table rows.
for key in ("bmshj2018_factorized-mse", "bmshj2018_factorized-mse-mp"):
    print(f"{key:30}", end="")
    # The last cell carries the line break.
    for level, cell_end in (("low", " & "), ("med", " & "), ("high", " & \n")):
        print(f"{dataset_size['raw'] / dataset_size[key][level]:.2f}", end=cell_end)

bmshj2018_factorized-mse      98.72 & 35.29 & 9.01 & 
bmshj2018_factorized-mse-mp   29.64 & 9.61 & 2.11 & 


## Encoding Time

In [None]:
# Encoding runs to report: names containing "enc" and one of the markers
# below, excluding the full-dataset runs.
encoder_markers = ("jpeg", "webp", "cuda", "lossyless")
encoding_runs = [
    exp
    for exp in experiments
    if "enc" in exp
    and any(marker in exp for marker in encoder_markers)
    and "full" not in exp
]
encoding_runs.sort()

# One table row per run: total_time at low / med / high, rounded to integers.
for exp in encoding_runs:
    print(f"{exp:35}", end="")
    for level, cell_end in (("low", " & "), ("med", " & "), ("high", " & \n")):
        print(f"{data[exp][level]['total_time']:.0f}", end=cell_end)

enc-dl-bmshj2018_factorized-cuda   54 & 64 & 122 & 
enc-dl-bmshj2018_hyperprior-cuda   93 & 99 & 127 & 
enc-dl-cheng2020_anchor-cuda       3516 & 3481 & 3516 & 
enc-dl-cheng2020_attn-cuda         3494 & 3656 & 3564 & 
enc-dl-lossyless                   38 & 42 & 41 & 
enc-dl-mbt2018-cuda                3522 & 3540 & 3775 & 
enc-dl-mbt2018_mean-cuda           95 & 100 & 127 & 
enc-mp-jpeg                        4 & 5 & 4 & 
enc-mp-webp                        6 & 5 & 7 & 


#### CUDA vs CPU comparison

In [None]:
# Encoding runs whose names contain none of these markers; what remains are
# the plain and "-cuda" variants that the table puts side by side.
excluded_markers = ("mp", "jpeg", "webp", "lossyless", "full")
cuda_cpu_runs = [
    exp
    for exp in experiments
    if "enc" in exp and not any(marker in exp for marker in excluded_markers)
]
cuda_cpu_runs.sort()

# One table row per run: total_time at low / med / high, rounded to integers.
for exp in cuda_cpu_runs:
    print(f"{exp:35}", end="")
    for level, cell_end in (("low", " & "), ("med", " & "), ("high", " & \n")):
        print(f"{data[exp][level]['total_time']:.0f}", end=cell_end)

enc-dl-bmshj2018_factorized        138 & 147 & 251 & 
enc-dl-bmshj2018_factorized-cuda   54 & 64 & 122 & 
enc-dl-bmshj2018_hyperprior        237 & 246 & 373 & 
enc-dl-bmshj2018_hyperprior-cuda   93 & 99 & 127 & 
enc-dl-cheng2020_anchor            5654 & 6257 & 8448 & 
enc-dl-cheng2020_anchor-cuda       3516 & 3481 & 3516 & 
enc-dl-cheng2020_attn              6441 & 6177 & 7543 & 
enc-dl-cheng2020_attn-cuda         3494 & 3656 & 3564 & 
enc-dl-mbt2018                     8229 & 7219 & 13328 & 
enc-dl-mbt2018-cuda                3522 & 3540 & 3775 & 
enc-dl-mbt2018_mean                291 & 297 & 519 & 
enc-dl-mbt2018_mean-cuda           95 & 100 & 127 & 


#### Multiprocessing vs data loader comparison

In [None]:
# Encoding time of bmshj2018_factorized for the three run variants named
# below, printed in sorted name order.
factorized_runs = [
    "enc-mp-bmshj2018_factorized",
    "enc-dl-bmshj2018_factorized",
    "enc-dl-bmshj2018_factorized-cuda",
]
factorized_runs.sort()

for exp in factorized_runs:
    print(f"{exp:35}", end="")
    # Rows in this table end right after the last value.
    for level, cell_end in (("low", " & "), ("med", " & "), ("high", "\n")):
        print(f"{data[exp][level]['total_time']:.0f}", end=cell_end)

enc-dl-bmshj2018_factorized        138 & 147 & 251
enc-dl-bmshj2018_factorized-cuda   54 & 64 & 122
enc-mp-bmshj2018_factorized        838 & 826 & 1507


## Decoding Time

In [None]:
# Decoding runs: names containing "dec", excluding the full-dataset runs.
decoding_runs = [exp for exp in experiments if "dec" in exp and "full" not in exp]
decoding_runs.sort()

# One table row per run: total_time at low / med / high, rounded to integers.
for exp in decoding_runs:
    print(f"{exp:35}", end="")
    # Rows in this table end with a single space after the last value.
    for level, cell_end in (("low", " & "), ("med", " & "), ("high", " \n")):
        print(f"{data[exp][level]['total_time']:.0f}", end=cell_end)

dec-dl-bmshj2018_factorized        1392 & 1727 & 1597 
dec-dl-bmshj2018_hyperprior        1753 & 1907 & 2130 
dec-dl-lossyless                   24 & 27 & 27 
dec-dl-mbt2018_mean                1843 & 1871 & 2229 
dec-mp-jpeg                        5 & 4 & 4 
dec-mp-webp                        3 & 4 & 5 


#### Full dataset encoding/decoding

In [None]:
# Full-dataset runs (names containing "full"), encoding and decoding alike.
full_runs = [exp for exp in experiments if "full" in exp]
full_runs.sort()

# One table row per run: total_time at low / med / high, rounded to integers.
for exp in full_runs:
    print(f"{exp:35}", end="")
    for level, cell_end in (("low", " & "), ("med", " & "), ("high", " & \n")):
        print(f"{data[exp][level]['total_time']:.0f}", end=cell_end)

dec-dl-lossyless-full              2738 & 2758 & 2909 & 
dec-mp-jpeg-full                   10 & 11 & 12 & 
dec-mp-webp-full                   14 & 17 & 17 & 
enc-dl-lossyless-full              1609 & 1653 & 1726 & 
enc-mp-jpeg-full                   105 & 111 & 116 & 
enc-mp-webp-full                   258 & 291 & 336 & 
