In [1]:
"""
This notebook is used to find the relation between LID values, estimated using Flextok's flow matching model, and the 
reconstruction error.
"""

"\nThis notebook is used to find the relation between LID values, estimated using Flextok's flow matching model, and the \nreconstruction error.\n"

In [1]:
"""
Objective: Plot reconstruction loss versus LID (Local Intrinsic Dimensionality) for fixed k values.
This mirrors the edge-ratio analysis and assumes:
- Reconstruction metrics are stored in ../data/datasets/reconstruction_loss_imgnet_train/reconstruction_errors_all.json
- LID values are stored as an ordered list aligned with dataset indices, e.g.:
  ../data/datasets/imageNet_LID_values/train_imageNet_lid_values.json
If your LID path differs, update LID_JSON below.
"""
import json
import numpy as np
import matplotlib.pyplot as plt
import os, sys

sys.path.append(os.path.abspath("../.."))  # allow relative imports if needed
print(os.path.abspath(".."))
from data.utils.dataloaders import ReconstructionDataset_Heuristic


/BS/data_mani_compress/work/thesis/thesis/notebooks


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths (adjust if your files differ)
RECON_JSON = "/BS/data_mani_compress/work/thesis/thesis/data/datasets/imagenet_reconstruction_losses/val_categorized/all_losses.json"

# Load reconstruction dataset (list of dicts with fields: image_id, k_value, vgg_error, mse_error, ...)
with open(RECON_JSON, 'r') as f:
    reconstruction_dataset = json.load(f)

print('Sizes:', len(reconstruction_dataset))

Sizes: 450000


In [2]:
k_values = [1,2,4,8,16,32,64,128,256]
best_lids_per_k = {}
for k in k_values:
    LID_JSON   = "/BS/data_mani_compress/work/thesis/thesis/data/datasets/imageNet_LID_values/flextok_based/original_images/train/t_0.32/combined.json"
    # Load LID values; we assume it's ordered so that index i corresponds to image 
    with open(LID_JSON, 'r') as f:
        lid_values = json.load(f)
    paired = [lid_values[i] for i in range(0, len(lid_values), 2)]
    print(len(paired), paired[0])
    best_lids_per_k[k] = paired

640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]
640584 [23166.384765625, 10963.4345703125]


In [6]:
k_values = [1,2,4,8,16,32,64,128,256]
best_lids_per_k = {}
for k in k_values:
    LID_JSON   = "/BS/data_mani_compress/work/thesis/thesis/data/datasets/imageNet_LID_values/flextok_based/original_images/val/t_0.32/lid_0000_0329.json"
    # Load LID values; we assume it's ordered so that index i corresponds to image 
    with open(LID_JSON, 'r') as f:
        lid_values = json.load(f)
    paired = [lid_values[i:i+2] for i in range(0, len(lid_values), 2)]
    print(len(paired), paired[0])
    best_lids_per_k[k] = paired

50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]
50000 [19562.119140625, 7954.99072265625]


In [7]:
ds = ReconstructionDataset_Heuristic(
        reconstruction_data=reconstruction_dataset,  
        lid_information=best_lids_per_k,
        error_key=["LPIPS"]
    )

In [8]:
for k in k_values:
    lids = []
    errors = []
    for i in range(len(ds)):
        item = ds[i]
        if item['k_value'] == k:
            lids.append(item['lid'][0])  
            errors.append(item['LPIPS'])
    
    # find correlation
    correlation = np.corrcoef(lids, errors)[0, 1]
    print(f'K={k}: Correlation between LID and LPIPS: {correlation:.4f}')

K=1: Correlation between LID and LPIPS: -0.0366
K=2: Correlation between LID and LPIPS: -0.1282
K=4: Correlation between LID and LPIPS: -0.3118
K=8: Correlation between LID and LPIPS: -0.3613
K=16: Correlation between LID and LPIPS: -0.4366
K=32: Correlation between LID and LPIPS: -0.5275
K=64: Correlation between LID and LPIPS: -0.6148
K=128: Correlation between LID and LPIPS: -0.7145
K=256: Correlation between LID and LPIPS: -0.8031
