### Notebook to get the sizes of different datasets
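This notebook loads the training targets of different datasets and computes their sizes (number of samples), together with the number of classes and the class-imbalance ratio of each dataset.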

In [None]:
import torch
from collections import defaultdict
import pandas as pd
from ..constants import BASE_PATH_PROJECT

In [None]:
# Gather the training targets for every hit of the glob below, stored as
# seen_ds[<grandparent directory name>][<parent directory name>].
# Presumably the grandparent directory is the dataset name -- verify against the
# on-disk layout of the features folder.
seen_ds = defaultdict(dict)
base_path = BASE_PATH_PROJECT / "features"
for path in base_path.rglob("vit_base_patch16_224_cls/targets_train.pt"):
    # NOTE(review): the 8 counts *all* components of the path, including those that
    # come from BASE_PATH_PROJECT, so this depth filter depends on where the project
    # is located on disk.
    if len(path.parts) == 8:
        print(path)
        ds_name, feature_name = path.parts[-3:-1]
        seen_ds[ds_name][feature_name] = torch.load(path)

In [None]:
# Show how many entries were collected and the keys they are stored under.
len(seen_ds), seen_ds.keys()

In [None]:
# Per-dataset statistics derived from the training targets; every dict is keyed
# by the same keys as seen_ds.
ds_dist = {}        # torch.bincount of the targets (samples per class id)
class_weights = {}  # total / (number of bins * count); 0 where the count is 0
nr_samples = {}     # number of target entries
nr_classes = {}     # number of bins in the bincount result
for ds, ds_data in seen_ds.items():
    targets = ds_data['vit_base_patch16_224_cls']
    counts = torch.bincount(targets)
    n_bins = len(counts)

    ds_dist[ds] = counts
    nr_samples[ds] = int(targets.shape[0])
    nr_classes[ds] = n_bins

    # The division yields inf for empty bins; torch.where replaces those with 0.
    inverse_freq = counts.sum() / (n_bins * counts)
    class_weights[ds] = torch.where(counts == 0, 0, inverse_freq)
    

In [None]:
# Imbalance ratio per dataset: size of the largest class divided by the size of
# the smallest class that actually occurs in the targets.
imbalance_ratios = []
for ds, class_counts in ds_dist.items():
    # torch.bincount emits a 0 for every label id below the maximum that never
    # occurs. Dividing by such a 0 would report an infinite ratio, so only the
    # populated bins take part in the computation.
    populated = class_counts[class_counts > 0]
    if populated.numel() == 0:
        # No samples at all: the ratio is undefined rather than an error.
        ratio = float("nan")
    else:
        ratio = (populated.max() / populated.min()).item()
    imbalance_ratios.append((ds, ratio))

In [None]:
# Turn the (dataset, ratio) pairs into a table sorted by ascending ratio and
# indexed by dataset.
imbalance_ratios = (
    pd.DataFrame(imbalance_ratios, columns=['dataset', 'imbalance_ratio'])
    .sort_values('imbalance_ratio')
    .set_index('dataset')
)

In [None]:
# Attach the per-dataset sample and class counts as extra columns.
# Assigning a Series aligns it on the 'dataset' index and keeps the counts as
# integers; creating the columns through .loc first fills them with NaN, which
# turns every count into a float.
imbalance_ratios['nr_samples'] = pd.Series(nr_samples)
imbalance_ratios['nr_classes'] = pd.Series(nr_classes)

In [None]:
# Display the resulting per-dataset table.
imbalance_ratios