In [1]:
import torch

def print_dataset_info(path):
    # Indicate which dataset is being analyzed (just last part of the path)
    print("-" * 40)
    print("Analyzing dataset:", path.split("/")[-1])
    d = torch.load(path, map_location="cpu", weights_only=False)
    print("keys:", list(d.keys()))
    # for each key, print type and shape
    for key in d.keys():
        if isinstance(d[key], torch.Tensor):
            print(f"{key} shape: {d[key].shape}, dtype: {d[key].dtype}")
        else:
            print(f"{key} type: {type(d[key])}")
    # if possible and key exists, print uniques of the keys
    for key in d.keys():
        if isinstance(d[key], torch.Tensor):
            # if unique is > 150, just print number of uniques
            if torch.unique(d[key]).numel() <= 150:
                print(f"unique {key}:", torch.unique(d[key]))
            else:
                print(f"unique {key} count:", torch.unique(d[key]).numel())
        if isinstance(d[key], dict):
            print(f"{key} is a dict with keys:", list(d[key].keys()))
    # if possible print data statistics and also say if there are nans (exclude for statistics but warn if present)
    for key in d.keys():
        if isinstance(d[key], torch.Tensor) and d[key].dtype in [torch.float32, torch.float64]:
            if torch.isnan(d[key]).any():
                print(f"Warning: {key} contains NaNs")
            print(f"{key} min/max/mean/std:", float(d[key].min()), float(d[key].max()), float(d[key].mean()), float(d[key].std()))
    print("\n")
    print("-" * 40)
    


In [8]:
path = "/home/burger/canWeReally/data/processed_data/sleepedfx_data.pt"
print_dataset_info(path)


path = "/home/burger/canWeReally/data/processed_data/sleep_eeg_labram_features.pt"
print_dataset_info(path)

----------------------------------------
Analyzing dataset: sleepedfx_data.pt
keys: ['data', 'subjects', 'tasks', 'runs', 'labels', 'fold_info']
data shape: torch.Size([195632, 64, 240]), dtype: torch.float32
subjects shape: torch.Size([195632]), dtype: torch.int64
tasks shape: torch.Size([195632]), dtype: torch.int64
runs shape: torch.Size([195632]), dtype: torch.int64
labels shape: torch.Size([195632]), dtype: torch.int64
fold_info type: <class 'dict'>
unique data: tensor([-4.5384, -4.5383, -4.5381,  ...,  5.3019,  5.3029,  5.3044])
unique subjects: tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
        55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74,
        75, 76, 77, 80, 81, 82])
unique tasks: tensor([1, 2, 3, 4, 5])
unique runs: tensor([1, 2])
unique labels: tensor([

In [None]:
path = "/home/burger/canWeReally/data/processed_data/MI_eeg.pt"
print_dataset_info(path)

path = "/home/burger/canWeReally/data/processed_data/MI_eeg_481_labram_features.pt"
print_dataset_info(path)

----------------------------------------
Analyzing dataset: MI_eeg.pt
keys: ['X', 'y', 'ch_names', 'runs', 'subjects']
X shape: torch.Size([38045, 64, 481]), dtype: torch.float32
y shape: torch.Size([38045]), dtype: torch.int64
ch_names type: <class 'list'>
runs shape: torch.Size([38045]), dtype: torch.int64
subjects shape: torch.Size([38045]), dtype: torch.int64
unique X: tensor([-0.0010, -0.0010, -0.0010,  ...,  0.0010,  0.0010,  0.0011])
unique y: tensor([0, 1, 2, 3, 4])
unique runs: tensor([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])
unique subjects: tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
         29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
         57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
         71,  72,  73,  74,  75,  76,  77,  

In [2]:
path = "/home/burger/canWeReally/data/processed_data/MI_eeg.pt"
#print_dataset_info(path)

path = "/home/burger/canWeReally/data/processed_data/MI_eeg_labram_features.pt"
#print_dataset_info(path)

path = "/home/burger/canWeReally/data/processed_data/MI_eeg_cbramod_features.pt"
print_dataset_info(path)

----------------------------------------
Analyzing dataset: MI_eeg_cbramod_features.pt
keys: ['features', 'labels', 'subjects', 'runs']
features shape: torch.Size([38045, 200]), dtype: torch.float32
labels shape: torch.Size([38045]), dtype: torch.int64
subjects shape: torch.Size([38045]), dtype: torch.int64
runs shape: torch.Size([38045]), dtype: torch.int64
unique features: tensor([-1.1363, -1.1313, -1.1290,  ...,  1.0748,  1.0817,  1.0837])
unique labels: tensor([0, 1, 2, 3, 4])
unique subjects: tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
         29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
         57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
         71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  89,  

In [2]:
path = "/home/burger/canWeReally/data/processed_data/full_erp_data.pt"
print_dataset_info(path)

----------------------------------------
Analyzing dataset: full_erp_data.pt
keys: ['data', 'subjects', 'tasks', 'runs', 'labels', 'data_mean', 'data_std']
data shape: torch.Size([77417, 30, 256]), dtype: torch.float64
subjects shape: torch.Size([77417]), dtype: torch.int64
tasks shape: torch.Size([95858]), dtype: torch.float64
runs shape: torch.Size([77417]), dtype: torch.int64
labels type: <class 'dict'>
data_mean shape: torch.Size([1, 30, 256]), dtype: torch.float64
data_std shape: torch.Size([1, 30, 256]), dtype: torch.float64
unique data count: 126206655
unique subjects: tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40])
unique tasks count: 18455
unique runs: tensor([0, 1, 2, 3, 4, 5, 6])
labels is a dict with keys: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
unique data_mean count: 7680
unique data_std count: 7680
data min/max/mean/std: -0.00535

In [5]:
# inspect pretrained weights shapes etc. 
path = "/home/burger/canWeReally/weights/cbramod_pretrained_weights.pth"
print_dataset_info(path)

----------------------------------------
Analyzing dataset: cbramod_pretrained_weights.pth
keys: ['patch_embedding.mask_encoding', 'patch_embedding.positional_encoding.0.weight', 'patch_embedding.positional_encoding.0.bias', 'patch_embedding.proj_in.0.weight', 'patch_embedding.proj_in.0.bias', 'patch_embedding.proj_in.1.weight', 'patch_embedding.proj_in.1.bias', 'patch_embedding.proj_in.3.weight', 'patch_embedding.proj_in.3.bias', 'patch_embedding.proj_in.4.weight', 'patch_embedding.proj_in.4.bias', 'patch_embedding.proj_in.6.weight', 'patch_embedding.proj_in.6.bias', 'patch_embedding.proj_in.7.weight', 'patch_embedding.proj_in.7.bias', 'patch_embedding.spectral_proj.0.weight', 'patch_embedding.spectral_proj.0.bias', 'encoder.layers.0.self_attn_s.in_proj_weight', 'encoder.layers.0.self_attn_s.in_proj_bias', 'encoder.layers.0.self_attn_s.out_proj.weight', 'encoder.layers.0.self_attn_s.out_proj.bias', 'encoder.layers.0.self_attn_t.in_proj_weight', 'encoder.layers.0.self_attn_t.in_proj_bi

In [5]:
def print_dataset_info(path):
    print("-" * 40)
    print("Analyzing dataset:", path.split("/")[-1])
    d = torch.load(path, map_location="cpu", weights_only=False)
    print("keys:", list(d.keys()))
    
    # Print info based on available keys
    for key, value in d.items():
        print(f"\nKey: {key}")
        if isinstance(value, torch.Tensor):
            print(f"Type: {type(value)}")
            print(f"Shape: {value.shape}")
            if value.dtype in [torch.float32, torch.float64]:
                print(f"Min/Max/Mean/Std: {float(value.min()):.3f}/{float(value.max()):.3f}/{float(value.mean()):.3f}/{float(value.std()):.3f}")
            if value.dtype in [torch.int32, torch.int64, torch.long]:
                try:
                    unique_vals = torch.unique(value)
                    if len(unique_vals) < 10:  # Only print if not too many unique values
                        print(f"Unique values: {unique_vals.tolist()}")
                    else:
                        print(f"Number of unique values: {len(unique_vals)}")
                except:
                    pass
        else:
            print(f"Type: {type(value)}")
            if hasattr(value, 'shape'):
                print(f"Shape: {value.shape}")

        # investigate key "tasks" better
        if key == "tasks":
            print(f"Unique tasks: {torch.unique(value)}")

    print("-" * 40)

# Print info for ERP dataset
path = "/home/burger/canWeReally/data/processed_data/full_erp_data.pt"
print_dataset_info(path)

----------------------------------------
Analyzing dataset: full_erp_data.pt
keys: ['data', 'subjects', 'tasks', 'runs', 'labels', 'data_mean', 'data_std']

Key: data
Type: <class 'torch.Tensor'>
Shape: torch.Size([77417, 30, 256])
Min/Max/Mean/Std: -0.005/0.006/0.000/0.000

Key: subjects
Type: <class 'torch.Tensor'>
Shape: torch.Size([77417])
Number of unique values: 40

Key: tasks
Type: <class 'torch.Tensor'>
Shape: torch.Size([95858])
Min/Max/Mean/Std: nan/nan/nan/nan
Unique tasks: tensor([0., 1., 2.,  ..., nan, nan, nan], dtype=torch.float64)

Key: runs
Type: <class 'torch.Tensor'>
Shape: torch.Size([77417])
Unique values: [0, 1, 2, 3, 4, 5, 6]

Key: labels
Type: <class 'dict'>

Key: data_mean
Type: <class 'torch.Tensor'>
Shape: torch.Size([1, 30, 256])
Min/Max/Mean/Std: -0.000/0.000/0.000/0.000

Key: data_std
Type: <class 'torch.Tensor'>
Shape: torch.Size([1, 30, 256])
Min/Max/Mean/Std: 0.000/0.000/0.000/0.000
----------------------------------------
