In [1]:
import os

# Go to the project root if needed: when the kernel starts inside the
# "notebooks" folder, step up one level so the relative paths used below
# (e.g. "data/raw/...") resolve.
print(f"Current working directory: {os.getcwd()}")
# os.path.basename honours the platform's path separator, whereas splitting
# on "/" never matches on Windows-style paths.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")
    print(f"Changed working directory to: {os.getcwd()}")

Current working directory: /Users/gabriel.torres/Nextcloud/Development/Pro5D/FlareSense/notebooks
Changed working directory to: /Users/gabriel.torres/Nextcloud/Development/Pro5D/FlareSense


In [2]:
import torch
import src.utils.data as data

from torchvision import transforms

# Settings
# Seed PyTorch's global random number generator with a fixed value.
SEED = 0
torch.manual_seed(SEED)

<torch._C.Generator at 0x1164d2d90>

In [3]:
# Unzip data/raw/burst_images.zip if it has not been extracted yet.
if not os.path.exists("data/raw/burst_images"):
    # os.system returns the command's exit status; 0 means unzip succeeded.
    # The '__MACOSX/*' entries are excluded from extraction.
    exit_status = os.system(
        "unzip -q data/raw/burst_images.zip -x '__MACOSX/*' -d data/raw/burst_images/"
    )
    if exit_status == 0:
        print("unzipped data/raw/burst_images.zip to data/raw/burst_images")
    else:
        # Report the failure instead of claiming success.
        print(f"unzipping data/raw/burst_images.zip failed (exit status {exit_status})")
else:
    print("data/raw/burst_images already exists, skipping unzipping")

data/raw/burst_list already exists, skipping unzipping


Loading data

In [4]:
# Build the DataModule (split into training, validation and test data).
data_folder_path = "data/raw/burst_images/"

# Transform pipeline: a single resize step to (193, 240) with antialiasing.
resize_pipeline = transforms.Compose([transforms.Resize((193, 240), antialias=True)])

data_module = data.ECallistoDataModule(
    data_folder=data_folder_path,
    transform=resize_pipeline,
    batch_size=32,
    num_workers=0,
    val_ratio=0.2,  # requested validation ratio
    test_ratio=0.2,  # requested test ratio
)
data_module.setup()

In [5]:
# Pull a single batch from the training loader; a batch unpacks into
# (data, filenames, labels).
train_loader = data_module.train_dataloader()
batch_data, batch_filenames, batch_labels = next(iter(train_loader))

# Keep the first sample of the batch for inspection.
# NOTE(review): despite their names, the "timestamp" variable holds the
# sample's filename and the "folder number" variable holds its label.
first_data_in_batch, first_timestamp_in_batch, first_folder_number_in_batch = (
    batch_data[0],
    batch_filenames[0],
    batch_labels[0],
)

for caption, value in (
    ("First data in batch:", first_data_in_batch),
    ("Timestamp of first data:", first_timestamp_in_batch),
    ("Folder number of data:", first_folder_number_in_batch),
):
    print(caption, value)

First data in batch: tensor([[ 56,  54,  54,  ...,  56,  56,  56],
        [126, 151, 100,  ..., 151, 179, 165],
        [147, 144, 118,  ..., 147, 162, 155],
        ...,
        [  7,   3,   7,  ...,   3,   3,   7],
        [  0,   3,   3,  ...,   0,   3,   7],
        [  3,   7,   3,  ...,   3,   0,   3]], dtype=torch.uint8)
Timestamp of first data: 2022-06-08 00-55-00_2022-06-08 00-56-00_australia_assa_62_None_no_burst.png
Folder number of data: no_burst


Get samples per class for each dataloader

In [6]:
def count_samples_per_class(dataloader):
    """Count how many samples of each label ``dataloader.dataset`` contains.

    Args:
        dataloader: Object whose ``dataset`` attribute can be iterated and
            yields 3-item samples with the label in the last position.

    Returns:
        dict: Maps each label to its number of samples, with the entries
        inserted in sorted label order.
    """
    tally = {}
    for sample in dataloader.dataset:
        # Only the third element (the label) matters for counting.
        _, _, class_label = sample
        tally[class_label] = tally.get(class_label, 0) + 1
    ordered_entries = sorted(tally.items())
    return dict(ordered_entries)

# Report the label distribution of each split, one dataloader at a time.
train_distribution = count_samples_per_class(data_module.train_dataloader())
print(f"Distribution of classes in training set: {train_distribution}")

val_distribution = count_samples_per_class(data_module.val_dataloader())
print(f"Distribution of classes in validation set: {val_distribution}")

test_distribution = count_samples_per_class(data_module.test_dataloader())
print(f"Distribution of classes in test set: {test_distribution}")

Distribution of classes in training set: {'2': 58, '3': 204, '4': 9, '5': 4, '6': 440, 'no_burst': 51170}
Distribution of classes in validation set: {'2': 18, '3': 67, '4': 3, '6': 146, 'no_burst': 17056}
Distribution of classes in test set: {'2': 18, '3': 67, '4': 3, '6': 146, 'no_burst': 17056}


Cross-check the totals against the number of samples in each dataset split

In [7]:
# Number of samples held by each dataset split of the data module.
train_size = len(data_module.train_dataset)
print(f"Train Dataset Length: {train_size}")

val_size = len(data_module.val_dataset)
print(f"Val Dataset Length: {val_size}")

test_size = len(data_module.test_dataset)
print(f"Test Dataset Length: {test_size}")

Train Dataset Length: 51885
Val Dataset Length: 17290
Test Dataset Length: 17290
