In [1]:
import os
import shutil
import warnings
import csv
import yaml
import json
import torch

from PIL import Image
import pandas as pd
import numpy as np
from typing import Any, Sequence

from os import PathLike
from torch.utils.data import Dataset

from megadetector.detection.run_detector import load_detector, model_string_to_model_version
from megadetector.detection.run_detector_batch import process_images, write_results_to_file


from sklearn.model_selection import train_test_split

In [15]:
import numpy as np


In [1]:
import random
import torch

from pathlib import Path
from PIL import Image

from torchvision.transforms import v2
from torch.utils.data import DataLoader

from ba_dev.dataset import MammaliaData, MammaliaDataImage, MammaliaDatasetFeatureStats
from ba_dev.transform import ImagePipeline, BatchImagePipeline
from ba_dev.utils import load_config_yaml

paths = load_config_yaml('../path_config.yml')


### Running Tests

In [3]:
path_to_dataset = paths['dataset']
path_labelfiles = paths['test_labels']
path_to_detector_output = paths['md_output']
detector_model='mdv5a'
mode='train'

dataset = MammaliaDataImage(
    path_labelfiles=path_labelfiles,
    path_to_dataset=path_to_dataset,
    path_to_detector_output=path_to_detector_output,
    detector_model=detector_model,
    mode=mode,
)

8 sequences had no detections and will be excluded.
Excluded sequences: [6000161, 6000163, 6000293, 6000530, 6000691, 6000372, 6000953, 6000186]


In [4]:
pipline = ImagePipeline(
                path_to_dataset=path_to_dataset,
                pre_ops = [
                    ('to_rgb', {}),
                    ('crop_by_bb', {})
                ],
                transform = v2.Compose([
                                v2.ToImage(),
                                v2.ToDtype(torch.float32, scale=True),
                                v2.Resize((224, 224)),
                                ])
                )
                

In [5]:
row = dataset[77]

image = pipline(row['file_path'], row['bbox'])

print(image.shape)

torch.Size([3, 224, 224])


In [6]:
batch_pipline = BatchImagePipeline(
                path_to_dataset=path_to_dataset,
                num_workers=4,
                pre_ops = [
                    ('to_rgb', {}),
                    ('crop_by_bb', {}),
                ],
                transform = v2.Compose([
                                v2.ToImage(),
                                v2.ToDtype(torch.float32, scale=True),
                                v2.Resize((224, 224)),
                                ])
                )

In [7]:
list_of_paths = []
list_of_bboxes = []

samples = [random.randint(0, len(dataset)) for _ in range(100)]

for i in samples:
    row = dataset[i]

    list_of_paths.append(row['file_path'])
    list_of_bboxes.append(row['bbox'])

images = batch_pipline(list_of_paths, list_of_bboxes)

for image in images:
    print(image.shape)

IndexError: list index out of range

### Tests Feature Stats

In [8]:
path_to_dataset = paths['dataset']
path_labelfiles = paths['test_labels']
path_to_detector_output = paths['md_output']
detector_model=None
mode='init'

dataset = MammaliaDatasetFeatureStats(
    path_labelfiles=path_labelfiles,
    path_to_dataset=path_to_dataset,
    path_to_detector_output=path_to_detector_output,
    detector_model=detector_model,
    mode=mode,
)

def collate_fn(batch):
    return batch

loader = DataLoader(
    dataset,
    batch_size=10,
    num_workers=1,
    shuffle=False,
    collate_fn=collate_fn
    )

8 sequences had no detections and will be excluded.
Excluded sequences: [6000161, 6000163, 6000293, 6000530, 6000691, 6000372, 6000953, 6000186]


In [19]:
channel_sum = torch.zeros(3)
pixel_count = 0

batches_flat = []

for batch in loader:
    for img in batch:
        pixel_count += img.shape[1] * img.shape[2]
        for c in range(img.shape[0]):
            channel_sum[c] += img[c].sum()

        batches_flat.append(img.flatten(start_dim=1))

mean = channel_sum / pixel_count

channel_diff_squared_sum = torch.zeros(3)
for batch in loader:
    for img in batch:
        img_centered_squared = (img - mean[:, None, None]) ** 2
        for c in range(img_centered_squared.shape[0]):
            channel_diff_squared_sum[c] += img_centered_squared[c].sum()

std = torch.sqrt(channel_diff_squared_sum / pixel_count)

print("Mean:", mean)
print("Std:", std)

Mean: tensor([0.3198, 0.2960, 0.2227])
Std: tensor([0.2246, 0.2083, 0.1730])


In [20]:
torch.cat(batches_flat, -1).mean(-1)

tensor([0.3198, 0.2960, 0.2227])

In [21]:
torch.cat(batches_flat, -1).std(-1)

tensor([0.2246, 0.2083, 0.1730])

In [2]:
path_to_dataset = paths['dataset']
path_labelfiles = paths['test_labels']
path_to_detector_output = paths['md_output']
detector_model=None
mode='init'

dataset = MammaliaData(
    path_labelfiles=path_labelfiles,
    path_to_dataset=path_to_dataset,
    path_to_detector_output=path_to_detector_output,
    detector_model=detector_model,
    mode=mode,
)

8 sequences had no detections and will be excluded.
Excluded sequences: [6000161, 6000163, 6000293, 6000530, 6000691, 6000372, 6000953, 6000186]


In [3]:
dataset.ds_filtered

Unnamed: 0,session,SerialNumber,seq_nr,seq_id,Directory,DateTime_start,DateTime_end,duration_seconds,first_file,last_file,n_files,all_files,label,duplicate_label,label2
0,4,H550HG09194945,233,4007156,sessions/session_04/W2-WK02,2020-06-09T23:21:46Z,2020-06-09T23:22:18Z,32.0,IMG_6154.JPG,IMG_6180.JPG,27,"IMG_6154.JPG,IMG_6155.JPG,IMG_6156.JPG,IMG_615...",apodemus_sp,False,apodemus_sp
1,4,H550HF07158832,180,4011466,sessions/session_04/W5-KH08,2020-06-28T22:25:32Z,2020-06-28T22:25:38Z,6.0,IMG_5446.JPG,IMG_5454.JPG,9,"IMG_5446.JPG,IMG_5447.JPG,IMG_5448.JPG,IMG_544...",apodemus_sp,False,apodemus_sp
2,1,H550HF08161305,229,1001887,sessions/session_01/H550HF08161305_2,2019-09-10T02:06:30Z,2019-09-10T02:07:31Z,61.0,IMG_3034.JPG,IMG_3051.JPG,18,"IMG_3034.JPG,IMG_3035.JPG,IMG_3036.JPG,IMG_303...",apodemus_sp,0.0,apodemus_sp
3,4,H550HF07158933,34,4010684,sessions/session_04/W4-WK02,2020-06-20T23:40:40Z,2020-06-20T23:42:00Z,80.0,IMG_0607.JPG,IMG_0654.JPG,48,"IMG_0607.JPG,IMG_0608.JPG,IMG_0609.JPG,IMG_061...",apodemus_sp,False,apodemus_sp
4,4,H,77,4000175,sessions/session_04/Testwoche1/KH08,2020-05-11T21:16:28Z,2020-05-11T21:16:30Z,2.0,RCNX1125.JPG,RCNX1127.JPG,3,"RCNX1125.JPG,RCNX1126.JPG,RCNX1127.JPG",apodemus_sp,False,apodemus_sp
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,4,H550HF07158839,222,4017495,sessions/session_04/W7-R25,2020-07-16T22:03:52Z,2020-07-16T22:03:52Z,0.0,IMG_3736.JPG,IMG_3738.JPG,3,"IMG_3736.JPG,IMG_3737.JPG,IMG_3738.JPG",sorex_sp,False,soricidae
156,4,H550HG09194886,174,4008312,sessions/session_04/W3-M7,2020-06-18T04:43:02Z,2020-06-18T04:43:02Z,0.0,IMG_3646.JPG,IMG_3648.JPG,3,"IMG_3646.JPG,IMG_3647.JPG,IMG_3648.JPG",crocidura_sp,False,soricidae
157,4,H550HG09194894,161,4015967,sessions/session_04/W6-R26,2020-07-11T22:25:42Z,2020-07-11T22:25:44Z,2.0,IMG_2272.JPG,IMG_2274.JPG,3,"IMG_2272.JPG,IMG_2273.JPG,IMG_2274.JPG",sorex_sp,False,soricidae
158,4,H550HF07158832,147,4014414,sessions/session_04/W6-M2,2020-07-05T05:16:24Z,2020-07-05T05:16:26Z,2.0,IMG_3880.JPG,IMG_3882.JPG,3,"IMG_3880.JPG,IMG_3881.JPG,IMG_3882.JPG",crocidura_sp,False,soricidae


In [8]:
ds = dataset.ds_filtered
test_size = 0.2
n_folds = 5
seed = 55


In [24]:
rng = np.random.default_rng(seed)

for value in ds['label2'].unique():
    ds_selected = ds[ds['label2'] == value]
    length = ds_selected.shape[0]
    indices = rng.permutation(length)

    labels = ds_selected['label2'].tolist()

    labels

    # n_files = ds_selected['n_files'].tolist()[indices]

    
    

In [56]:
ds_selected = ds[ds['label2'] == 'apodemus_sp']
length = ds_selected.shape[0]
indices = rng.permutation(length)

seq_ids = ds_selected['seq_id'].to_numpy()[indices]
seq_lengths = ds_selected['n_files'].to_numpy()[indices]

train_images = int(seq_lengths.sum() * test_size)
fold_images = int(seq_lengths.sum() * (1 - test_size)) // n_folds

split_sizes = [train_images] + [fold_images] * (n_folds)
splits = []
seq_ids_avail = seq_ids.copy()
seq_lengths_avail = seq_lengths.copy()

for i, size in enumerate(split_sizes):
    if len(seq_lengths_avail) == 0:
        break

    total_sum = seq_lengths_avail.sum()
    future_splits = len(split_sizes) - len(splits) - 1
    future_target = future_splits * fold_images

    best_cut = None
    best_error = float('inf')

    for idx in range(1, len(seq_lengths_avail)):
        left = seq_lengths_avail[:idx].sum()
        right = total_sum - left

        error = abs(left - size) + abs(right - future_target)

        if error < best_error:
            best_error = error
            best_cut = idx

    if best_cut is None:
        best_cut = len(seq_lengths_avail)  # last fallback

    splits.append(seq_ids_avail[:best_cut])
    seq_ids_avail = seq_ids_avail[best_cut:]
    seq_lengths_avail = seq_lengths_avail[best_cut:]


In [57]:
split_sizes

[124, 99, 99, 99, 99, 99]

In [58]:
for i in range(len(split_sizes)):    
    sum = 0
    for seq_id in splits[i]:
        n_files = ds_selected[ds_selected['seq_id'] == seq_id]['n_files'].values[0]

        sum += n_files
    print(sum)

123
108
90
105
93
84


In [21]:
ds_selected = ds

ds_selected['label2'].tolist()[indices]

TypeError: only integer scalar arrays can be converted to a scalar index