In [1]:
import os
import warnings
import csv
import yaml
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Any, TypedDict

from megadetector.detection.run_detector import load_detector, model_string_to_model_version
from megadetector.detection.run_detector_batch import process_images, write_results_to_file

from os import PathLike
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split


In [2]:
def load_path_config(path_to_config):
    """
    Load a YAML file that maps names to filesystem locations.

    Parameters
    ----------
    path_to_config : str | PathLike
        Path to the YAML config file. The top level of the document must be
        a mapping of name -> path string.

    Returns
    -------
    dict
        The keys of the config file, each value converted to a ``pathlib.Path``.
        An empty config file yields an empty dict.

    Raises
    ------
    ValueError
        If the top level of the YAML document is not a mapping.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform locale.
    with open(path_to_config, 'r', encoding='utf-8') as f:
        path_config = yaml.safe_load(f)

    # An empty document is parsed as None; treat it as "no paths configured".
    if path_config is None:
        return {}

    if not isinstance(path_config, dict):
        raise ValueError(
            f"The path config {path_to_config} must contain a mapping of names to paths, "
            f"got {type(path_config).__name__}."
        )

    return {k: Path(v) for k, v in path_config.items()}

# Project directories (e.g. paths['dataset']) come from one shared YAML config file.
paths = load_path_config(
    path_to_config='/cfs/earth/scratch/kraftjul/BA/code/path_config.yml',
)

In [3]:
class MegaDetectorRunner:
    """
    A class to run the MegaDetector model on images. Designed to be used on a set of image sequences,
    only loading the model once and running it on all sequences.

    Parameters
    ----------
    model_path : str | PathLike
        Path to the MegaDetector model file. Or a string representing the model version available online.
    confidence : float
        Confidence threshold for the model. Default is 0.25.
    """
    def __init__(
            self, 
            model_path: str | PathLike, 
            confidence: float = 0.25
            ):
        
        self.model = load_detector(str(model_path))
        self.confidence = confidence

    def run_on_images(
            self,
            images: list[PathLike],
            output_file_path: PathLike = None,
            ):

        results = process_images(
            im_files=images,
            detector=self.model,
            confidence_threshold=self.confidence,
            quiet=True
        )

        all_confidences = []

        for r in results:
            r["file"] = r["file"].name

            r["detections"] = [
                det for det in r.get("detections", [])
                if det["category"] == "1"
            ]
        
            all_confidences.extend(det["conf"] for det in r["detections"])

        all_confidences.sort(reverse=True)
        
        if output_file_path is not None:
            with open(output_file_path, "w") as f:
                json.dump(results, f, indent=2)

        return all_confidences      


In [12]:
class MammaliaData(Dataset):
    """
    A class to load and process the Mammalia dataset. It can be uset for the initial detection of the images
    utilizing the MegaDetector model, or for training a custom model for classification on the detected images.
    The dataset is divided into training and testing sets based on the sequence IDs.
    
    Parameters
    ----------
    path_labelfiles : str | PathLike
        Path to the directory containing the label files.
    path_to_dataset : str | PathLike
        Path to the main directory of the dataset, referenced in the labelfiles.
    path_to_detector_output : str | PathLike
        Path to the directory where the detector output is available for training or where the output will be saved
        if in detect mode.
    categories_to_drop : list[str], optional
        By default all non-empty labels are used. To drop certain labels from the dataset, provide a list of labels to drop.
        In detect mode, this parameter is ignored.
    detector_model : str
        Can be either a path to the model file or a string representing the model version available online.
        This parameter is only used in detect mode. It specifies the model version to be used for detection.
        The default is "MDV5A". The parameter is ignored in train and test mode.
    detection_confidence : float
        The detection in detection mode is done with a confidence of 0.25 by default. If for training or testing
        a higher confidence is needed, this parameter can be set to a higher value.
        The default is 0.25.
    sample_length : int
        For trainig this parameter specifies the range (1 - sample_length) of randomly seletded samples per sequence.
        For testing this parameter specifies the maximum number of samples per sequence.
        The default is 10.
    sample_img_size : [int, int]
        The size to which the detected areas are resized. The default is [224, 224].
    mode : str
        The mode in which the dataset is used. Can be either 'train', 'test' or 'detect'.
        In detect mode, the model is used to detect animals in the images. In train and test mode, the model is used
        to train or test a custom model for classification.
    """
    
    def __init__(
            self,
            path_labelfiles: str | PathLike,
            path_to_dataset: str | PathLike,
            path_to_detector_output: str | PathLike,
            categories_to_drop: list[str] = None,
            detector_model: str = "mdv5a",
            detection_confidence: float = 0.25,
            sample_length: int = 10,
            sample_img_size: [int, int] = [224, 224],
            mode: str = 'train',
            ):
        super().__init__()

        if mode in ['train', 'test', 'detect']:
            self.mode = mode
        else:
            raise ValueError("Please choose a mode from ['train', 'test', 'detect'].")
        
        if detection_confidence < 0.25:
            raise ValueError("Detection confidence must be at least 0.25.")
        self.detection_confidence = detection_confidence
        self.sample_length = sample_length
        self.sample_img_size = sample_img_size

        path_labelfiles = Path(path_labelfiles)
        if not path_labelfiles.exists():
            raise ValueError("The path to the label files does not exist.")
        self.path_labelfiles = path_labelfiles
        
        path_to_dataset = Path(path_to_dataset)
        if not path_to_dataset.exists():
            raise ValueError("The path to the dataset does not exist.")
        self.path_to_dataset = path_to_dataset
        
        path_to_detector_output = Path(path_to_detector_output)
        if self.mode != 'detect':
            if not path_to_detector_output.exists():
                raise ValueError("The path to the detector output does not exist. Detection output must be available for training.")
        else:    
            if not path_to_detector_output.exists():
                os.makedirs(path_to_detector_output)
            elif any(path_to_detector_output.iterdir()):
                raise ValueError("The path to the detector output contains files. Please clear or choose a different path.")
        self.path_to_detector_output = path_to_detector_output
        
        self.labelfiles = self.getting_all_files_of_type(self.path_labelfiles, file_type='.csv')

        if not self.mode == 'detect':
            self.categories_to_drop = categories_to_drop if categories_to_drop is not None else []
        else: 
            self.categories_to_drop = []

        self.ds_full = self.reading_all_metadata(
            list_of_files=self.labelfiles, 
            categories_to_drop=self.categories_to_drop
            )
        
        if self.ds_full['seq_id'].duplicated().any():
            duplicates = self.ds_full[self.ds_full['seq_id'].duplicated()]['seq_id'].tolist()
            raise ValueError(f"Duplicate seq_id(s) found in metadata: {duplicates[:5]} ...")
        
        if not self.mode == 'detect':
            train_seq_ids, test_seq_ids = train_test_split(
                self.ds_full['seq_id'],
                test_size=0.2,
                random_state=55,
                stratify=self.ds_full['label2']
                )
            
            if self.mode == 'train':
                active_seq_ids = train_seq_ids
            elif self.mode == 'test':
                active_seq_ids = test_seq_ids
            
            active_set = set(active_seq_ids)
            no_detected_set = set(self.get_seq_ids_with_no_detectection())

            excluded_seq_ids = list(active_set & no_detected_set)
            if excluded_seq_ids:
                warnings.warn(
                    f'{len(excluded_seq_ids)} sequences had no detections above {self.detection_confidence} confidence.\n'
                    f'Example: {excluded_seq_ids[:10]}',
                    UserWarning
                )

            active_seq_ids = list(active_set - no_detected_set)

        else:
            active_seq_ids = self.ds_full['seq_id'].unique().tolist()

        self.ds = self.ds_full[self.ds_full['seq_id'].isin(active_seq_ids)]
        self.seq_ids = self.ds['seq_id'].tolist()
        
        if mode == 'detect':
            self.detector_model = detector_model
            
            if self.detector_model not in model_string_to_model_version.keys():
                raise ValueError(f"The model {self.detector_model} is not supported. Please choose from {model_string_to_model_version.keys()}.")
            
            self.run_detector()
        


    def getting_all_files_of_type(
            self, 
            path: str | PathLike, 
            file_type: str = None, 
            get_full_path: bool = True
            ) -> list[str]:
        
        path = Path(path)
        files = []
        for file in os.listdir(path):
            if file_type is None or file.endswith(file_type):
                if get_full_path:
                    files.append(path / file)
                else:
                    files.append(file)
        return files
    
    def reading_all_metadata(
            self,
            list_of_files: list[PathLike],
            categories_to_drop: list[str]
            ) -> pd.DataFrame:
        
        metadata = pd.DataFrame()
        for file in list_of_files:
            metadata = pd.concat([metadata, pd.read_csv(file)], ignore_index=True)
            metadata = metadata.dropna(subset=['label2'])
            metadata = metadata[~metadata['label2'].isin(categories_to_drop)]
        return metadata
    
    def get_seq_ids_with_no_detectection(
            self,
            ) -> list[int]:
        detection_summary = pd.read_csv(
            self.path_to_detector_output / "detection_summary.csv",
            usecols=["seq_id", "max_conf"]
            )
        return detection_summary[detection_summary["max_conf"] < self.detection_confidence]["seq_id"].tolist()
    
    def get_all_images_of_sequence(
            self, 
            seq_id: int,
            )-> dict[str, PathLike]:
        image_dict = {}
        row = self.ds_full.loc[self.ds_full['seq_id'] == seq_id].squeeze()
        seq_path = Path(row['Directory'])
        all_files = row['all_files'].split(',')
        for file in all_files:
            image_dict[file] = self.path_to_dataset / seq_path / file
        return image_dict

    def run_detector(
            self,
            ) -> None:
        
        if self.mode != 'detect':
            raise ValueError("Only available if dataset is in detect mode.")
        
        runner = MegaDetectorRunner(
            model_path=self.detector_model,
            confidence=0.25
            )

        sequences = self.ds['seq_id'].unique().tolist()

        detection_rows = []

        for seq_id in sequences:
            seq_images = list(self.get_all_images_of_sequence(seq_id).values())
            output_file_path = self.path_to_detector_output / f"{seq_id}.json"
            detections = runner.run_on_images(
                images=seq_images,
                output_file_path=output_file_path
                )

            detection_row = {
                    "seq_id": seq_id,
                    "max_conf": max(detections) if len(detections) > 0 else 0,
                    "n_detections": len(detections),
                    "conf_list": json.dumps(detections)
                }
            
            detection_rows.append(detection_row)
        
        all_detections = pd.DataFrame(detection_rows, columns=["seq_id", "max_conf", "n_detections", "conf_list"])

        all_detections.to_csv(
            self.path_to_detector_output / "detection_summary.csv", 
            index=False,
            quoting=csv.QUOTE_NONNUMERIC
            )
            
    def getting_bb_list_for_seq(
            self,
            seq_id: int,
            confidence: float = None,
            ) -> list[dict]:
        
        if self.mode != 'detect':
            raise ValueError("Only available if dataset is in detect mode.")
        
        if confidence is None:
            confidence = self.detection_confidence

        path_to_detection_results = self.path_to_detector_output / f"{seq_id}.json"
        with open(path_to_detection_results, 'r') as f:
            data = json.load(f)

        bb_list = []

        for entry in data:
            file_name = entry['file']
            detections = entry.get('detections', [])

            for det in detections:
                if det['category'] == "1" and det['conf'] >= confidence:
                    bb_list({
                        'file': file_name,
                        'conf': det['conf'],
                        'bbox': det['bbox']
                    })
        
        bb_list = sorted(bb_list, key=lambda x: x['conf'], reverse=True)

        return bb_list

    def __len__(self) -> int:
        return len(self.ds)

    def __getitem__(self, index: int) -> Any: # still to be implemented
        seq_id = self.seq_ids[index]

        images = self.get_all_images_of_sequence(seq_id)
        bounding_boxes = self.getting_bb_list_for_seq(seq_id)


In [16]:
# Training view of the dataset; relies on detector output that already exists on disk.
dataset = MammaliaData(
    path_labelfiles='/cfs/earth/scratch/kraftjul/BA/output/test_set/',
    path_to_dataset=paths['dataset'],
    path_to_detector_output='/cfs/earth/scratch/kraftjul/BA/output/test2_MD_out',
    mode='train',
)

In [17]:
# seq_ids whose best detection is below the dataset's confidence threshold
dataset.get_seq_ids_with_no_detectection()

[]

In [15]:
# Metadata rows of the sequences that are active in the current mode
dataset.ds

Unnamed: 0,session,SerialNumber,seq_nr,seq_id,Directory,DateTime_start,DateTime_end,duration_seconds,first_file,last_file,n_files,all_files,label,duplicate_label,label2
1,1,H550HF07158873,1011,1001367,sessions/session_01/H550HF07158873_1,2019-09-24T23:49:08Z,2019-09-24T23:49:14Z,6.0,IMG_0382.JPG,IMG_0387.JPG,6,"IMG_0382.JPG,IMG_0383.JPG,IMG_0384.JPG,IMG_038...",apodemus_sp,0.0,apodemus_sp
2,4,H550HF07158878,58,4004783,sessions/session_04/W1-R23,2020-05-29T04:16:54Z,2020-05-29T04:17:20Z,26.0,IMG_0985.JPG,IMG_1005.JPG,21,"IMG_0985.JPG,IMG_0986.JPG,IMG_0987.JPG,IMG_098...",apodemus_sp,False,apodemus_sp
3,4,H550HF07158878,26,4004751,sessions/session_04/W1-R23,2020-05-29T01:31:18Z,2020-05-29T01:31:20Z,2.0,IMG_0295.JPG,IMG_0300.JPG,6,"IMG_0295.JPG,IMG_0296.JPG,IMG_0297.JPG,IMG_029...",apodemus_sp,False,apodemus_sp
4,4,H550HG09194894,118,4006753,sessions/session_04/W2-R74A,2020-06-08T22:51:50Z,2020-06-08T22:52:18Z,28.0,IMG_3175.JPG,IMG_3183.JPG,9,"IMG_3175.JPG,IMG_3176.JPG,IMG_3177.JPG,IMG_317...",apodemus_sp,False,apodemus_sp
5,4,H550HF07158873,32,4004487,sessions/session_04/W1-M7,2020-05-29T00:08:06Z,2020-05-29T00:08:16Z,10.0,IMG_0808.JPG,IMG_0819.JPG,12,"IMG_0808.JPG,IMG_0809.JPG,IMG_0810.JPG,IMG_081...",apodemus_sp,False,apodemus_sp
6,4,H550HF07158878,32,4000232,sessions/session_04/Testwoche1/R22,2020-05-08T01:04:28Z,2020-05-08T01:04:44Z,16.0,IMG_0304.JPG,IMG_0324.JPG,21,"IMG_0304.JPG,IMG_0305.JPG,IMG_0306.JPG,IMG_030...",apodemus_sp,False,apodemus_sp
9,4,H550HG09194886,11,4004905,sessions/session_04/W1-R25,2020-05-28T20:14:04Z,2020-05-28T20:14:08Z,4.0,IMG_0133.JPG,IMG_0138.JPG,6,"IMG_0133.JPG,IMG_0134.JPG,IMG_0135.JPG,IMG_013...",apodemus_sp,False,apodemus_sp
11,4,H550HF08161305,133,4006146,sessions/session_04/W2-R23,2020-06-13T11:26:40Z,2020-06-13T11:26:48Z,8.0,IMG_7219.JPG,IMG_7227.JPG,9,"IMG_7219.JPG,IMG_7220.JPG,IMG_7221.JPG,IMG_722...",myodes_glareolus,False,cricetidae
12,1,H550HF07158873,257,1000612,sessions/session_01/H550HF07158873_2,2019-09-04T02:24:31Z,2019-09-04T02:24:53Z,22.0,IMG_4123.JPG,IMG_4137.JPG,15,"IMG_4123.JPG,IMG_4124.JPG,IMG_4125.JPG,IMG_412...",myodes_glareolus,0.0,cricetidae
14,1,H550HF07158873,841,1001197,sessions/session_01/H550HF07158873_3,2019-09-16T01:12:23Z,2019-09-16T01:12:49Z,26.0,IMG_3568.JPG,IMG_3582.JPG,15,"IMG_3568.JPG,IMG_3569.JPG,IMG_3570.JPG,IMG_357...",myodes_glareolus,0.0,cricetidae


In [35]:
# Full image paths of sequence 1000003 (the values of the file name -> path mapping)
dataset.get_all_images_of_sequence(1000003).values()

dict_values([PosixPath('/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_2/RCNX0021.JPG'), PosixPath('/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_2/RCNX0022.JPG'), PosixPath('/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_2/RCNX0023.JPG')])

In [47]:
# Detect mode runs the detector while the dataset is constructed. The output
# directory must be empty or not exist yet, otherwise a ValueError is raised.
dataset = MammaliaData(
    path_labelfiles='/cfs/earth/scratch/kraftjul/BA/output/test_set/',
    path_to_dataset=paths['dataset'],
    path_to_detector_output='/cfs/earth/scratch/kraftjul/BA/output/test2_MD_out',
    mode='detect',
)

ValueError: The path to the detector output contains files. Please clear or choose a different path.

In [23]:
# seq_id of every active sequence, in metadata row order
dataset.seq_ids

[1002472,
 1001367,
 4004783,
 4004751,
 4006753,
 4004487,
 4000232,
 4004905,
 1000883,
 4006146,
 1000612,
 1001197,
 4006473,
 1000441,
 4013208,
 4016418,
 6000305,
 6000652,
 6000414,
 6000794,
 6000402,
 6000301,
 1000003,
 6000358,
 4014783,
 4010560,
 4010914,
 4012988,
 4017512,
 1000210,
 4017527,
 4014264]

In [110]:
# Look up the image files that belong to sequence 1002472


dataset.get_all_images_of_sequence(1002472)



[PosixPath('sessions/session_01/H550HF08161327_1/IMG_0187.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0188.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0189.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0190.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0191.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0192.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0193.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0194.JPG'),
 PosixPath('sessions/session_01/H550HF08161327_1/IMG_0195.JPG')]

In [3]:
# These two helpers are not part of the notebook's import cell; importing them here
# keeps this cell runnable on a fresh kernel.
from megadetector.utils import path_utils
from megadetector.detection.run_detector_batch import load_and_run_detector_batch

# Pick a folder to run MD on recursively, and an output file
image_folder = os.path.expanduser('/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1')
output_file = os.path.expanduser('/cfs/earth/scratch/kraftjul/BA/output/megadetector_output_tes2.json')

# Recursively find images
image_file_names = path_utils.find_images(image_folder,recursive=True)

# This will automatically download MDv5a; you can also specify a filename.
results = load_and_run_detector_batch('MDV5A', image_file_names)

# Write results to a format that Timelapse and other downstream tools like.
write_results_to_file(results,
                      output_file,
                      relative_path_base=image_folder)


Bypassing download of already-downloaded file md_v5a.0.0.pt
Model v5a.0.0 available at /tmp/megadetector_models/md_v5a.0.0.pt
PyTorch reports 0 available CUDA devices
GPU available: False
Loading PT detector with compatibility mode classic


Fusing layers... 
Fusing layers... 
[W331 10:32:31.782568958 NNPACK.cpp:62] Could not initialize NNPACK! Reason: Unsupported hardware.
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs


Loaded model in 25.97 seconds


  0%|          | 0/3 [00:00<?, ?it/s]

Processing image /cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1/RCNX0001.JPG


 33%|███▎      | 1/3 [00:01<00:02,  1.32s/it]

Processing image /cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1/RCNX0002.JPG


 67%|██████▋   | 2/3 [00:02<00:01,  1.22s/it]

Processing image /cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1/RCNX0003.JPG


100%|██████████| 3/3 [00:03<00:00,  1.22s/it]

Output file saved at /cfs/earth/scratch/kraftjul/BA/output/megadetector_output_tes2.json





{'images': [{'file': 'RCNX0001.JPG',
   'detections': [{'category': '1',
     'conf': 0.771,
     'bbox': [0.2236, 0.5555, 0.1225, 0.1402]},
    {'category': '2', 'conf': 0.184, 'bbox': [0.7783, 0.0215, 0.2216, 0.227]},
    {'category': '2', 'conf': 0.055, 'bbox': [0.0004, 0.0201, 0.2758, 0.3895]},
    {'category': '2',
     'conf': 0.019,
     'bbox': [0.7812, 0.0215, 0.2187, 0.8006]}]},
  {'file': 'RCNX0002.JPG',
   'detections': [{'category': '1',
     'conf': 0.487,
     'bbox': [0.2197, 0.5701, 0.185, 0.1354]},
    {'category': '2', 'conf': 0.087, 'bbox': [0.7802, 0.0215, 0.2197, 0.2187]},
    {'category': '2', 'conf': 0.084, 'bbox': [0.0004, 0.0208, 0.27, 0.3923]},
    {'category': '2', 'conf': 0.031, 'bbox': [0.0004, 0.0222, 0.267, 0.9569]},
    {'category': '2', 'conf': 0.019, 'bbox': [0.7822, 0.0215, 0.2177, 0.7951]},
    {'category': '3', 'conf': 0.015, 'bbox': [0.0, 0.0256, 1.0, 0.9673]},
    {'category': '1', 'conf': 0.005, 'bbox': [0.0078, 0.0222, 0.9916, 0.3201]},
    {'c

In [4]:
# Image files found recursively under image_folder
image_file_names

['/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1/RCNX0001.JPG',
 '/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1/RCNX0002.JPG',
 '/cfs/earth/scratch/iunr/shared/iunr-mammaliabox/dataset/sessions/session_01/H_1/RCNX0003.JPG']