# DETR Smoke Detection Training on Pyronear Dataset (Colab Ready)

This notebook trains a [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) model on the Pyronear smoke dataset, downloading data directly from Hugging Face. It uses PyTorch Lightning and torchmetrics to provide detailed YOLO-style metrics (mAP@0.5, mAP@0.5:0.95, precision, recall, etc.) during training.


## 📦 Setup and Install Dependencies

In [1]:
!pip install -q torch torchvision pytorch-lightning torchmetrics transformers datasets huggingface_hub pycocotools opencv-python

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 📚 Imports

In [2]:
import os
import json
from PIL import Image
import torch
from torchvision.datasets import CocoDetection
from torchvision import transforms
from torch.utils.data import DataLoader
from transformers import DetrImageProcessor, DetrForObjectDetection
import matplotlib.pyplot as plt
from datasets import load_dataset
import pytorch_lightning as pl
from torchmetrics.detection.mean_ap import MeanAveragePrecision


## 📥 Download Pyronear Dataset from Hugging Face
We use the [datasets](https://huggingface.co/docs/datasets) library to download the Pyronear smoke dataset.

In [3]:
# Change to the correct dataset repo if needed

dataset = load_dataset('pyronear/pyro-sdis', split='train')
# If the dataset is not in COCO format, conversion logic will be added below.

README.md:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

train-00000-of-00006.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00001-of-00006.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

train-00002-of-00006.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

train-00003-of-00006.parquet:   0%|          | 0.00/483M [00:00<?, ?B/s]

train-00004-of-00006.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

train-00005-of-00006.parquet:   0%|          | 0.00/483M [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/390M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29537 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/4099 [00:00<?, ? examples/s]

In [4]:
val_dataset = load_dataset('pyronear/pyro-sdis', split='val')

In [6]:
from google.colab import drive
import os
drive.mount('/content/drive')
CHECKPOINT_DIR = '/content/drive/MyDrive/detr_checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)


Mounted at /content/drive


## 🔁 Convert YOLO to COCO format (if needed)
If the dataset is in YOLO format, convert it to COCO format for DETR compatibility.

In [7]:
 import json
from tqdm import tqdm

def convert_yolo_to_coco(dataset, class_list):
    coco_dict = {
        "images": [],
        "annotations": [],
        "categories": []
    }
    annotation_id = 1
    for idx, data in enumerate(tqdm(dataset)):
        image_id = idx + 1
        img = data['image']
        width, height = img.size
        coco_dict["images"].append({
            "id": image_id,
            "file_name": data['image_name'],
            "width": width,
            "height": height
        })
        # Handle multiple annotations per image if needed
        annos = data['annotations'].strip().split('\n')
        for anno in annos:
            parts = anno.strip().split()
            if len(parts) != 5:
                continue  # skip malformed lines
            class_id, x_center, y_center, w, h = map(float, parts)
            # Convert YOLO to COCO bbox
            x = (x_center - w/2) * width
            y = (y_center - h/2) * height
            w_box = w * width
            h_box = h * height
            coco_dict["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": int(class_id) + 1,  # COCO ids start at 1
                "bbox": [x, y, w_box, h_box],
                "area": w_box * h_box,
                "iscrowd": 0
            })
            annotation_id += 1
    # Add categories
    for i, name in enumerate(class_list):
        coco_dict["categories"].append({
            "id": i + 1,
            "name": name
        })
    return coco_dict

# Example usage:
class_list = ["smoke"]  # Update if you have more classes
coco_dict = convert_yolo_to_coco(dataset, class_list)


# Save to file if needed:
with open("annotations.json", "w") as f:
    json.dump(coco_dict, f)

100%|██████████| 29537/29537 [02:04<00:00, 237.79it/s]


In [8]:
coco_dict_val = convert_yolo_to_coco(val_dataset, class_list=class_list)
# Save to file if needed:
with open("annotations.json", "w") as f:
    json.dump(coco_dict_val, f)

100%|██████████| 4099/4099 [00:16<00:00, 245.40it/s]


## 🗂️ Prepare DataLoaders
Wrap the COCO dataset for use with PyTorch Lightning.

In [9]:
from torch.utils.data import Dataset, DataLoader

class InMemoryCocoDataset(Dataset):
    def __init__(self, dataset, coco_dict, transform=None):
        self.dataset = dataset
        self.coco_dict = coco_dict
        self.transform = transform
        # Map image_id to annotations
        self.ann_map = {}
        for ann in coco_dict['annotations']:
            self.ann_map.setdefault(ann['image_id'], []).append(ann)
        # List of images
        self.images = coco_dict['images']

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        image_id = img_info['id']
        img = self.dataset[idx]['image']
        anns = self.ann_map.get(image_id, [])
        if len(anns) > 0:
            boxes = torch.tensor([ann['bbox'] for ann in anns], dtype=torch.float32)
            labels = torch.tensor([ann['category_id'] for ann in anns], dtype=torch.int64)
        else:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
        target = {
            'boxes': boxes,
            'labels': labels
        }
        if self.transform:
            img = self.transform(img)
        return img, target

# Usage:
transform = transforms.Compose([transforms.ToTensor()])
custom_dataset = InMemoryCocoDataset(dataset, coco_dict, transform=transform)
dataloader = DataLoader(custom_dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=lambda x: tuple(zip(*x)))

custom_dataset_val = InMemoryCocoDataset(val_dataset, coco_dict_val, transform=transform)
val_dataloader = DataLoader(custom_dataset_val, batch_size=4, shuffle=True, num_workers=2, collate_fn=lambda x: tuple(zip(*x)))

## ⚡ PyTorch Lightning Module with YOLO-style Metrics
We use torchmetrics' MeanAveragePrecision for mAP@0.5, mAP@0.5:0.95, precision, recall, etc.

In [10]:
from transformers import DetrForObjectDetection, DetrConfig, DetrImageProcessor
import torch
import pytorch_lightning as pl
from torchmetrics.detection.mean_ap import MeanAveragePrecision

from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    dirpath=CHECKPOINT_DIR,                   # Save into your Google Drive
    filename='detr-{epoch:02d}-{val_map_epoch:.2f}',  # filename format
    monitor='val_map_epoch',                   # OR 'val_loss' if you prefer
    mode='max',                                # 'max' for mAP, 'min' for loss
    save_top_k=1,                              # Only keep the best checkpoint
    verbose=True
)


class DETRLightningModule(pl.LightningModule):
    def __init__(self, num_classes):
        super().__init__()
        # Load config and set num_labels
        config = DetrConfig.from_pretrained('facebook/detr-resnet-50')
        config.num_labels = num_classes
        self.model = DetrForObjectDetection.from_pretrained(
            'facebook/detr-resnet-50', config=config, ignore_mismatched_sizes=True
        )
        self.processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50')
        self.map_metric = MeanAveragePrecision(class_metrics=True)

    def forward(self, pixel_values):
        return self.model(pixel_values)

    def training_step(self, batch, batch_idx):
        images, targets = batch
        encoding = self.processor(images, return_tensors="pt").to(self.device)
        labels = [{"class_labels": t['labels'], "boxes": t['boxes']} for t in targets]
        outputs = self.model(**encoding, labels=labels)
        loss = outputs.loss

        # Handle target sizes for PIL or tensor images
        if isinstance(images[0], torch.Tensor):
            target_sizes = torch.stack([torch.tensor(img.shape[-2:]) for img in images]).to(self.device)
        else:
            target_sizes = torch.tensor([img.size[::-1] for img in images]).to(self.device)

        results = self.processor.post_process_object_detection(
            outputs, target_sizes=target_sizes, threshold=0.5
        )

        # Move predictions to device
        for r in results:
            r["boxes"] = r["boxes"].to(self.device)
            r["labels"] = r["labels"].to(self.device)
            r["scores"] = r["scores"].to(self.device)

        # Move targets to device
        formatted_targets = []
        for t in targets:
            formatted_targets.append({
                "boxes": t["boxes"].to(self.device),
                "labels": t["labels"].to(self.device)
            })

        self.map_metric.update(results, formatted_targets)
        metrics = self.map_metric.compute()
        # Safely format metrics for logging
        scalar_metrics = {}
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                if v.numel() == 1:
                    scalar_metrics[k] = v.item()
                else:
                    # For things like map_per_class and mar_100_per_class, take the mean, ignoring -1 (invalid classes)
                    if (k.endswith('per_class')):
                        v = v[v != -1]  # Ignore invalid -1 values
                        if v.numel() > 0:
                            scalar_metrics[k + '_mean'] = v.mean().item()
            elif isinstance(v, (float, int)):
                scalar_metrics[k] = v
        self.log_dict(scalar_metrics, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        images, targets = batch
        encoding = self.processor(images, return_tensors="pt").to(self.device)
        labels = [{"class_labels": t['labels'], "boxes": t['boxes']} for t in targets]
        outputs = self.model(**encoding, labels=labels)
        loss = outputs.loss

        # Handle target sizes for PIL or tensor images
        if isinstance(images[0], torch.Tensor):
            target_sizes = torch.stack([torch.tensor(img.shape[-2:]) for img in images]).to(self.device)
        else:
            target_sizes = torch.tensor([img.size[::-1] for img in images]).to(self.device)

        results = self.processor.post_process_object_detection(
            outputs, target_sizes=target_sizes, threshold=0.5
        )

        # Move predictions to device
        for r in results:
            r["boxes"] = r["boxes"].to(self.device)
            r["labels"] = r["labels"].to(self.device)
            r["scores"] = r["scores"].to(self.device)

        # Move targets to device
        formatted_targets = []
        for t in targets:
            formatted_targets.append({
                "boxes": t["boxes"].to(self.device),
                "labels": t["labels"].to(self.device)
            })

        self.map_metric.update(results, formatted_targets)
        metrics = self.map_metric.compute()
        # Safely format metrics for logging
        scalar_metrics = {}
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                if v.numel() == 1:
                    scalar_metrics[k] = v.item()
                else:
                    # For things like map_per_class and mar_100_per_class, take the mean, ignoring -1 (invalid classes)
                    if (k.endswith('per_class')):
                        v = v[v != -1]  # Ignore invalid -1 values
                        if v.numel() > 0:
                            scalar_metrics[k + '_mean'] = v.mean().item()
            elif isinstance(v, (float, int)):
                scalar_metrics[k] = v
        self.log('val_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-4)

    def on_train_epoch_end(self):
      train_metrics = self.map_metric.compute()
      self.log_dict({f'train_{k}': v for k, v in train_metrics.items()}, prog_bar=True)
      self.map_metric.reset()

    def on_validation_epoch_end(self):
      val_metrics = self.map_metric.compute()
      self.log_dict({f'val_{k}': v for k, v in val_metrics.items()}, prog_bar=True)
      self.map_metric.reset()




## 🚂 Train the Model
Set up the PyTorch Lightning Trainer and start training.

In [11]:
print("Train split length:", len(dataset))
print("Train COCO images:", len(coco_dict['images']))
print("Val split length:", len(val_dataset))
print("Val COCO images:", len(coco_dict_val['images']))

Train split length: 29537
Train COCO images: 29537
Val split length: 4099
Val COCO images: 4099


In [None]:
# Example usage (replace with your actual dataloader and num_classes)
model = DETRLightningModule(num_classes=2)
trainer = pl.Trainer(max_epochs=10, accelerator='gpu' if torch.cuda.is_available() else 'cpu',callbacks=[checkpoint_callback])
trainer.fit(model,dataloader, val_dataloader)


config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type                   | Params | Mode 
--------------------------------------------------------------
0 | model      | DetrForObjectDetection | 41.5 M | eval 
1 | map_metric | MeanAveragePrecision   | 0      | train
--------------------------------------------------------------
41.3 M    Trainable params
222 K     Non-trainable params
41.5 M    Total params
166.008   Total estimated model params size (MB)
1         Modules in train mode
399       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 3. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Training: |          | 0/? [00:00<?, ?it/s]

## 📊 Visualize Metrics and Predictions
Plot or print the metrics after each epoch, and visualize sample predictions.

In [None]:
# 📊 Plot mAP, Precision, and Recall per epoch from Lightning logs

import matplotlib.pyplot as plt

def plot_metrics(trainer):
    # Extract metrics from the trainer's logger
    metrics = trainer.callback_metrics
    epochs = range(1, trainer.current_epoch + 2)

    # These keys may differ depending on your metric names/logs
    mAP_50 = [metrics.get(f"map_50_epoch_{e}", None) for e in range(trainer.current_epoch + 1)]
    mAP_95 = [metrics.get(f"map_epoch_{e}", None) for e in range(trainer.current_epoch + 1)]
    precision = [metrics.get(f"precision_epoch_{e}", None) for e in range(trainer.current_epoch + 1)]
    recall = [metrics.get(f"recall_epoch_{e}", None) for e in range(trainer.current_epoch + 1)]

    plt.figure(figsize=(10, 6))
    plt.plot(epochs, mAP_50, label="mAP@0.5")
    plt.plot(epochs, mAP_95, label="mAP@0.5:0.95")
    plt.plot(epochs, precision, label="Precision")
    plt.plot(epochs, recall, label="Recall")
    plt.xlabel("Epoch")
    plt.ylabel("Metric Value")
    plt.title("Detection Metrics per Epoch")
    plt.legend()
    plt.grid()
    plt.show()

# Usage example (run after training):
# plot_metrics(trainer)


In [None]:
# 🖼️ Visualize sample predictions from the trained model

import torch
import matplotlib.pyplot as plt
import numpy as np

def plot_predictions(model, processor, dataloader, device, class_names, num_images=4, score_threshold=0.5):
    model.eval()
    images_shown = 0

    for images, targets in dataloader:
        # Move images to device
        pixel_values = processor(images, return_tensors="pt", padding=True).pixel_values.to(device)
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values)

        # Post-process outputs
        results = processor.post_process_object_detection(outputs, target_sizes=[img.size[::-1] for img in images], threshold=score_threshold)

        for idx, (image, result) in enumerate(zip(images, results)):
            plt.figure(figsize=(8, 6))
            plt.imshow(image)
            ax = plt.gca()
            boxes = result["boxes"].cpu().numpy()
            scores = result["scores"].cpu().numpy()
            labels = result["labels"].cpu().numpy()

            for box, score, label in zip(boxes, scores, labels):
                xmin, ymin, xmax, ymax = box
                ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, color='red', linewidth=2))
                ax.text(xmin, ymin, f'{class_names[label]}: {score:.2f}', bbox=dict(facecolor='yellow', alpha=0.5), fontsize=10, color='black')
            plt.axis('off')
            plt.show()
            images_shown += 1
            if images_shown >= num_images:
                return

# Usage example (after training):
# class_names = ["background", "smoke"]  # adjust as needed
# plot_predictions(model.model, model.processor, val_dataloader, device="cuda" if torch.cuda.is_available() else "cpu", class_names=class_names)