# 03 — Evaluate

Evaluate the base and fine-tuned DeepForest models with internal metrics and PR/F1 at a selected IoU.

Inputs
- `data/tiles/`
- `data/labels/df_labels_valid.csv`
- `data/labels/df_labels_test.csv`
- `models/deepforest_ft.pt`

Outputs
- Printed internal metrics (IoU, mAP, mAP@50/75)
- Precision/recall/F1 at the selected IoU, using the score threshold that maximizes F1 on VALID and reporting the result on TEST
- (optional) quick prediction plots

Steps
1) Load base (and optional fine-tuned) model.
2) Run DeepForest validation on the TEST labels to get internal metrics.
3) Sweep the score threshold on VALID and pick the value that maximizes F1 (NMS kept fixed).
4) Report P/R/F1 on TEST using that threshold.

In [None]:
# Root and dependencies

import sys
from pathlib import Path

# The repository root is taken to be the parent of the current working directory.
REPO_ROOT = Path.cwd().parent
repo_root_str = str(REPO_ROOT)
# Put the root at the front of sys.path (once) so the `scripts` package can be imported.
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)

import rasterio
from deepforest import main
import numpy as np
import torch
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import pandas as pd
from torchmetrics.detection.mean_ap import MeanAveragePrecision

from scripts.eval_metrics import load_sets, prf1_at_threshold
from scripts.vis_utils import plot_predictions

# Project directories, all derived from the repository root
DATA = REPO_ROOT.joinpath("data")
TILES = DATA.joinpath("tiles")
LABELS = DATA.joinpath("labels")
MODELS = REPO_ROOT.joinpath("models")

# Label tables for the validation and test splits
VALID_CSV = LABELS.joinpath("df_labels_valid.csv")
TEST_CSV = LABELS.joinpath("df_labels_test.csv")

print("REPO_ROOT:", REPO_ROOT)

## EVALUATE PERFORMANCE OF THE BASE MODEL

In [None]:
# Base model: a DeepForest instance with the published tree weights

# identifiers of the pretrained weights passed to load_model
base_weights = {"model_name": "weecology/deepforest-tree", "revision": "main"}

m_base = main.deepforest()
m_base.load_model(**base_weights)
# switch the underlying network to eval mode (last expression, so the notebook echoes it)
m_base.model.eval()

In [None]:
# DeepForest's own validation metrics for the base model, run against the TEST labels

# point the validation config at the test annotations and the tile directory
for cfg_key, cfg_path in (("csv_file", TEST_CSV), ("root_dir", TILES)):
    m_base.config["validation"][cfg_key] = str(cfg_path)

# build the trainer, then run its validation loop on the model
m_base.create_trainer()
results = m_base.trainer.validate(m_base)

print(results[0])  # dict with iou, map, map_50, map_75, losses, etc.

In [None]:
# Shared evaluation settings for the max-F1 search; identical for both models so the comparison is fair

IOU_THR = 0.3  # IoU threshold handed to prf1_at_threshold
NMS = 0.05  # this is deepforest's retinanet internal default, 0.3-0.5 is common
THRS = np.linspace(0.0, 0.9, 91)  # candidate score thresholds: 0.00, 0.01, ..., 0.90
# NOTE(review): thresholds below the model's own score cut-off only matter if
# load_sets lowers that cut-off before predicting - confirm in scripts/eval_metrics.

# Path-typed aliases used by the PR/F1 cells
VAL_CSV = Path(VALID_CSV)
TEST_CSV = Path(TEST_CSV)
ROOT = Path(TILES)

In [None]:
# Base model: choose the operating point (max-F1 @ IoU) on VALID, then score TEST with it

# 1) sweep every candidate score threshold over the validation set
GT_val, PRED_val = load_sets(m_base, VAL_CSV, ROOT, NMS)
sweep = [(*prf1_at_threshold(GT_val, PRED_val, thr, IOU_THR), thr) for thr in THRS]
# each row is (P, R, F1, threshold); keep the row with the highest F1 (first one wins on ties)
best = max(sweep, key=lambda row: row[2])
P_val, R_val, F1_val, THR_SEL = best
print(f"VAL @IoU={IOU_THR}: max-F1={F1_val:.3f} (thr={THR_SEL:.2f}, P={P_val:.3f}, R={R_val:.3f})")

# 2) apply the selected threshold, unchanged, to the test set
GT_test, PRED_test = load_sets(m_base, TEST_CSV, ROOT, NMS)
P_t, R_t, F1_t = prf1_at_threshold(GT_test, PRED_test, THR_SEL, IOU_THR)
print(f"TEST @IoU={IOU_THR}: P={P_t:.3f}  R={R_t:.3f}  F1={F1_t:.3f}  (thr={THR_SEL:.2f}, NMS={NMS})")

In [None]:
# Plot image example: run the base model on one tile and draw its predictions

# Image.open keeps the file open until it is closed explicitly, so read the tile
# inside a context manager; convert() gives back a loaded copy that stays usable
# after the file handle is released.
with Image.open(TILES / "tile03.tif") as src:  # insert tile name
    im = src.convert("RGB")
arr = np.array(im)
pred = m_base.predict_image(arr)
# a None result is reported as "No detections"
print(pred.head() if pred is not None else "No detections")

plot_predictions(arr, pred, title="Predictions - Base model")

## EVALUATE PERFORMANCE OF THE FINE-TUNED MODEL

In [None]:
# Fine-tuned model: restore it from the checkpoint stored in the models directory
FT = MODELS / "deepforest_ft.pt"
checkpoint_path = str(FT)

m_ft = main.deepforest.load_from_checkpoint(checkpoint_path)
m_ft.create_trainer()
# switch the underlying network to eval mode (last expression, so the notebook echoes it)
m_ft.model.eval()

In [None]:
# DeepForest's own validation metrics for the fine-tuned model, run against the TEST labels

# point the validation config at the test annotations and the tile directory
for cfg_key, cfg_path in (("csv_file", TEST_CSV), ("root_dir", TILES)):
    m_ft.config["validation"][cfg_key] = str(cfg_path)

# build the trainer, then run its validation loop on the model
m_ft.create_trainer()
results = m_ft.trainer.validate(m_ft)

print(results[0])

In [None]:
# Fine-tuned model: choose the operating point (max-F1 @ IoU) on VALID, then score TEST with it

# 1) sweep every candidate score threshold over the validation set
GT_val, PRED_val = load_sets(m_ft, VAL_CSV, ROOT, NMS)
sweep = [(*prf1_at_threshold(GT_val, PRED_val, thr, IOU_THR), thr) for thr in THRS]
# each row is (P, R, F1, threshold); keep the row with the highest F1 (first one wins on ties)
best = max(sweep, key=lambda row: row[2])
P_val, R_val, F1_val, THR_SEL = best
print(f"VAL @IoU={IOU_THR}: max-F1={F1_val:.3f} (thr={THR_SEL:.2f}, P={P_val:.3f}, R={R_val:.3f})")

# 2) apply the selected threshold, unchanged, to the test set
GT_test, PRED_test = load_sets(m_ft, TEST_CSV, ROOT, NMS)
P_t, R_t, F1_t = prf1_at_threshold(GT_test, PRED_test, THR_SEL, IOU_THR)
print(f"TEST @IoU={IOU_THR}: P={P_t:.3f}  R={R_t:.3f}  F1={F1_t:.3f}  (thr={THR_SEL:.2f}, NMS={NMS})")

In [None]:
# Plot image example: run the fine-tuned model on one tile and draw its predictions

# Image.open keeps the file open until it is closed explicitly, so read the tile
# inside a context manager; convert() gives back a loaded copy that stays usable
# after the file handle is released.
with Image.open(TILES / "tile03.tif") as src:  # insert tile name
    im = src.convert("RGB")
arr = np.array(im)
pred = m_ft.predict_image(arr)
# a None result is reported as "No detections"
print(pred.head() if pred is not None else "No detections")

plot_predictions(arr, pred, title="Predictions - Fine tuned model")