# 00 — Data preparation

Prepare tiles and labels for DeepForest training.

Inputs
- `data/tiles/` — GeoTIFF tiles, RGB or RGBA, all in the same CRS.
- `data/labels/BOXES.shp` — annotated boxes, in the same CRS as the tiles.
- `data/labels/splits.json` — filenames per split.

Outputs
- `data/labels/deepforest_labels.csv` (image_path,xmin,ymin,xmax,ymax,label)
- `data/labels/df_labels_train.csv`
- `data/labels/df_labels_valid.csv`
- `data/labels/df_labels_test.csv`

Steps
1) (optional) Convert RGBA to RGB in place.
2) (optional) Clip or pad all tiles to fit a selected reference tile.
3) Build DeepForest CSV from BOXES.shp (shapefile with annotated boxes).
4) Split the CSV into train/valid/test using `splits.json` (edit the JSON file to change the split).

In [None]:
# Root and dependencies

import sys
from pathlib import Path

# The repository root is taken to be the parent of the current working
# directory; it is put at the front of sys.path (once) so that the
# `scripts` package imported below can be resolved.
REPO_ROOT = Path.cwd().parent
_repo_root_str = str(REPO_ROOT)
if _repo_root_str not in sys.path:
    sys.path.insert(0, _repo_root_str)
print("REPO_ROOT:", REPO_ROOT)

import rasterio
from rasterio.shutil import copy as rio_copy
import numpy as np
import geopandas as gpd
from shapely.geometry import box
import pandas as pd
import json

from scripts.rgba_to_rgb import convert_folder_inplace
from scripts.clip_tiles import fit_folder
from scripts.boxes_to_csv import build_deepforest_csv, split_labels

# Project directories, all derived from REPO_ROOT

DATA = REPO_ROOT.joinpath("data")
TILES = DATA.joinpath("tiles")      # tiles folder
LABELS = DATA.joinpath("labels")    # labels folder (shapefile, splits, CSVs)
MODELS = REPO_ROOT.joinpath("models")

In [None]:
# CONVERT RGBA TO RGB
# Optional step: skip this cell if the tiles are already RGB.
# The conversion is done in place (per the helper's name), so expect the
# files under TILES to be modified rather than copied.

# NOTE(review): judging by the label printed below, `stats` presumably holds
# the counts of tiles kept as RGB, converted, and other -- confirm against
# scripts/rgba_to_rgb.py.
stats = convert_folder_inplace(TILES)
print("Kept RGB, Converted, Other:", stats)

In [None]:
# Optional step: clip or pad every tile in TILES so it fits a reference tile.

# Reference tile -- change the filename to select a different one.
REF = TILES.joinpath("tile01.tif")

# NOTE(review): padval=0 is presumably the fill value used when a tile has to
# be padded -- confirm against scripts/clip_tiles.py.
fit_folder(dir_path=TILES, ref_path=REF, padval=0)
print("Done:", TILES)

In [None]:
# Build the DeepForest label CSVs from the annotated shapefile

# Input files
SPLITS_JSON = LABELS.joinpath("splits.json")
VECT_PATH = LABELS.joinpath("BOXES.shp")

# Output: the combined label table
OUT_CSV = LABELS.joinpath("deepforest_labels.csv")

# Step 1: write deepforest_labels.csv from the boxes in the shapefile.
build_deepforest_csv(
    out_csv=OUT_CSV,
    vect_path=VECT_PATH,
    tiles_dir=TILES,
)

# Step 2: write df_labels_train/valid/test.csv according to splits.json.
split_labels(
    splits_json=SPLITS_JSON,
    base_csv=OUT_CSV,
    labels_dir=LABELS,
)