In [15]:
import geopandas as gpd 

In [17]:
# Load the tiles layer and the trees layer from the data folder.
tiles, trees_data = (
    gpd.read_file(source)
    for source in ("../data/tiles.geojson", "../data/trees_box.geojson")
)

In [18]:
# Reproject both layers to EPSG:4326 so they share one CRS for the spatial
# operations below. The names are rebound to the reprojected frames rather
# than mutating the loaded frames with inplace=True.
tiles = tiles.to_crs(epsg=4326)
trees_data = trees_data.to_crs(epsg=4326)

Unnamed: 0,id,title,geometry
0,"Tile(x=6773, y=293571, z=19)","XYZ tile Tile(x=6773, y=293571, z=19)","POLYGON ((-175.34935 -21.08706, -175.34935 -21..."
1,"Tile(x=6773, y=293572, z=19)","XYZ tile Tile(x=6773, y=293572, z=19)","POLYGON ((-175.34935 -21.0877, -175.34935 -21...."
2,"Tile(x=6773, y=293573, z=19)","XYZ tile Tile(x=6773, y=293573, z=19)","POLYGON ((-175.34935 -21.08834, -175.34935 -21..."
3,"Tile(x=6773, y=293574, z=19)","XYZ tile Tile(x=6773, y=293574, z=19)","POLYGON ((-175.34935 -21.08898, -175.34935 -21..."
4,"Tile(x=6773, y=293575, z=19)","XYZ tile Tile(x=6773, y=293575, z=19)","POLYGON ((-175.34935 -21.08963, -175.34935 -21..."
...,...,...,...
546,"Tile(x=6791, y=293595, z=19)","XYZ tile Tile(x=6791, y=293595, z=19)","POLYGON ((-175.33699 -21.10244, -175.33699 -21..."
547,"Tile(x=6791, y=293596, z=19)","XYZ tile Tile(x=6791, y=293596, z=19)","POLYGON ((-175.33699 -21.10308, -175.33699 -21..."
548,"Tile(x=6791, y=293597, z=19)","XYZ tile Tile(x=6791, y=293597, z=19)","POLYGON ((-175.33699 -21.10372, -175.33699 -21..."
549,"Tile(x=6791, y=293598, z=19)","XYZ tile Tile(x=6791, y=293598, z=19)","POLYGON ((-175.33699 -21.10436, -175.33699 -21..."


In [35]:
import re

def parse_tile_id(tile_id_str):
    """Extract the x, y and z parts of a ``Tile(x=.., y=.., z=..)`` string.

    Args:
        tile_id_str: Text that starts with ``Tile(x=<digits>, y=<digits>, z=<digits>)``.

    Returns:
        Tuple ``(x, y, z)`` of the matched digit strings (not converted to int).

    Raises:
        ValueError: If the string does not start with that pattern.
    """
    parsed = re.match(r"Tile\(x=(\d+), y=(\d+), z=(\d+)\)", tile_id_str)
    if parsed is None:
        raise ValueError(f"Cannot parse tile ID: {tile_id_str}")
    return parsed.groups()

### Convert the tree annotations into one GeoJSON label file per tile

In [38]:
import os
from pathlib import Path

def split_geojson_by_tiles(trees_gdf, tiles_gdf, output_dir, prefix="OAM"):
    """Clip trees to each tile and save one GeoJSON file per non-empty tile.

    Both frames are compared as-is; nothing here reprojects them, so they
    must already share a CRS.

    Args:
        trees_gdf: GeoDataFrame with the tree geometries to split.
        tiles_gdf: GeoDataFrame of tiles. Each row needs a geometry and an
            ``id`` value that ``parse_tile_id`` can parse.
        output_dir: Folder that receives the files (created if missing).
        prefix: Filename prefix; files are named ``{prefix}-{x}-{y}-{z}.geojson``.

    Returns:
        dict of counters: ``processed`` (files written), ``skipped`` (tiles
        with no tree left to write), ``errors`` (tiles that raised) and
        ``total_trees`` (clipped features written across all tiles).
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stats = {'processed': 0, 'skipped': 0, 'errors': 0, 'total_trees': 0}

    for idx, tile in tiles_gdf.iterrows():
        try:
            tile_geom = tile.geometry

            x, y, z = parse_tile_id(tile['id'])
            output_path = output_dir / f"{prefix}-{x}-{y}-{z}.geojson"

            intersecting_trees = trees_gdf[trees_gdf.intersects(tile_geom)].copy()
            if intersecting_trees.empty:
                stats['skipped'] += 1
                continue

            # intersects() also matches trees that merely touch the tile edge;
            # clipping those yields line/point slivers. keep_geom_type=True asks
            # clip to keep only results of the input geometry type.
            clipped_trees = gpd.clip(intersecting_trees, tile_geom, keep_geom_type=True)

            # Drop anything that ended up with no geometry at all.
            geometries = clipped_trees.geometry
            clipped_trees = clipped_trees[~(geometries.is_empty | geometries.isna())]

            # Nothing usable left after clipping: treat the tile as empty
            # instead of writing a label file without features.
            if clipped_trees.empty:
                stats['skipped'] += 1
                continue

            clipped_trees.to_file(output_path, driver="GeoJSON")

            stats['processed'] += 1
            stats['total_trees'] += len(clipped_trees)

        except Exception as e:
            # Best effort: report the failing tile and carry on with the rest.
            print(f"Error processing tile {idx}: {e}")
            stats['errors'] += 1

    return stats

In [39]:
# Write the per-tile GeoJSON files to ../data/labels and keep the counters.
stats = split_geojson_by_tiles(
    trees_gdf=trees_data,
    tiles_gdf=tiles,
    output_dir="../data/labels",
)

In [40]:
# Display the counters returned by split_geojson_by_tiles.
stats

{'processed': 458, 'skipped': 93, 'errors': 0, 'total_trees': 12337}

#### GeoJSON labels to YOLO format

In [60]:
from glob import glob
import rasterio
import yaml

# Input folders: image chips and the per-tile GeoJSON labels.
chips_dir = Path("../data/chips")
labels_dir = Path("../data/labels")

# Output folders for the YOLO dataset; the parent is created before the child.
yolo_dir = Path("../data/yolo")
yolo_labels_dir = yolo_dir / "labels"
yolo_dir.mkdir(exist_ok=True)
yolo_labels_dir.mkdir(exist_ok=True)

# Sorted so the label files are always processed in the same order.
label_files = list(labels_dir.glob("*.geojson"))
label_files.sort()


In [61]:
# Number of GeoJSON label files found in labels_dir.
len(label_files)

458

In [62]:
# Build the species -> class-id table. Ids follow the sorted order of the
# species names, so the mapping is the same on every run.
# Missing species values are dropped first: sorted() cannot order NaN/None
# against strings and would raise TypeError.
classes = sorted(trees_data['species_mapped'].dropna().unique())
class_to_id = {cls: idx for idx, cls in enumerate(classes)}

print("Class mapping:")
for cls, idx in class_to_id.items():
    print(f"  {idx}: {cls}")

Class mapping:
  0: Banana
  1: Coconut
  2: Mango
  3: Papaya


In [63]:
def geojson_to_yolo(geojson_path, image_path, class_mapping):
    """Convert one tile's GeoJSON labels to YOLO lines using the chip's georeferencing.

    Each feature's axis-aligned bounding box is expressed relative to the
    georeferenced extent of the image, giving lines of the form
    ``class_id x_center y_center width height`` with all four coordinates
    normalised to ``[0, 1]``. x is measured from the left edge and y from the
    top edge of the image.

    The extent is taken from the raster's ``bounds``, which assumes a
    north-up (unrotated) image.

    Args:
        geojson_path: Path to the GeoJSON file with the tile's features. The
            class is read from each feature's ``species_mapped`` attribute.
        image_path: Path to the georeferenced raster chip for the same tile.
        class_mapping: Mapping from species name to integer class id.

    Returns:
        List of YOLO label lines. Features are left out when their species is
        not in ``class_mapping``, when they have no geometry, or when their
        box has no area inside the image.

    Raises:
        ValueError: If the image extent has zero or negative width/height.
    """
    trees = gpd.read_file(geojson_path)

    with rasterio.open(image_path) as src:
        tile_minx, tile_miny, tile_maxx, tile_maxy = src.bounds
        raster_crs = src.crs

    tile_width = tile_maxx - tile_minx
    tile_height = tile_maxy - tile_miny
    if tile_width <= 0 or tile_height <= 0:
        raise ValueError(f"Image has a degenerate georeferenced extent: {image_path}")

    # The image bounds are in the raster's CRS; bring the labels into that CRS
    # so both sets of coordinates use the same units before normalising.
    if raster_crs is not None and trees.crs is not None:
        trees = trees.to_crs(raster_crs.to_wkt())

    def _clamp_unit(value):
        """Limit a normalised coordinate to the [0, 1] range."""
        return max(0.0, min(1.0, value))

    yolo_lines = []
    for _, tree in trees.iterrows():
        # Skip features whose species has no class id rather than silently
        # labelling them as class 0.
        class_id = class_mapping.get(tree.get('species_mapped'))
        if class_id is None:
            continue

        geometry = tree.geometry
        if geometry is None or geometry.is_empty:
            continue

        tree_minx, tree_miny, tree_maxx, tree_maxy = geometry.bounds

        # Clamp the box *edges* to the image, then derive centre and size from
        # them, so a box crossing the border keeps its visible part intact.
        left = _clamp_unit((tree_minx - tile_minx) / tile_width)
        right = _clamp_unit((tree_maxx - tile_minx) / tile_width)
        # Image rows grow downwards while map y grows upwards, so vertical
        # positions are measured from the top edge (tile_maxy).
        top = _clamp_unit((tile_maxy - tree_maxy) / tile_height)
        bottom = _clamp_unit((tile_maxy - tree_miny) / tile_height)

        width_norm = right - left
        height_norm = bottom - top
        if width_norm <= 0 or height_norm <= 0:
            # Nothing of this box lies inside the image (or it is a sliver).
            continue

        x_center_norm = (left + right) / 2
        y_center_norm = (top + bottom) / 2

        yolo_lines.append(f"{class_id} {x_center_norm:.6f} {y_center_norm:.6f} {width_norm:.6f} {height_norm:.6f}")

    return yolo_lines

In [64]:
# Convert every label file that has a matching chip image; label files
# without a chip are only counted.
converted, skipped = 0, 0

for label_file in label_files:
    stem = label_file.stem
    image_file = chips_dir / f"{stem}.tif"

    if image_file.exists():
        yolo_lines = geojson_to_yolo(label_file, image_file, class_to_id)

        # One .txt per chip, named after the chip, one box per line.
        yolo_file = yolo_labels_dir / f"{stem}.txt"
        yolo_file.write_text('\n'.join(yolo_lines))

        converted += 1
    else:
        skipped += 1

In [65]:
# Dataset description written next to the YOLO labels: dataset root, image
# folders (relative to the root) and the class-id -> name table, which is the
# inverse of class_to_id.
# NOTE(review): 'train' and 'val' point at the same folder, so validation would
# run on the training images — confirm this is intended.
# NOTE(review): images are in ../data/chips while the labels were written to
# ../data/yolo/labels; verify that the training tool pairs images and labels
# across these two folders.
data_config = dict(
    path=str(Path('../data').absolute()),
    train='chips',
    val='chips',
    names={class_id: species for species, class_id in class_to_id.items()},
)

config_file = Path("../data/yolo/config.yaml")
with config_file.open('w') as f:
    # sort_keys=False keeps the keys in the order given above.
    yaml.dump(data_config, f, sort_keys=False)
