In [None]:
# BLANK

import pathlib
from datetime import datetime
import time

import torch
from torch import cuda

import glob
import json
import os
import shutil
import sys
import ast
import random
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import skimage.io as io
from shapely import Polygon
from matplotlib import pyplot as plt
from PIL import Image
from torch import cuda
import supervision as sv
from ultralytics import YOLO
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm, trange
from importlib import reload

# Clone FTCNN repo here: https://www.github.com/joeletho/FTCNN.git

# Cloned repo directory
sys.path.append("path/to/ftcnn")

import ftcnn


In [None]:
# BLANK

# Record the interpreter version used for this run alongside the notebook output.
print(sys.version)

In [None]:
# BLANK

# Re-execute the already-imported ftcnn package so edits to the cloned repo are
# picked up without restarting the kernel; printing shows which module object
# (and file location) is now loaded.
# NOTE(review): importlib.reload only re-executes the module it is given, not
# submodules that were already imported — confirm this is sufficient for ftcnn.
print(reload(ftcnn))

In [None]:
# BLANK

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
has_gpu = cuda.is_available()
device = torch.device('cuda') if has_gpu else torch.device('cpu')

print(device)
if has_gpu:
    # Show the name of the CUDA device at index 0.
    print(cuda.get_device_name(0))


#### Example directory structure:
```
Root
  ├── FTCNN_YOLO
  │         ├── datasets
  │         └── models
  ├── NDVI
  ├── QGIS
  ├── Readme.txt
  ├── Shapefiles
  ...
```

In [None]:
# BLANK
# Registry of every path (and a few plain names) used by the notebook.
# Later cells add more entries to this same dict.
path_map = {
    # Change this to your project path
    'ROOT': Path("path/to/project/root"),
    'PROJECT_NAME': 'FTCNN_YOLO',
}

# The project folder lives directly under the root and holds the datasets and
# models sub-folders.
path_map['FTCNN'] = path_map['ROOT'] / path_map["PROJECT_NAME"]
path_map.update(
    FTCNN_DS=path_map['FTCNN'] / 'datasets',
    FTCNN_MODELS=path_map['FTCNN'] / 'models',
)

In [None]:
# BLANK
def make_directories(paths_map, verbose=True, exist_ok=False):
    """Resolve every ``Path`` in ``paths_map`` in place and create the directories it needs.

    Values that are not ``Path`` instances (e.g. a model-name string) are left
    untouched. Each ``Path`` value is replaced by its resolved form, then:

    * an existing file is left alone;
    * a path that has a file suffix (``data.yaml``, ``labels.csv``, ``x.shp``)
      and is not already a directory is treated as a file that has not been
      written yet: only its parent directory is created, so the file name
      stays free for whoever writes it later;
    * anything else is created as a directory.

    Args:
        paths_map: Mapping of name -> value; mutated in place.
        verbose: Print each directory that is created plus start/end banners.
        exist_ok: Forwarded to ``Path.mkdir`` for directory entries. Parent
            directories of file entries are always created with
            ``exist_ok=True`` because several files typically share one parent.

    Raises:
        FileExistsError: A directory entry already exists and ``exist_ok`` is
            false.

    Note:
        File-vs-directory detection is based on ``Path.suffix``; a directory
        whose name contains a dot (e.g. ``v1.2``) that does not exist yet is
        therefore treated as a file and only its parent is created.
    """
    if verbose:
        print("Creating directory structure")
    for name, path in paths_map.items():
        if not isinstance(path, Path):
            continue

        path = path.resolve()
        # Only the value of an existing key is replaced, which is safe while
        # iterating over the dict.
        paths_map[name] = path

        if path.is_file():
            continue

        if path.suffix and not path.is_dir():
            # Looks like a file that does not exist yet: make room for it
            # instead of creating a directory under the file's own name.
            target = path.parent
            target.mkdir(parents=True, exist_ok=True)
        else:
            target = path
            target.mkdir(parents=True, exist_ok=exist_ok)

        if verbose:
            print('  ', target)
    if verbose:
        print("Complete")

def make_project_paths(root,*, verbose=True, exist_ok=False):    
    """Register the project-wide paths in ``path_map`` and create them on disk.

    The NDVI raster folder and the shapefile folder are located under ``root``;
    the dataset metadata folders are located under the global
    ``path_map['FTCNN_DS']``, which must already be set.

    Args:
        root: Project root directory.
        verbose: Forwarded to ``make_directories``.
        exist_ok: Forwarded to ``make_directories``.
    """
    shape_files = Path(root, 'Shapefiles')
    ds_meta = path_map['FTCNN_DS'] / 'meta'

    paths = {
        'NDVI': Path(root, 'NDVI', 'NDVI Difference Rasters'),
        'SHAPE_FILES': shape_files,
        'FTCNN_DS_META': ds_meta,
        'FTCNN_DS_CSV': ds_meta / 'csv',
        'FTCNN_DS_SHP': ds_meta / 'shp',
        # Data
        'PRED_SHP': shape_files / 'ModelPredictions',
        'SHPZ10_SHP': shape_files / 'Treatments_UTMz10_Only_08-18-24' / 'Treatments_UTMz10_Only_08-18-24.shp',
        'SHPZ11_SHP': shape_files / 'Treatments_UTMz11_Only_08-18-24' / 'Treatments_UTMz11_Only_08-18-24.shp',
    }

    # Publish the entries to the global registry first, then hand the local
    # mapping to make_directories.
    path_map.update(paths)
    make_directories(paths, verbose=verbose, exist_ok=exist_ok)

        
def make_dataset_paths(ds_root, models_root, model_name, *,verbose=True, exist_ok=False):
    """Register all per-model dataset paths in ``path_map`` and create them on disk.

    Everything for one dataset lives under ``ds_root / model_name``; the model
    output folder is ``models_root / model_name``. After the directories are
    made, the per-zone prediction shapefile paths are added to ``path_map``
    (this requires ``path_map['PRED_SHP']`` to be set already).

    Args:
        ds_root: Directory that holds all datasets.
        models_root: Directory that holds all models.
        model_name: Name of this dataset/model; used as the folder name.
        verbose: Forwarded to ``make_directories``.
        exist_ok: Forwarded to ``make_directories``.
    """
    ds_root, models_root = Path(ds_root), Path(models_root)

    model_ds = ds_root / model_name
    meta = model_ds / 'meta'
    images = model_ds / 'images'
    labels = model_ds / 'labels'

    paths = {
        'MODEL_NAME': model_name,
        'FTCNN_MODEL': models_root / model_name,
        'FTCNN_DS_MODEL': model_ds,
        'FTCNN_DS_MODEL_META': meta,
        'FTCNN_DS_MODEL_SHP': meta / 'shp',
        'FTCNN_DS_MODEL_CSV': meta / 'csv',
        # Same 'meta' folder as FTCNN_DS_MODEL_META, registered under a second name.
        'FTCNN_DS_DATA': meta,
        'FTCNN_DS_CONFIG_FILE': model_ds / 'config' / 'data.yaml',
        'FTCNN_DS_YOLO_DATA_FILE': meta / 'yolo_ndvi_ds.csv',
    }

    # Images and labels
    paths.update({
        'FTCNN_DS_IMAGES': images,
        'FTCNN_DS_LABELS': labels,
        'FTCNN_DS_LABELS_GENERATED': labels / 'generated',
        'FTCNN_DS_CHIPS': images / 'chips',
        'FTCNN_DS_PNGS': images / 'png',
        'FTCNN_DS_TIFS': images / 'tif',
        'FTCNN_DS_IMAGES_TRAIN': images / 'train',
        'FTCNN_DS_IMAGES_TEST': images / 'test',
        'FTCNN_DS_IMAGES_VAL': images / 'val',
        'FTCNN_DS_LABELS_TRAIN': labels / 'train',
        'FTCNN_DS_LABELS_TEST': labels / 'test',
        'FTCNN_DS_LABELS_VAL': labels / 'val',
    })

    # Metadata, zone 10
    paths.update({
        'CSVZ10': meta / 'Treatments_UTMz10.csv',
        'CSVZ10_NORM': meta / 'Treatments_UTMz10_normalized.csv',
        'CSVZ10_CLEANED': meta / 'Treatments_UTMz10_normalized_cleaned.csv',
        'CSVZ10_CHIPPED': meta / 'Treatments_UTMz10_normalized_chipped.csv',
        'CSVZ10_CHIP_LABELS_UTM': meta / 'Treatments_z10utm_chip_labels.csv',
        'CSVZ10_CHIP_LABELS_PIXEL': meta / 'Treatments_z10pixel_chip_labels.csv',
        'CSVZ10_CHIP_LABELS_PIXEL_ENCODED': meta / 'Treatments_z10pixel_chip_labels_encoded.csv',
        'CSVZ10_CHIP_LABELS_PREYOLO': meta / 'Treatments_z10pixel_chip_labels_encoded_preyolo.csv',
    })

    # Metadata, zone 11
    paths.update({
        'CSVZ11': meta / 'Treatments_UTMz11.csv',
        'CSVZ11_NORM': meta / 'Treatments_UTMz11_normalized.csv',
        'CSVZ11_CLEANED': meta / 'Treatments_UTMz11_normalized_cleaned.csv',
        'CSVZ11_CHIPPED': meta / 'Treatments_UTMz11_normalized_chipped.csv',
        'CSVZ11_CHIP_LABELS_UTM': meta / 'Treatments_z11utm_chip_labels.csv',
        'CSVZ11_CHIP_LABELS_PIXEL': meta / 'Treatments_z11pixel_chip_labels.csv',
        'CSVZ11_CHIP_LABELS_PIXEL_ENCODED': meta / 'Treatments_z11pixel_chip_labels_encoded.csv',
        'CSVZ11_CHIP_LABELS_PREYOLO': meta / 'Treatments_z11pixel_chip_labels_encoded_preyolo.csv',
    })

    # Publish to the global registry, then create the folders.
    path_map.update(paths)
    make_directories(paths, verbose=verbose, exist_ok=exist_ok)

    # Prediction shapefiles live in the project-level predictions folder and
    # are named after the model.
    for zone_key, stem in (('SHPZ10_PRED_SHP', 'Treatmentsz10'), ('SHPZ11_PRED_SHP', 'Treatmentsz11')):
        path_map[zone_key] = path_map['PRED_SHP'] / f"{stem}_{paths['MODEL_NAME']}.shp"

# Build the project-level folders now; exist_ok=True lets this cell be re-run
# against a tree that already exists.
make_project_paths(path_map['ROOT'], exist_ok=True)

In [None]:
# BLANK
# Class encoder function
def classify(row):
    """Return the ``(class_id, class_name)`` pair for one row.

    A row counts as ``(0, "Treatment")`` only when its ``'geometry'`` entry is
    present, non-empty and has an area greater than 1; every other row is
    ``(-1, "Background")``.
    """
    background = (-1, "Background")

    geom = row.get('geometry')
    if geom is None or geom.is_empty:
        return background
    if geom.area > 1:
        return (0, "Treatment")
    return background

In [None]:
# BLANK

# Dataset configuration settings
# One dataset is generated for every combination of
# CHIP_SIZE x TREATMENTS x consecutive YEARS pair x SPLIT_MODE x SHUFFLE_SPLIT.
CHIP_SIZE = [320, 640]
YEARS=[2019, 2020, 2021, 2022, 2023]
SPLIT=0.75
SPLIT_MODE=['all', 'collection']
BACKGROUND_BIAS=1.0
SHUFFLE_SPLIT=[False, True]
SHUFFLE_BACKGROUND=True
TREATMENTS = [0,1,2,3,4,5,6,7]  # 0 selects every treatment type except 8

# Datasets with fewer images than this are discarded as not viable for training.
MIN_IMAGES = 40

# Root directory the datasets will live in when they are used for training;
# written into each dataset's YAML config. Defaults to where they are created.
# Change this if the datasets are moved to another machine/directory.
# NOTE(review): the original referenced an undefined SOME_OTHER_PATH here.
DATASET_DEPLOY_ROOT = Path(path_map['FTCNN_DS']).resolve()

# Error list
errors = []

# Consecutive (start, end) year pairs, e.g. (2019, 2020), (2020, 2021), ...
year_pairs = list(zip(YEARS, YEARS[1:]))

# Configure TQDM progress bar
total_updates = (
    len(CHIP_SIZE) *
    len(TREATMENTS) *
    len(year_pairs) *
    len(SPLIT_MODE) *
    len(SHUFFLE_SPLIT)
    )
root_pbar = trange(total_updates)
updates = 0

# Load the master shapefile once; every iteration works on its own filtered copy.
master_shpz10 = ftcnn.io.load_shapefile(path_map['SHPZ10_SHP'])

# Get chip size
for size in CHIP_SIZE:
    # Get treatment
    for treatment in TREATMENTS:
        treatment_label = 'all' if treatment == 0 else str(treatment)
        # Get years
        for years in year_pairs:
            # Declare the path of the base files used for this dataset
            BASE_FILEPATH = Path(f'base_years={years[0]}to{years[1]}', 'Treatments_UTMz10_Only_08-18-24')
            base_csv = path_map['FTCNN_DS_CSV'] / BASE_FILEPATH.with_suffix('.csv')
            base_shp = path_map['FTCNN_DS_SHP'] / BASE_FILEPATH.with_suffix('.shp')
            # Get split mode
            for mode in SPLIT_MODE:
                # Get split flag
                for shuffle_split in SHUFFLE_SPLIT:

                    # Update pbar message
                    root_pbar.set_description(f"Creating dataset {updates+1}: T: {treatment_label}, Y: {years[0]},{years[1]}, SM: {mode}, S: {SPLIT}, SS: {shuffle_split}, B: {BACKGROUND_BIAS}, SB: {SHUFFLE_BACKGROUND}")

                    # Create the model name and create its repository.
                    # The name encodes every setting passed to create() below.
                    # round() avoids float truncation (e.g. 0.57*100 -> 56.99...).
                    model_name = (
                        f"yolo_treatments={treatment_label}"
                        f"_years={years[0]}to{years[1]}"
                        f"_chipsz={size if size is not None else 'Default'}"
                        f"_split={round(SPLIT*100)}"
                        f"_mode={mode}"
                        f"_shuffle-split={shuffle_split}"
                        f"_bg={str(BACKGROUND_BIAS).replace('.','_')}"
                        f"{'_shuffle-bg=True' if SHUFFLE_BACKGROUND else ''}"
                    )
                    path_map['MODEL_NAME'] = model_name
                    make_dataset_paths(
                        path_map['FTCNN_DS'],
                        path_map['FTCNN_MODELS'],
                        model_name,
                        verbose=False,
                        exist_ok=True
                    )

                    # Select the rows for this treatment. The explicit copy
                    # lets the rename below modify the subset without pandas'
                    # SettingWithCopyWarning and without touching the master.
                    if treatment == 0:
                        # All treatments
                        shpz10 = master_shpz10[master_shpz10['TreatmentT'] != 8].copy()
                    else:
                        # Individual treatment
                        shpz10 = master_shpz10[master_shpz10['TreatmentT'] == treatment].copy()

                    # Rename these rows to align filenames that are parsed later
                    shpz10.loc[shpz10['Subregion'] == "Humboldt", "Subregion"] = 'Humboldt4'

                    # Save the files
                    ftcnn.io.save_as_csv(shpz10, base_csv, exist_ok=True)
                    ftcnn.io.save_as_shp(shpz10, base_shp, exist_ok=True)

                    try:
                        # Make the dataset using the base shapefile
                        yolo_ds = ftcnn.datasets.YOLONDVIDifferenceDataset.create(
                                source = base_shp,
                                images_dir = path_map['NDVI'],
                                output_dir = path_map['FTCNN_DS_MODEL'],
                                year_start_column = "StartYear",
                                year_end_column = "EndYear",
                                geometry_column = "geometry",
                                years = years,
                                background = None,
                                background_ratio = BACKGROUND_BIAS,
                                # NOTE(review): assumes create() accepts the split mode
                                # as a string ('all' / 'collection'). The original passed
                                # DatasetSplitMode.All, a name that is not imported in
                                # this notebook, and ignored `mode` — TODO confirm
                                # against the ftcnn API.
                                split = mode,
                                split_ratio = SPLIT,
                                shuffle_split = shuffle_split,
                                shuffle_background = SHUFFLE_BACKGROUND,
                                generate_labels = True,
                                generate_train_data = True,  # True/False
                                tile_size = size,
                                translate_xy = True,  # True/False
                                class_encoder = classify,  # None or callback(row)
                                exist_ok = True,  # True/False
                                clear_output_dir = True,  # True/False
                                save_shp = True,  # True/False
                                save_gpkg = True,  # True/False
                                save_csv = True,  # True/False
                                pbar_leave = True,  # True/False
                                convert_to_png = True,
                                use_segments = True,
                                num_workers = 8,
                            )
                        if len(yolo_ds.images) < MIN_IMAGES:
                            # If the size is too small the dataset encounters issues so we limit it to a
                            # size that may provide a decent number of images for training.
                            # Raising here routes the dataset through the cleanup below.
                            raise ValueError("Too few images to be viable dataset")

                        # (Optional) Change the root path of the dataset to the target directory where it will be used later on
                        yolo_ds.generate_yaml_file(
                            root_abs_path=DATASET_DEPLOY_ROOT / model_name,
                            dest_abs_path=path_map['FTCNN_DS_MODEL'] / 'config',
                        )
                    except Exception as e:
                        # Deliberately broad: one failed configuration must not stop the sweep.
                        # Append the error message and remove the created files.
                        errors.append(f"{path_map['MODEL_NAME']}: {e}")
                        # ignore_errors: the directory may already be gone, and a
                        # cleanup failure should not abort the remaining datasets.
                        shutil.rmtree(path_map['FTCNN_DS_MODEL'], ignore_errors=True)

                    # Update the pbar and the update counter
                    root_pbar.update()
                    updates += 1

root_pbar.set_description(f"Dataset completed with {len(errors)} errors.")
root_pbar.refresh()
root_pbar.close()

if len(errors) > 0:
    print("The following errors occurred:\n", "\n".join(errors), file=sys.stderr)


#### Errors
You may see warnings that resemble `RuntimeWarning: Value '/path/to/some/image.tif' of field path has been truncated to 254 characters.  This warning will not be emitted any more for that layer.`
This happens because the file path stored for the image is longer than a Shapefile field can hold, so the value is truncated when the Shapefile is written. These warnings are safe to ignore unless you need to retain the full value of that field.