## Overall Workflow (without parallelization)

This document outlines the PDG workflow, essentially parsing [this](https://github.com/PermafrostDiscoveryGateway/viz-workflow/blob/parsl-workflow/pdg_workflow/pdg_workflow.py) workflow but omitting parallelization.

In [1]:
import os

# visualization
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import box

# PDG packages
import pdgstaging
import pdgraster
import py3dtiles
import viz_3dtiles
from viz_3dtiles import TreeGenerator, BoundingVolumeRegion
#import pdgpy3dtiles
#from StagedTo3DConverter import StagedTo3DConverter

# logging and configuration
from datetime import datetime
import logging
import logging.config
import argparse
import json



Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


### Step 1: Define variables and configuration files

In [2]:
# input data: sample of lake change data from Ingmar
input = gpd.read_file('/home/pdg/data/nitze_lake_change/data_sample_2022-09-09/32607/05_Lake_Dataset_Raster_02_final/lake_change.gpkg')

workflow_config = '/home/jcohen/lake_change_sample/ingmar-config.json'
logging_config = '/home/jcohen/lake_change_sample/logging.json'

# the following values are the defaults in the custom function `run_pdg_workflow()`
batch_size_staging=1
batch_size_rasterization=30
batch_size_3dtiles=20
batch_size_parent_3dtiles=500
batch_size_geotiffs=200
batch_size_web_tiles=200

# track events that happen as software is executed, helpful for debugging, this came from StagedTo3DConverter.py
logger = logging.getLogger(__name__)

In [3]:
def setup_logging(log_json_file):
    """
    Setup logging configuration
    """
    with open(log_json_file, 'r') as f:
        logging_dict = json.load(f)
    logging.config.dictConfig(logging_dict)
    return logging_dict

# if __name__ == "__main__":

#     parser = argparse.ArgumentParser(
#         description='Run the PDG visualization workflow.')
#     parser.add_argument('-c', '--config',
#                         help='Path to the pdg-viz configuration JSON file.',
#                         default='config.json',
#                         type=str)
#     parser.add_argument('-l', '--logging',
#                         help='Path to the logging configuration JSON file.',
#                         default='logging.json',
#                         type=str)
#     args = parser.parse_args()

logging_dict = setup_logging(logging_config)

### Step 2: define custom class and methods to be used throughout workflow

##### Define a class that orchestrates viz-3dtiles classes and communicates information between them

- `__init__` is used to create an object from a class, it is only used within classes
- using `__init__` is the "constructor method" that is derived from C++ and Java
- functions defined within a class are called methods
- using `__init__` results in the methods being automatically applied to any object that is created of the class `StagedTo3DConverter`
- essentially, this means that when an object of class `StagedTo3DConverter` is created, the configuration .json file is automatically applied to that object
- the methods that are defined after this initiation happens do not automatically occur, they must be deliberately applied, which we do later in the workflow by running `class.method()` to create the ceisum 3d files from the satged directory

In [4]:
class StagedTo3DConverter():
    """
        Processes staged vector data into Cesium 3D tiles according to the
        settings in a config file or dict. This class acts as the orchestrator
        of the other viz-3dtiles classes, and coordinates the sending and
        receiving of information between them.
    """

    def __init__(
        self,
        config
    ):
        """
            Automatically initialize the StagedTo3DConverter class by appying the configuration when an object of that class is created.

            Parameters
            ----------
            self : need to explicitly state this parameter to pass any newly created object of class StagedTo3DConverter to the other paraneter (config)
                this is a python syntax requirement in order for the object to persist of this class

            config : dict or str
                A dictionary of configuration settings or a path to a config
                JSON file. (See help(pdgstaging.ConfigManager))

            Notes
            ----------
            - this function does not do the staging or tiling steps
        """

        self.config = pdgstaging.ConfigManager(config)
        self.tiles = pdgstaging.TilePathManager(
            **self.config.get_path_manager_config())

    def all_staged_to_3dtiles(
        self
    ):
        """
            Process all staged vector tiles into 3D tiles. This is simply a loop that iterates the function staged_to_rdtile() over all files in the staged directory.
        """

        # Get the list of staged vector tiles
        paths = self.tiles.get_filenames_from_dir('staged')
        # Process each tile
        for path in paths:
            self.staged_to_3dtile(path)

    def staged_to_3dtile(self, path):
        """
            Convert a staged vector tile into a B3DM tile file and a matching
            JSON tileset file.
            - the B3DM tile is applied to the PDG portal for visualization purposes
            - the JSON serves as the metadata for that tile

            Parameters
            ----------
            path : str
                The path to the staged vector tile.

            Returns
            -------
            tile, tileset : Cesium3DTile, Tileset
                The Cesium3DTiles and Cesium3DTileset objects
        """

        try:
            
            # Get information about the tile from the path
            tile = self.tiles.tile_from_path(path)
            out_path = self.tiles.path_from_tile(tile, '3dtiles')

            tile_bv = self.bounding_region_for_tile(tile) # bv = bounding volumne

            # Get the filename of the tile WITHOUT the extension
            tile_filename = os.path.splitext(os.path.basename(out_path))[0]
            # Get the base of the path, without the filename
            tile_dir = os.path.dirname(out_path) + os.path.sep

            # Log the event
            logger.info(
                f'Creating 3dtile from {path} for tile {tile} to {out_path}.')

            # Read in the staged vector tile
            gdf = gpd.read_file(path)

            # Summary of following steps:
            # Now that we have the path to the staged vector tile esptablished and logged, 
            # the following checks are executed on each staged vector tile:
            # 1. check if the tile has any data to start with
            # 2. check if the centroid of the polygons within the tile are within the tile boundaries, remove if not
            # 3. check if polygons within the tile overlap, deduplicate them if they do
            # 4. check if the tile has any data left if deduplication was executed
            # 5. if there were errors in the above steps, log that for debugging

            
            # Check if the gdf is empty
            if len(gdf) == 0:
                logger.warning(
                    f'Vector tile {path} is empty. 3D tile will not be'
                    ' created.')
                return

            # Remove polygons with centroids that are outside the tile boundary
            prop_cent_in_tile = self.config.polygon_prop(
                'centroid_within_tile')
            gdf = gdf[gdf[prop_cent_in_tile]]

            # Check if deduplication should be performed
            dedup_here = self.config.deduplicate_at('3dtiles')
            dedup_method = self.config.get_deduplication_method()

            # Deduplicate if required
            if dedup_here and (dedup_method is not None):
                dedup_config = self.config.get_deduplication_config(gdf)
                dedup = dedup_method(gdf, **dedup_config)
                gdf = dedup['keep']

                # The tile could theoretically be empty after deduplication
                if len(gdf) == 0:
                    logger.warning(
                        f'Vector tile {path} is empty after deduplication.'
                        ' 3D Tile will not be created.')
                    return

            # Create & save the b3dm file
            ces_tile, ces_tileset = TreeGenerator.leaf_tile_from_gdf(
                gdf,
                dir=tile_dir,
                filename=tile_filename,
                z=self.config.get('z_coord'),
                geometricError=self.config.get('geometricError'),
                tilesetVersion=self.config.get('version'),
                boundingVolume=tile_bv
            )

            return ces_tile, ces_tileset

        except Exception as e:
            logger.error(f'Error creating 3D Tile from {path}.')
            logger.error(e)

    def parent_3dtiles_from_children(self, tiles, bv_limit=None):
        """
            Create parent Cesium 3D Tileset json files that point to
            of child JSON files in the tile tree hierarchy.

            Parameters
            ----------
            tiles : list of morecantile.Tile
                The list of tiles to create parent tiles for.
        """

        tile_manager = self.tiles
        config_manager = self.config

        tileset_objs = []

        # Make the next level of parent tiles
        for parent_tile in tiles:
            # Get the path to the parent tile
            parent_path = tile_manager.path_from_tile(parent_tile, '3dtiles')
            # Get just the base dir without the filename
            parent_dir = os.path.dirname(parent_path)
            # Get the filename of the parent tile, without the extension
            parent_filename = os.path.basename(parent_path)
            parent_filename = os.path.splitext(parent_filename)[0]
            # Get the children paths for this parent tile
            child_paths = tile_manager.get_child_paths(parent_tile, '3dtiles')
            # Remove paths that do not exist
            child_paths = tile_manager.remove_nonexistent_paths(child_paths)
            # Get the parent bounding volume
            parent_bv = self.bounding_region_for_tile(
                parent_tile, limit_to=bv_limit)
            # If the bounding region is outside t
            # Get the version
            version = config_manager.get('version')
            # Get the geometric error
            geometric_error = config_manager.get('geometricError')
            # Create the parent tile
            tileset_obj = TreeGenerator.parent_tile_from_children_json(
                child_paths,
                dir=parent_dir,
                filename=parent_filename,
                geometricError=geometric_error,
                tilesetVersion=version,
                boundingVolume=parent_bv
            )
            tileset_objs.append(tileset_obj)

        return tileset_objs

    def bounding_region_for_tile(self, tile, limit_to=None):
        """
        For a morecantile.Tile object, return a BoundingVolumeRegion object
        that represents the bounding region of the tile.

        Parameters
        ----------
        tile : morecantile.Tile
            The tile object.
        limit_to : list of float
            Optional list of west, south, east, north coordinates to limit
            the bounding region to.

        Returns
        -------
        bv : BoundingVolumeRegion
            The bounding region object.
        """
        tms = self.tiles.tms
        bounds = tms.bounds(tile)
        bounds = gpd.GeoSeries(
            box(bounds.left, bounds.bottom, bounds.right, bounds.top),
            crs=tms.crs)
        if limit_to is not None:
            bounds_limitor = gpd.GeoSeries(
                box(limit_to[0], limit_to[1], limit_to[2], limit_to[3]),
                crs=tms.crs)
            bounds = bounds.intersection(bounds_limitor)
        bounds = bounds.to_crs(BoundingVolumeRegion.CESIUM_EPSG)
        bounds = bounds.total_bounds

        region_bv = {
            'west': bounds[0], 'south': bounds[1],
            'east': bounds[2], 'north': bounds[3],
        }
        return region_bv

### Step 3: Configuring the stager, raster tiler, and 3D tiler

In [5]:
# staging configuration
stager = pdgstaging.TileStager(workflow_config)
tile_manager = stager.tiles
config_manager = stager.config

# zoom levels configuration
min_z = config_manager.get_min_z()
max_z = config_manager.get_max_z()
parent_zs = range(max_z - 1, min_z - 1, -1)

# 3D tiler configuration
tiles3dmaker = StagedTo3DConverter(workflow_config)

# raster tilerconfiguration 
rasterizer = pdgraster.RasterTiler(workflow_config)

### Step 4: Stage the input files

By staging the files using the stager we configured with the .json script, we created a `staged` directory of the tiles in a deliberate hierarchial structure. Each layer of the directory is as follows:
- **staged**: base folder for all tiles (lakes)
- **WorldCRS84Quad**: the tile matrix set grid, which is in geographic coordinates, allowing the tiles to appear square when represented on the 3D Globe on the PDG web portal (in Cesium format)
- **11**: style (number of zoom levels, the "z-range")
- **numbered subfolders, for example 406-481**: tile matrix (x)
- **numbered tiles, for example 228.gpkg:** tile column (y)

See [here](https://github.com/PermafrostDiscoveryGateway/viz-staging/blob/main/docs/tile_path_structure.md) for a schematic of this hierarchial directory

In [None]:
# if there are many input files, this would be a different process with batching

# only 1 gpkg file to input for this sample dataset:
#stager.stage(input)
# we comment out that line because we already did that step to create the staged folder

### Step 5: Deduplicate and rasterize tiles in staged directory

- This step rasterizes all staged tiles, and does not require an input path because the `staged` directory is defined in the .json configuration.
- deduplicating the staged files needs to happen again when we create the 3d tiles, because those also pull from the staged directory, rather than the rasterized files that are deduplicated in the following code

In [None]:
#rasterizer.rasterize_all()
# we comment out that line because we already did that step to create the geotiff and web_tiles directories
# with the sample data, this step took 1.5 hours because no parallelization has been implemented yet (parsl will be integrated in time!)

### Step 6: Create parent geotiffs for all z-levels (except highest)

In [6]:
# got this function def from line 350 in pdg_worflow.py

def create_composite_geotiffs(tiles, config, logging_dict=None):
    """
    Make composite geotiffs (step 3)
    """
    import pdgraster
    if logging_dict:
        import logging.config
        logging.config.dictConfig(logging_dict)
    rasterizer = pdgraster.RasterTiler(config) 
    return rasterizer.parent_geotiffs_from_children(tiles, recursive=False)

In [7]:
for z in parent_zs: 

    # Determine which tiles we need to make for the next z-level based on the
    # path names of the files just created
    child_paths = tile_manager.get_filenames_from_dir('geotiff', z=z + 1)
    parent_tiles = set()
    for child_path in child_paths:
        parent_tile = tile_manager.get_parent_tile(child_path)
        parent_tiles.add(parent_tile)
    parent_tiles = list(parent_tiles)

    # composite_geotiffs = []
    # for parent_tile in parent_tiles:
    #     composite_geotiff = create_composite_geotiffs(
    #         parent_tiles, workflow_config, logging_dict)
    #     composite_geotiffs.append(composite_geotiff)

    # [a.result() for a in composite_geotiffs]

    create_composite_geotiffs(tiles = parent_tiles, config = workflow_config, logging_dict = logging_dict)   
            

        


### Step 7: Create web tiles from geotiffs


In [8]:
rasterizer.update_ranges()

In [9]:
geotiff_paths = tile_manager.get_filenames_from_dir('geotiff')
len(geotiff_paths)
geotiff_paths[7767]


'geotiff/WorldCRS84Quad/0/0/0.tif'

In [10]:
def create_web_tiles(geotiff_paths, config, logging_dict=None):
    """
    Create a batch of webtiles from geotiffs (step 4)
    """
    import pdgraster
    if logging_dict:
        import logging.config
        logging.config.dictConfig(logging_dict)
    rasterizer = pdgraster.RasterTiler(config)
    return rasterizer.webtiles_from_geotiffs(
        geotiff_paths, update_ranges=False)

In [11]:
create_web_tiles(geotiff_paths, workflow_config, logging_dict)

  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (max_val - min_val))
  (255 / (

In [None]:
# Process web tiles NOT in batches
#rasterizer.webtiles_from_geotiffs(geotiff_paths, update_ranges=False)

### Step 8: Deduplicate and make leaf 3D tiles all staged tiles (only highest z-level)

In [12]:
staged_paths = stager.tiles.get_filenames_from_dir('staged')

In [None]:
#converter3d = StagedTo3DConverter(workflow_config)
#converter3d

In [None]:
# def create_leaf_3dtiles(staged_paths, config, logging_dict=None):
#     """
#     Create a batch of leaf 3d tiles from staged vector tiles
#     """
#     #from pdg_workflow import StagedTo3DConverter
#     if logging_dict:
#         import logging.config
#         logging.config.dictConfig(logging_dict)
#     #converter3d = StagedTo3DConverter(config)
#     #tilesets = []
#     for path in staged_paths:
#         tiles3dmaker.staged_to_3dtile(path) # tiles3dmaker = converter3d if converter3d = StagedTo3DConverter(workflow_config)

In [13]:
def create_leaf_3dtiles(staged_paths, config, logging_dict=None):
    """
    Create a batch of leaf 3d tiles from staged vector tiles
    """
    #from pdg_workflow import StagedTo3DConverter
    if logging_dict:
        import logging.config
        logging.config.dictConfig(logging_dict)
    converter3d = StagedTo3DConverter(config)
    tilesets = []
    for path in staged_paths:
        ces_tile, ces_tileset = converter3d.staged_to_3dtile(path) # tiles3dmaker = converter3d if converter3d = StagedTo3DConverter(workflow_config)
        tilesets.append(ces_tileset)
    return tilesets
    


In [14]:
create_leaf_3dtiles(staged_paths = staged_paths, config = workflow_config, logging_dict = logging_dict)

TypeError: cannot unpack non-iterable NoneType object

In [15]:
# alternative to running the function defined above, create_leaf_3dtiles():
tiles3dmaker.all_staged_to_3dtiles()

### Step 9: Create parent Cesium 3D tilesets for all z-levels (except highest)

In [16]:
max_z_tiles = [tile_manager.tile_from_path(path) for path in staged_paths]
# get the total bounds for all the tiles
max_z_bounds = [tile_manager.get_bounding_box(tile) for tile in max_z_tiles]
# get the total bounds for all the tiles
polygons = [box(bounds['left'],
                bounds['bottom'],
                bounds['right'],
                bounds['top']) for bounds in max_z_bounds]
max_z_bounds = gpd.GeoSeries(polygons, crs=tile_manager.tms.crs)

bound_volume_limit = max_z_bounds.total_bounds

In [17]:
for z in parent_zs:

    # Determine which tiles we need to make for the next z-level based on the
    # path names of the files just created
    all_child_paths = tiles3dmaker.tiles.get_filenames_from_dir('3dtiles', z=z + 1)

    parent_tiles = set()
    for child_path in all_child_paths:
        parent_tile = tile_manager.get_parent_tile(child_path)
        parent_tiles.add(parent_tile)
    parent_tiles = list(parent_tiles)

In [None]:
all_child_paths

In [18]:
def create_parent_3dtiles(tiles, config, limit_bv_to=None, logging_dict=None):
    """
    Create a batch of cesium 3d tileset parent files that point to child
    tilesets
    """
    #from pdg_workflow import StagedTo3DConverter
    if logging_dict:
        import logging.config
        logging.config.dictConfig(logging_dict)
    converter3d = StagedTo3DConverter(config)
    return converter3d.parent_3dtiles_from_children(tiles, limit_bv_to)

In [19]:
create_parent_3dtiles(parent_tiles, workflow_config, bound_volume_limit, logging_dict)

[]