In [1]:
import os
import geopandas as gpd
import logging
import logging.config
import json

# import parallelization packages
import parsl
from parsl import python_app
from parsl.config import Config
from parsl.channels import LocalChannel
from parsl.executors import HighThroughputExecutor
from parsl.providers import LocalProvider

# PDG packages
import pdgstaging # used within the class we create called `StagedTo3DConverter`
import pdgraster
import py3dtiles

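# note: init_parsl() repeats the parsl imports inside its own body, but the top-level `from parsl import python_app` above is still required for the @python_app decorators used later in this notebook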

In [2]:
# Module-level logger, named after this module. StagedTo3DConverter reports its
# progress, warnings, and errors through it, which is helpful for debugging.
logger = logging.getLogger(__name__)

### Define a class that orchestrates viz-3dtiles classes and communicates information between them

- `__init__` is the method Python calls automatically whenever an object of a class is created; it only exists within classes
- `__init__` plays the role of the "constructor method" known from C++ and Java
- functions defined within a class are called methods
- only `__init__` runs automatically when an object of class `StagedTo3DConverter` is created; the other methods become available on the object but are not executed
- in summary, this means that when an object of class `StagedTo3DConverter` is created, the configuration (a dict or a path to a .json file) is automatically loaded and stored on that object
- the methods that are defined after `__init__` do not run automatically; they must be called deliberately

In [3]:
class StagedTo3DConverter():
    """
        Processes staged vector data into Cesium 3D tiles according to the
        settings in a config file or dict. This class acts as the orchestrator
        of the other viz-3dtiles classes, and coordinates the sending and
        receiving of information between them.
    """

    # NOTE(review): `box`, `TreeGenerator` and `BoundingVolumeRegion` are used
    # by the methods below but are not imported in any visible cell of this
    # notebook. Presumably they come from shapely.geometry and viz_3dtiles;
    # confirm and add the imports to the import cell, otherwise these methods
    # fail with NameError.

    def __init__(
        self,
        config
    ):
        """
            Initialize the converter by loading the configuration and building
            the tile path manager from it.

            Parameters
            ----------
            config : dict or str
                A dictionary of configuration settings or a path to a config
                JSON file. (See help(pdgstaging.ConfigManager))

            Notes
            ----------
            - this method does not do the staging or tiling steps
        """

        self.config = pdgstaging.ConfigManager(config)
        self.tiles = pdgstaging.TilePathManager(
            **self.config.get_path_manager_config())

    def all_staged_to_3dtiles(
        self
    ):
        """
            Process all staged vector tiles into 3D tiles. This is simply a
            loop that calls staged_to_3dtile() on every file in the staged
            directory.
        """

        # Get the list of staged vector tiles
        paths = self.tiles.get_filenames_from_dir('staged')
        # Process each tile
        for path in paths:
            self.staged_to_3dtile(path)

    def staged_to_3dtile(self, path):
        """
            Convert a staged vector tile into a B3DM tile file and a matching
            JSON tileset file.
            - the B3DM tile is applied to the PDG portal for visualization
              purposes
            - the JSON serves as the metadata for that tile

            Parameters
            ----------
            path : str
                The path to the staged vector tile.

            Returns
            -------
            tile, tileset : Cesium3DTile, Tileset
                The Cesium3DTiles and Cesium3DTileset objects, or None when no
                tile was created (the vector tile is empty, becomes empty
                after filtering/deduplication, or an error occurred and was
                logged).
        """

        try:

            # Get information about the tile from the path
            tile = self.tiles.tile_from_path(path)
            out_path = self.tiles.path_from_tile(tile, '3dtiles')

            tile_bv = self.bounding_region_for_tile(tile)  # bv = bounding volume

            # Get the filename of the tile WITHOUT the extension
            tile_filename = os.path.splitext(os.path.basename(out_path))[0]
            # Get the base of the path, without the filename
            tile_dir = os.path.dirname(out_path) + os.path.sep

            # Log the event
            logger.info(
                f'Creating 3dtile from {path} for tile {tile} to {out_path}.')

            # Read in the staged vector tile
            gdf = gpd.read_file(path)

            # Summary of following steps:
            # 1. check if the tile has any data to start with
            # 2. drop polygons whose centroid is outside the tile boundary
            # 3. check if the tile has any data left after that filter
            # 4. deduplicate overlapping polygons if configured to do so here
            # 5. check if the tile has any data left after deduplication
            # 6. if any step raised an error, log it for debugging

            # Check if the gdf is empty
            if len(gdf) == 0:
                logger.warning(
                    f'Vector tile {path} is empty. 3D tile will not be'
                    ' created.')
                return None

            # Remove polygons with centroids that are outside the tile boundary
            prop_cent_in_tile = self.config.polygon_prop(
                'centroid_within_tile')
            gdf = gdf[gdf[prop_cent_in_tile]]

            # The centroid filter can remove every polygon, so re-check before
            # handing an empty frame to deduplication / the tile generator
            if len(gdf) == 0:
                logger.warning(
                    f'Vector tile {path} has no polygons with centroids'
                    ' within the tile. 3D tile will not be created.')
                return None

            # Check if deduplication should be performed
            dedup_here = self.config.deduplicate_at('3dtiles')
            dedup_method = self.config.get_deduplication_method()

            # Deduplicate if required
            if dedup_here and (dedup_method is not None):
                dedup_config = self.config.get_deduplication_config(gdf)
                dedup = dedup_method(gdf, **dedup_config)
                gdf = dedup['keep']

                # The tile could theoretically be empty after deduplication
                if len(gdf) == 0:
                    logger.warning(
                        f'Vector tile {path} is empty after deduplication.'
                        ' 3D Tile will not be created.')
                    return None

            # Create & save the b3dm file
            ces_tile, ces_tileset = TreeGenerator.leaf_tile_from_gdf(
                gdf,
                dir=tile_dir,
                filename=tile_filename,
                z=self.config.get('z_coord'),
                geometricError=self.config.get('geometricError'),
                tilesetVersion=self.config.get('version'),
                boundingVolume=tile_bv
            )

            return ces_tile, ces_tileset

        except Exception:
            # Best-effort: one bad tile must not stop the batch. Use
            # logger.exception so the traceback is kept in the log instead of
            # only the exception message.
            logger.exception(f'Error creating 3D Tile from {path}.')
            return None

    def parent_3dtiles_from_children(self, tiles, bv_limit=None):
        """
            Create parent Cesium 3D Tileset json files that point to
            the child JSON files in the tile tree hierarchy.

            Parameters
            ----------
            tiles : list of morecantile.Tile
                The list of tiles to create parent tiles for.
            bv_limit : list of float, optional
                West, south, east, north coordinates that the bounding region
                of each parent tile is limited to. Passed through to
                bounding_region_for_tile() as `limit_to`.

            Returns
            -------
            tileset_objs : list
                One object per input tile, as returned by
                TreeGenerator.parent_tile_from_children_json.
        """

        tile_manager = self.tiles
        config_manager = self.config

        tileset_objs = []

        # Make the next level of parent tiles
        for parent_tile in tiles:
            # Get the path to the parent tile
            parent_path = tile_manager.path_from_tile(parent_tile, '3dtiles')
            # Get just the base dir without the filename
            parent_dir = os.path.dirname(parent_path)
            # Get the filename of the parent tile, without the extension
            parent_filename = os.path.basename(parent_path)
            parent_filename = os.path.splitext(parent_filename)[0]
            # Get the children paths for this parent tile
            child_paths = tile_manager.get_child_paths(parent_tile, '3dtiles')
            # Remove paths that do not exist
            child_paths = tile_manager.remove_nonexistent_paths(child_paths)
            # Get the parent bounding volume
            parent_bv = self.bounding_region_for_tile(
                parent_tile, limit_to=bv_limit)
            # Get the version
            version = config_manager.get('version')
            # Get the geometric error
            geometric_error = config_manager.get('geometricError')
            # Create the parent tile
            tileset_obj = TreeGenerator.parent_tile_from_children_json(
                child_paths,
                dir=parent_dir,
                filename=parent_filename,
                geometricError=geometric_error,
                tilesetVersion=version,
                boundingVolume=parent_bv
            )
            tileset_objs.append(tileset_obj)

        return tileset_objs

    def bounding_region_for_tile(self, tile, limit_to=None):
        """
        For a morecantile.Tile object, return the bounding region of the tile
        expressed in the CRS given by BoundingVolumeRegion.CESIUM_EPSG.

        Parameters
        ----------
        tile : morecantile.Tile
            The tile object.
        limit_to : list of float
            Optional list of west, south, east, north coordinates (in the CRS
            of the tile matrix set) to limit the bounding region to.

        Returns
        -------
        region_bv : dict
            A dict with the keys 'west', 'south', 'east' and 'north'.
        """
        tms = self.tiles.tms
        bounds = tms.bounds(tile)
        bounds = gpd.GeoSeries(
            box(bounds.left, bounds.bottom, bounds.right, bounds.top),
            crs=tms.crs)
        if limit_to is not None:
            bounds_limitor = gpd.GeoSeries(
                box(limit_to[0], limit_to[1], limit_to[2], limit_to[3]),
                crs=tms.crs)
            # NOTE(review): if the tile does not overlap `limit_to` the
            # intersection is an empty geometry; confirm what total_bounds
            # yields in that case and whether callers can cope with it.
            bounds = bounds.intersection(bounds_limitor)
        bounds = bounds.to_crs(BoundingVolumeRegion.CESIUM_EPSG)
        # total_bounds is ordered (minx, miny, maxx, maxy)
        bounds = bounds.total_bounds

        region_bv = {
            'west': bounds[0], 'south': bounds[1],
            'east': bounds[2], 'north': bounds[3],
        }
        return region_bv


### Create function to configure parsl

Use this approach, setting up a `HighThroughputExecutor`, instead of the `kubernetes` approach.

In [4]:
def init_parsl(
    worker_init='source /home/jcohen/.bashrc; conda activate pdgviz',
    cores_per_worker=2,
    max_workers=2,
):
    """
    Load a local parsl configuration built around a HighThroughputExecutor.

    Any parsl configuration that is currently loaded is cleared first, so this
    function can be re-run from the notebook.

    Parameters
    ----------
    worker_init : str
        Shell command run before workers start. The default activates the
        `pdgviz` conda environment of the original author; pass your own
        command when running on another machine or account.
    cores_per_worker : int
        Passed to HighThroughputExecutor as `cores_per_worker`.
    max_workers : int
        Passed to HighThroughputExecutor as `max_workers`.

    Returns
    -------
    None
    """
    # Imports are repeated here so the function is self-contained
    import parsl
    from parsl.config import Config
    from parsl.channels import LocalChannel
    from parsl.executors import HighThroughputExecutor
    from parsl.providers import LocalProvider

    htex_local = Config(
        executors=[
            HighThroughputExecutor(
                # label this executor instance as "htex_local"
                label="htex_local",
                # do not enable worker debug logging
                worker_debug=False,
                cores_per_worker=cores_per_worker,
                max_workers=max_workers,
                # run locally on this machine, as opposed to a slurm scheduler
                provider=LocalProvider(
                    channel=LocalChannel(),
                    init_blocks=1,
                    max_blocks=10,
                    # shell command that prepares the worker environment
                    worker_init=worker_init
                ),
            )
        ],
    )
    # Clear any configuration that is already loaded, then load this one
    parsl.clear()
    parsl.load(htex_local)

### Set up parsl app to create leaf 3d Cesium tiles in parallel

In [5]:
print("Defining function to create leaf 3d Cesium tiles from staged tiles in parallel.")

@python_app
def create_leaf_3dtiles(staged_paths, config, logging_dict=None):
    """
    Create a batch of leaf 3d tiles from staged vector tiles.

    Parameters
    ----------
    staged_paths : list of str
        Paths of the staged vector tiles in this batch.
    config : dict or str
        Workflow configuration, passed to StagedTo3DConverter.
    logging_dict : dict, optional
        Logging configuration applied with logging.config.dictConfig before
        any work is done.

    Returns
    -------
    tilesets : list
        The tileset object of every tile that was created. Tiles that were
        skipped or failed are not included.
    """
    # StagedTo3DConverter is defined earlier in this notebook rather than
    # imported from pdg_workflow.

    # Import unconditionally. `import logging.config` binds `logging` as a
    # local name of this function, so importing it only inside the `if` left
    # `logging` unbound (UnboundLocalError) whenever logging_dict was not given.
    import logging.config
    if logging_dict:
        logging.config.dictConfig(logging_dict)
    converter3d = StagedTo3DConverter(config)
    tilesets = []
    for path in staged_paths:
        try:
            result = converter3d.staged_to_3dtile(path)
        except Exception:
            # Keep going with the rest of the batch; keep the traceback
            logging.exception(f'Error creating 3d tile from {path}')
            continue
        # staged_to_3dtile returns None when no tile was created (empty tile
        # or an error it already logged), so there is nothing to unpack
        if result is None:
            continue
        _, ces_tileset = result
        tilesets.append(ces_tileset)
    return tilesets

Defining function to create leaf 3d Cesium tiles from staged tiles in parallel.


### Set up parsl app to create parent 3d Cesium tiles in parallel

In [6]:
print("Defining function to create parent 3d Cesium tiles in parallel.")

@python_app
def create_parent_3dtiles(tiles, config, limit_bv_to=None, logging_dict=None):
    """
    Create a batch of cesium 3d tileset parent files that point to child
    tilesets.

    Parameters
    ----------
    tiles : list
        The tiles to create parent tilesets for.
    config : dict or str
        Workflow configuration, passed to StagedTo3DConverter.
    limit_bv_to : list of float, optional
        Passed to StagedTo3DConverter.parent_3dtiles_from_children as the
        bounding-volume limit.
    logging_dict : dict, optional
        Logging configuration applied with logging.config.dictConfig before
        any work is done.

    Returns
    -------
    The return value of StagedTo3DConverter.parent_3dtiles_from_children.
    """
    # StagedTo3DConverter is defined earlier in this notebook rather than
    # imported from pdg_workflow.

    # Import unconditionally. `import logging.config` binds `logging` as a
    # local name of this function, so importing it only inside the `if` made
    # the logging.info call below fail with UnboundLocalError whenever
    # logging_dict was not given.
    import logging.config
    if logging_dict:
        logging.config.dictConfig(logging_dict)
    logging.info(f'Creating parent 3d tiles for {len(tiles)} tiles')
    converter3d = StagedTo3DConverter(config)
    return converter3d.parent_3dtiles_from_children(tiles, limit_bv_to)

Defining function to create parent 3d Cesium tiles in parallel.


### Create function to batch the input data into the user's specified size 

In [7]:
def make_batch(items, batch_size):
    """
    Create batches of a given size from a list of items.

    Parameters
    ----------
    items : sequence
        Anything that supports len() and slicing (list, tuple, str, ...).
    batch_size : int
        Maximum number of items per batch. Must be at least 1.

    Returns
    -------
    list
        Consecutive slices of `items`; the last one may be shorter than
        `batch_size`. An empty input gives an empty list.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1. Without this check a negative size
        would silently return no batches and drop every item.
    """
    if batch_size < 1:
        raise ValueError(
            f'batch_size must be a positive integer, got {batch_size!r}')
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

### Setup logging configuration for debugging

In [8]:
def setup_logging(log_json_file):
    """
    Read a JSON logging configuration from disk and apply it.

    The parsed configuration is handed to logging.config.dictConfig and then
    returned, so the same dict can be passed on to other processes.
    """
    with open(log_json_file, 'r') as config_file:
        settings = json.loads(config_file.read())
    logging.config.dictConfig(settings)
    return settings

### Define objects that will be used throughout parallelization:
- filepath to configuration .json file
- filepath for logging for debugging
- batch sizes for:
    - staging
    - rasterization
    - creating 3d tiles (both parents and leafs)
    - creating geotiffs
    - web tiles 

In [11]:
# Paths to the workflow and logging configuration files. They default to the
# original development locations; set the PDG_WORKFLOW_CONFIG and
# PDG_LOGGING_CONFIG environment variables to run this notebook elsewhere
# without editing it.
workflow_config = os.environ.get(
    'PDG_WORKFLOW_CONFIG',
    '/home/jcohen/lake_change_sample/ingmar-config__updated.json')
logging_config = os.environ.get(
    'PDG_LOGGING_CONFIG',
    '/home/jcohen/lake_change_sample/logging.json')

# Number of items handed to a single parallel task, per workflow step
batch_size_staging = 1
batch_size_rasterization = 30
batch_size_3dtiles = 20  # used to batch the staged paths for create_leaf_3dtiles
batch_size_parent_3dtiles = 500
batch_size_geotiffs = 200
batch_size_web_tiles = 200

In [12]:
# setup_logging (defined earlier in this notebook) reads the JSON logging
# configuration file, applies it with logging.config.dictConfig, and returns
# the parsed configuration dict
logging_dict = setup_logging(logging_config)

## Questions
1. Are the 3D tiles referenced in the object `batch_size_3dtiles` the leaf tiles? I assumed so, because the other object (`batch_size_parent_3dtiles`) specifies that it is for parent 3D tiles.
2. What is the difference between 3D tiles and web tiles?

### Define the following:
- stager (didn't we already stage the files?)
- rasterizer (didn't we already rasterize the files?)
- 3d tiler
- tile manager (??)
- config manager (??)
- minimum zoom level
- maximum zoom level
- range of zoom levels (??)

In [None]:
# One instance of each workflow component, all built from the same config file
stager = pdgstaging.TileStager(workflow_config)
tiles3dmaker = StagedTo3DConverter(workflow_config)
rasterizer = pdgraster.RasterTiler(workflow_config)

# Reuse the tile path manager and config manager that the stager holds
tile_manager, config_manager = stager.tiles, stager.config

# Lowest and highest zoom level, as reported by the config manager
min_z, max_z = config_manager.get_min_z(), config_manager.get_max_z()

# Every zoom level above the highest one, in descending order, ending at min_z
# (inclusive). Example: max_z=11, min_z=0 gives 10, 9, ..., 1, 0.
parent_zs = range(max_z - 1, min_z - 1, -1)

### Get paths for staged tiles in the staged directory, then batch them

In [None]:
staged_paths = stager.tiles.get_filenames_from_dir('staged')
staged_batches = make_batch(staged_paths, batch_size_3dtiles)

# A parsl configuration must be loaded before any @python_app is invoked
init_parsl()

# Submit one app call per batch. Each call returns a future immediately, so
# keep them instead of discarding them. logging_dict is passed so the workers
# log with the same configuration as this notebook.
leaf_futures = [
    create_leaf_3dtiles(batch, workflow_config, logging_dict)
    for batch in staged_batches
]

# Block until every batch has finished; .result() re-raises any error that
# happened in a worker, so failures are visible here
leaf_tilesets = [future.result() for future in leaf_futures]