# Workflow to map IDs of `TGF` and `PRMS-TGF`

Since the polygon IDs provided by the `PRMS` team differ from those of the [`TGF`](https://www.sciencebase.gov/catalog/item/5efcbb9582ce3fd7e8a5b9ea) dataset, a correspondence between the polygon IDs of the two datasets is needed. The following workflow attempts to provide the relevant information:

## Importing necessary libraries

In [1]:
import xarray as xr # version 2023.2.0
import pandas as pd # version 1.5.3
import numpy as np # version 1.24.2
import geopandas as gpd # version 0.12.2
import networkx as nx # version 3.1

import matplotlib.pyplot as plt # version 3.7.0

import fiona # version 1.9.2

from typing import (
    Dict,
    Tuple,
    Union,
    List,
)

from collections.abc import (
    Callable,
)

### Defining paths and preparing objects

In [2]:
# defining paths
# TGF geodatabase - downloaded from https://www.sciencebase.gov/catalog/item/5d967365e4b0c4f70d113923
tgf_path = '/Users/kasrakeshavarz/Documents/geospatial-data/NHM_TGF/TGF.gdb/'
# PRMS sub-basin (nhru) shapefile - obtained from Jamie Kolodinsky (jamie.kolodinsky@ec.gc.ca)
prms_cat_path = '/Users/kasrakeshavarz/Documents/hydrological-models/20230605_gm_byHWobs_st_mary_and_milk_rivers/GIS/model_nhru.shp'
# PRMS river-segment (nsegment) shapefile, stored in the same GIS folder as `prms_cat_path`
# NOTE(review): the comment previously attached to this path read
# "slope values - downloaded from https://www.sciencebase.gov/catalog/item/5efcbb9582ce3fd7e8a5b9ea",
# which does not match the file referenced below - confirm the provenance
prms_riv_path = '/Users/kasrakeshavarz/Documents/hydrological-models/20230605_gm_byHWobs_st_mary_and_milk_rivers/GIS/model_nsegment.shp'

# list the layers available in the TGF geodatabase
fiona.listlayers(tgf_path)

['POIs', 'waterbodies', 'nhru', 'nsegment']

In [3]:
# load the four vector layers used throughout the workflow
## TGF geodatabase: river segments ('nsegment') first, then sub-basins ('nhru')
tgf_riv, tgf_cat = (
    gpd.read_file(tgf_path, driver='FileGDB', layer=layer_name)
    for layer_name in ('nsegment', 'nhru')
)
## PRMS shapefiles: river segments first, then sub-basins
prms_riv, prms_cat = (
    gpd.read_file(shp_path)
    for shp_path in (prms_riv_path, prms_cat_path)
)

## Necessary functions

In [4]:
# ChatGPT prompt: Calculate the centroid of each polygon in two given
#                 shapefiles and find the closest polygon ID from the
#                 second shapefile for each polygon in the first
#                 shapefile using parallel processing based on
#                 centroid values.

import geopandas as gpd
from joblib import Parallel, delayed

# Work on copies so that the helper column added below does not end up
# in the original `prms_cat` / `tgf_cat` objects
shapefile1 = prms_cat.copy()
shapefile2 = tgf_cat.copy()

# Centroid-to-centroid distances are only meaningful when both layers are
# expressed in the same coordinate reference system; when both CRSs are
# defined and differ, reproject the TGF copy onto the PRMS CRS *before*
# the centroids are computed
if (
    shapefile1.crs is not None
    and shapefile2.crs is not None
    and shapefile1.crs != shapefile2.crs
):
    shapefile2 = shapefile2.to_crs(shapefile1.crs)

# Calculate the centroid of each polygon in both layers
shapefile1['centroid'] = shapefile1.centroid
shapefile2['centroid'] = shapefile2.centroid

# Function to find closest polygon ID
def find_closest_polygon(row1, shapefile2):
    """Return the ID of the polygon in `shapefile2` with the nearest centroid.

    Parameters
    ----------
    row1 : mapping-like (e.g. pandas.Series)
        One record of the first layer. It must provide a ``'centroid'``
        entry exposing a ``distance(other)`` method and an ``'nhm_id'``
        entry.
    shapefile2 : pandas.DataFrame or geopandas.GeoDataFrame
        Candidate polygons; must contain the columns ``'centroid'`` and
        ``'hru_id_nhm'``.

    Returns
    -------
    tuple
        ``(row1['nhm_id'], closest_id)`` where `closest_id` is the
        ``'hru_id_nhm'`` value of the nearest candidate (the first one in
        row order when several are equally close), or ``None`` when
        `shapefile2` has no rows.
    """
    origin = row1['centroid']
    closest_id = None
    closest_distance = float('inf')
    # Walk only the two columns that are needed rather than `iterrows()`,
    # which builds a full Series for every candidate row
    for candidate_centroid, candidate_id in zip(
        shapefile2['centroid'], shapefile2['hru_id_nhm']
    ):
        distance = origin.distance(candidate_centroid)
        # strict comparison keeps the earliest candidate on ties
        if distance < closest_distance:
            closest_distance = distance
            closest_id = candidate_id

    return row1['nhm_id'], closest_id

## Printing outputs

In [5]:
# Fan the per-polygon nearest-centroid search out over all available
# cores (n_jobs=-1); one task is submitted per row of `shapefile1`
closest_ids = Parallel(n_jobs=-1)(
    delayed(find_closest_polygon)(prms_row, shapefile2)
    for _, prms_row in shapefile1.iterrows()
)



In [6]:
# Tabulate the (PRMS id, TGF id) pairs and index the table by the PRMS id,
# which sits in positional column 0
df_ids = pd.DataFrame(closest_ids).set_index(0)
# name the single remaining column
df_ids.columns = ['TGF_nhm_id']
# attach the TGF ids to the PRMS polygons by aligning on the PRMS id index,
# then expose that index as the `prms_nhm_id` column
shp = gpd.GeoDataFrame(
    pd.concat([prms_cat.set_index('nhm_id'), df_ids], axis=1)
).reset_index(names='prms_nhm_id')

In [7]:
# save the shapefile if necessary
# NOTE(review): the column 'prms_nhm_id' is 11 characters long, while ESRI
# Shapefile field names hold at most 10 - presumably it is truncated on
# write; confirm the field name in the written file before relying on it
shp.to_file('./map_ids.shp')

  shp.to_file('./map_ids.shp')


In [8]:
# alias for pandas' IndexSlice helper; not needed for the selection below,
# kept defined in case other cells refer to `idx`
idx = pd.IndexSlice

# select the two columns that map polygon IDs between the two objects;
# the columns form a flat (non-MultiIndex) index, so a plain list of
# labels is the idiomatic selector
df = shp.loc[:, ['prms_nhm_id', 'TGF_nhm_id']]

In [9]:
# save the mapping data as a .csv file
# (`index` is left at its default, so the row index is written as the
# first column in addition to the two ID columns)
df.to_csv('./map_ids.csv')