In [1]:
### WLP_Salmon_Spawning_Survey_DataJoinSummary_v1.py
### Version: 3/2/2022
### Author: Khem So, khem_so@fws.gov, (503) 231-6839
### Abstract: This Python 3 script pulls data from the WLP Salmon Spawning Survey ArcGIS Online feature service, exports the raw tables to a backup Excel workbook, and performs joins and merges to produce a combined Excel workbook for a single survey year.

In [2]:
import arcpy
import pandas as pd
from arcgis import GIS
import time, os, fnmatch, shutil
import openpyxl

In [3]:
### ArcGIS Online stores date-time information in UTC by default. This function uses the pytz package to convert time zones and can be used to convert from UTC ("UTC") to localized time. For example, localized "US/Pacific" is either Pacific Standard Time UTC-8 or Pacific Daylight Time UTC-7 depending upon time of year.
from datetime import datetime
from pytz import timezone
def change_timezone_of_field(df, source_date_time_field, new_date_time_field_suffix, source_timezone, new_timezone):
    """Adds a new field to *df* holding *source_date_time_field* converted to *new_timezone*.

    The DataFrame is modified in place and nothing is returned:
    - *source_date_time_field* is made timezone-aware. Timezone-naive values are localized to *source_timezone*; a field that is already timezone-aware is left unchanged.
    - A new field named *source_date_time_field* + *new_date_time_field_suffix* is created with the same instants expressed in *new_timezone*.

    : param df: The name of the spatially enabled or pandas DataFrame containing datetime fields
    : param source_date_time_field: The name of the datetime field whose timezone is to be changed
    : param new_date_time_field_suffix: Suffix appended to the end of the name of the source datetime field. This is used to create the new date time field name.
    : param source_timezone: The name of the source timezone
    : param new_timezone: The name of the converted timezone. For possible values, see https://gist.github.com/heyalexej/8bf688fd67d7199be4a1682b3eec7568
    """
    source_values = df[source_date_time_field]
    # tz_localize only accepts timezone-naive data and raises TypeError otherwise.
    # Skipping it for an already-aware field makes a repeated call on the same field safe.
    if source_values.dt.tz is None:
        source_values = source_values.dt.tz_localize(source_timezone)
        df[source_date_time_field] = source_values
    # Define the name of the new date time field
    new_date_time_field = source_date_time_field + new_date_time_field_suffix
    # Convert the datetime in the source_date_time_field to the new timezone in a new field called new_date_time_field
    df[new_date_time_field] = source_values.dt.tz_convert(new_timezone)

In [4]:
### This function converts Python datetime64 fields to %m/%d/%Y %H:%M:%S %Z%z format
def archive_dt_field(df):
    """Rewrites, in place, every field of *df* whose data type is 'datetime64[ns, UTC]' or 'datetime64[ns, US/Pacific]' as text in %m/%d/%Y %H:%M:%S %Z%z format for archiving to Excel. Other fields are left as they are and nothing is returned.
    : param df: The name of the spatially enabled or pandas DataFrame containing datetime fields
    """
    archive_format = '%m/%d/%Y %H:%M:%S %Z%z'
    tz_aware_dtypes = ['datetime64[ns, UTC]', 'datetime64[ns, US/Pacific]']
    # Resolve the matching column names up front, then overwrite each column with its formatted text
    tz_aware_columns = list(df.select_dtypes(include=tz_aware_dtypes).columns)
    for column_name in tz_aware_columns:
        df[column_name] = df[column_name].dt.strftime(archive_format)

In [5]:
### Allow authentication via login to U.S. Fish & Wildlife Service ArcGIS Online account via ArcGIS Pro
# `gis` is the connection object used below to fetch the feature service item by its id.
# NOTE(review): presumably "pro" reuses the portal sign-in of the local ArcGIS Pro installation - confirm against the arcgis GIS documentation.
gis = GIS("pro")

In [6]:
### Enter year of interest
# uncomment next line to use ArcGIS interface, otherwise hard coding year
# year = arcpy.GetParameterAsText(0)
# Kept as a string: it is compared with strftime('%Y') output when filtering surveys and is concatenated into the export file name.
year = "2021"

In [7]:
### Enter path for local file saving
# uncomment next line to use ArcGIS interface, otherwise hard coding out_workspace
# out_workspace = arcpy.GetParameterAsText(1)
# Folder that receives both Excel workbooks written by this notebook (combined with the file names via os.path.join).
# NOTE(review): absolute, user-specific path - edit before running on another machine.
out_workspace = "C:/Users/kso/Desktop/"

In [8]:
### Create timestamp for file naming
# Local time at the moment this cell runs, formatted as YYYY-MM-DD_HHMM; it is appended to both export file names.
t = time.localtime()
timestamp = time.strftime('%Y-%m-%d_%H%M', t)

In [9]:
### Paths to ArcGIS Online data
# To populate Service ItemId, go to Feature Service webpage and in bottom right corner, click on the View link.
# Current Feature Service webpage: https://fws.maps.arcgis.com/home/item.html?id=758626eec0fc4bc1a72b4e4c9bd1023c
# Despite its name, ServiceItemID holds the object returned by gis.content.get for that id, not the id string itself.
ServiceItemID = gis.content.get("758626eec0fc4bc1a72b4e4c9bd1023c")

### There are separate methods for pulling spatial versus non-spatial data into Python. Spatial layers will become Spatially Enabled DataFrame objects. Non-spatial data will become regular pandas DataFrame objects.
## Define variables pointing to spatial layers
# NOTE(review): the layer order (0 = metadata, 1 = live fish, 2 = carcasses) is taken from the variable names - confirm against the feature service.
MetadataLyr = ServiceItemID.layers[0]
LiveFishLyr = ServiceItemID.layers[1]
CarcassLyr = ServiceItemID.layers[2]
## Create Spatially Enabled DataFrame objects
sedfMetadata = pd.DataFrame.spatial.from_layer(MetadataLyr)
sedfLiveFishLocation = pd.DataFrame.spatial.from_layer(LiveFishLyr)
sedfCarcassLocation = pd.DataFrame.spatial.from_layer(CarcassLyr)

## Define variables pointing to non-spatial (tabular) data
# NOTE(review): this table is addressed by a hard-coded REST URL rather than through ServiceItemID - confirm it belongs to the same feature service as the item above.
Observer = r"https://services.arcgis.com/QVENGdaPbd4LUkLV/arcgis/rest/services/service_c555c76424ca452d8dab8de4f8c25000/FeatureServer/3"

## Convert AGOL table to NumPy Array and then to pandas DataFrames
# Only the fields listed here are read from the Observer table.
naObserver = arcpy.da.TableToNumPyArray(Observer,["objectid","globalid","strFirstName","strLastName","parentglobalid","CreationDate","Creator","EditDate","Editor"])
dfObserver = pd.DataFrame(naObserver)

In [10]:
### Use change_timezone_of_field function to convert all datetime fields in each dataframe from UTC to Pacific within new field with _Pacific suffix
## A single loop over the four frames replaces four copies of the same loop.
for frame in (sedfMetadata, sedfLiveFishLocation, sedfCarcassLocation, dfObserver):
    # Collect the field names first because change_timezone_of_field adds new fields to the frame.
    # Only timezone-naive datetime64[ns] fields qualify, so fields already localized by a previous run are skipped.
    naive_datetime_cols = [col for col in frame.columns if frame[col].dtype == 'datetime64[ns]']
    for col in naive_datetime_cols:
        change_timezone_of_field(frame, col, "_Pacific", "UTC", "US/Pacific")

In [11]:
### Filter sedfMetadata by single year
## The year is taken from dtmDate_Pacific (local time) rather than the UTC dtmDate, because dtmDate_Pacific is the date that is exported.
## Filtering on UTC could file a survey recorded late on 31 December Pacific time under the following year.
## year is a string, so it is compared with the '%Y' text of each date.
sedfMetadataYYYY = sedfMetadata[sedfMetadata["dtmDate_Pacific"].dt.strftime('%Y') == year]

In [12]:
### Export raw data frames as backup
## Work on copies so the raw frames keep their datetime dtypes; cells that use the .dt accessor on them can then still be re-run after this cell.
backup_sheets = {
    'Metadata': sedfMetadata.copy(),
    'Live Fish': sedfLiveFishLocation.copy(),
    'Carcasses': sedfCarcassLocation.copy(),
    'Observers': dfObserver.copy(),
}

## Use archive_dt_field function to convert Python date time into format Excel can read more easily
for backup_frame in backup_sheets.values():
    archive_dt_field(backup_frame)

## Create export path for backup and write each frame to its own sheet of the Excel spreadsheet
backup_path = os.path.join(out_workspace, 'WLP_Salmon_Spawning_Survey_BKUP_' + timestamp + '.xlsx')
# The with-block saves and closes the workbook on exit; ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter(backup_path) as writer:
    for sheet_name, backup_frame in backup_sheets.items():
        backup_frame.to_excel(writer, sheet_name=sheet_name)

In [13]:
### Create dfObserver2 data frame with concatenated surveyor names grouped by parentglobalid
## Trim surrounding whitespace from both name fields
for name_field in ("strFirstName", "strLastName"):
    dfObserver[name_field] = dfObserver[name_field].str.strip()

## Build a single "First Last" field for each observer
dfObserver["strFullName"] = dfObserver["strFirstName"] + " " + dfObserver["strLastName"]

## Remove curly brackets from the text values of dfObserver so the GUIDs can be joined on
dfObserver = dfObserver.replace("{","", regex=True).replace("}","", regex=True)

## One row per survey (parentglobalid) holding a comma-separated list of observer full names
dfObserver2 = (
    dfObserver[["parentglobalid", "strFullName"]]
    .groupby("parentglobalid")
    .agg({"strFullName": ', '.join})
)

In [14]:
### Left-join sedfMetadataYYYY with dfObserver2 (observer names per survey)
# Every survey row is kept; strFullName is filled where dfObserver2 has a matching parentglobalid.
dfMetadataObserver = sedfMetadataYYYY.merge(
    dfObserver2,
    how="left",
    left_on="globalid",
    right_on="parentglobalid",
)

In [15]:
### Manipulate date/time fields in dfMetadataObserver
## Reduce dtmDate_Pacific to a date-only text value (time portion dropped)
dfMetadataObserver["dtmDate_Pacific"] = dfMetadataObserver["dtmDate_Pacific"].dt.strftime('%m/%d/%Y')

## Combine the survey date with the manually entered start and end times to get full timestamps (<field>_dt)
for time_field in ("dtmManualTimeStart", "dtmManualTimeEnd"):
    date_and_time = dfMetadataObserver["dtmDate_Pacific"] + " " + dfMetadataObserver[time_field]
    dfMetadataObserver[time_field + "_dt"] = pd.to_datetime(date_and_time, format="%m/%d/%Y %H:%M")

## Total survey time = end minus start, stored as text
survey_duration = dfMetadataObserver["dtmManualTimeEnd_dt"] - dfMetadataObserver["dtmManualTimeStart_dt"]
dfMetadataObserver["dtmManualTimeTotal"] = survey_duration.astype(str)

In [16]:
### Keep only the fields wanted in the export, in the order they should appear
metadata_export_fields = [
    "globalid",
    "strStream",
    "dtmDate_Pacific",
    "strFullName",
    "strTideStart",
    "strWeather",
    "dtmManualTimeStart",
    "dtmManualTimeTurn",
    "dtmManualTimeEnd",
    "dtmManualTimeTotal",
    "strStreamFlow",
    "strViewingConditions",
    "strViewingConditionsComments",
    "ysnLiveFish",
    "ysnCarcasses",
    "strComments",
]
dfMetadataObserver = dfMetadataObserver[metadata_export_fields]

In [17]:
## Inner-join dfMetadataObserver with sedfLiveFishLocation, keep the wanted fields in order, then sort
# Both inputs have a globalid field, so the merge suffixes them: globalid_x is the survey, globalid_y is the live fish record.
live_fish_fields = [
    'globalid_x', 'strStream', 'dtmDate_Pacific', 'ysnLiveFish',
    'globalid_y', 'strLiveSpecies', 'strLiveSex', 'ysnPairs', 'ysnReddBuilding',
    'intNumRedds', 'strLiveFishRedd', 'strReddID', 'SHAPE',
]
dfMetadataObserverLiveFish = (
    dfMetadataObserver
    .merge(sedfLiveFishLocation, how="inner", left_on="globalid", right_on="parentglobalid")
    .loc[:, live_fish_fields]
    .sort_values(by=["strStream", "dtmDate_Pacific"])
)
dfMetadataObserverLiveFish

Unnamed: 0,globalid_x,strStream,dtmDate_Pacific,ysnLiveFish,globalid_y,strLiveSpecies,strLiveSex,ysnPairs,ysnReddBuilding,intNumRedds,strLiveFishRedd,strReddID,SHAPE
30,2a9082d2-c328-4ff2-abbe-92f5e570beca,Chum Creek,10/29/2021,yes,f21bc354-54a0-4053-b842-d6836ba5fcba,Oncorhynchus keta,F,no,no,,,,"{'x': -123.9406076513336, 'y': 46.360168926898..."
31,2a9082d2-c328-4ff2-abbe-92f5e570beca,Chum Creek,10/29/2021,yes,cb55e069-b695-41a3-919a-d44596dcb116,Oncorhynchus keta,M,no,no,,,,"{'x': -123.9406843107998, 'y': 46.360182041407..."
32,2a9082d2-c328-4ff2-abbe-92f5e570beca,Chum Creek,10/29/2021,yes,422add0f-0c04-43d3-b5af-c9a1f7e7ef3f,Oncorhynchus keta,Unk,no,no,,,,"{'x': -123.94089342770867, 'y': 46.36016371366..."
33,2a9082d2-c328-4ff2-abbe-92f5e570beca,Chum Creek,10/29/2021,yes,ec493b7a-944a-46c5-815c-d89b6022a102,Oncorhynchus keta,Unk,no,no,,,,"{'x': -123.94107736322806, 'y': 46.36019102213..."
34,2a9082d2-c328-4ff2-abbe-92f5e570beca,Chum Creek,10/29/2021,yes,48fe1ec5-5e26-4cd0-a080-68f915e6b0c7,Oncorhynchus keta,M,yes,no,,,,"{'x': -123.94129174277744, 'y': 46.36032236049..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,7e583a8a-6d07-4a49-9adc-71a5d4697b4f,North Creek,11/17/2021,yes,3a447e78-fdc8-4da3-b989-9af0590209bd,Oncorhynchus kisutch,F,no,yes,,Live Fish,,"{'x': 0, 'y': 0, 'spatialReference': {'wkid': ..."
449,4a3f2536-f169-4858-8fb8-5a060f3c96d6,North Creek,11/29/2021,yes,6b44b7fb-b4d8-4bd9-a64d-9edfa70b527f,Oncorhynchus kisutch,Unk,no,no,,Live Fish,,"{'x': 0, 'y': 0, 'spatialReference': {'wkid': ..."
450,a33bab8a-c6eb-494c-9156-19cec4d5abf6,North Creek,12/16/2021,yes,322ed2da-d557-4553-adb3-fba40c600bfb,Oncorhynchus kisutch,F,no,no,,Live Fish,,"{'x': 0, 'y': 0, 'spatialReference': {'wkid': ..."
451,a33bab8a-c6eb-494c-9156-19cec4d5abf6,North Creek,12/16/2021,yes,23c81774-6e26-4e86-bce2-e345d6fb5623,Oncorhynchus kisutch,Unk,no,no,,Live Fish,,"{'x': 0, 'y': 0, 'spatialReference': {'wkid': ..."


In [18]:
## Inner-join dfMetadataObserver with sedfCarcassLocation, keep the wanted fields in order, then sort
# Both inputs have a globalid field, so the merge suffixes them: globalid_x is the survey, globalid_y is the carcass record.
carcass_fields = [
    'globalid_x', 'strStream', 'dtmDate_Pacific', 'ysnCarcasses',
    'globalid_y', 'strCarcassSpecies', 'strCarcassSex', 'strDecomposedFresh',
    'intNumCarcasses', 'ysnCountedLast', 'SHAPE',
]
dfMetadataObserverCarcasses = (
    dfMetadataObserver
    .merge(sedfCarcassLocation, how="inner", left_on="globalid", right_on="parentglobalid")
    .loc[:, carcass_fields]
    .sort_values(by=["strStream", "dtmDate_Pacific"])
)
dfMetadataObserverCarcasses

Unnamed: 0,globalid_x,strStream,dtmDate_Pacific,ysnCarcasses,globalid_y,strCarcassSpecies,strCarcassSex,strDecomposedFresh,intNumCarcasses,ysnCountedLast,SHAPE
0,2a9082d2-c328-4ff2-abbe-92f5e570beca,Chum Creek,10/29/2021,yes,ece3cc4a-8294-4c80-81a4-bc76e3066b12,Oncorhynchus keta,Unk,Decomposed,,no,"{'x': -123.94171459905576, 'y': 46.36027177949..."
14,b18399a6-4756-4f9a-8700-3bd39bcf24a7,Chum Creek,11/05/2021,yes,575195ae-8b60-4d0a-a2ad-c5b89534a7ce,Oncorhynchus keta,M,Fresh,1.0,no,"{'x': -123.94091809920552, 'y': 46.36004123088..."
15,b18399a6-4756-4f9a-8700-3bd39bcf24a7,Chum Creek,11/05/2021,yes,e180c4a5-5781-4884-9bd4-a2b400221c74,Oncorhynchus keta,M,Decomposed,1.0,no,"{'x': -123.94099283331896, 'y': 46.36007151952..."
89,1105e0ef-cd4d-4202-a195-a1337029aa7c,Chum Creek,11/16/2021,yes,631d0a7d-bb57-439a-9747-c182c48ec6a3,Oncorhynchus keta,F,Decomposed,2.0,no,"{'x': -123.94042512406746, 'y': 46.36007616326..."
90,1105e0ef-cd4d-4202-a195-a1337029aa7c,Chum Creek,11/16/2021,yes,93a01f5c-cc11-4f4c-907d-2c5f4c1fac21,Oncorhynchus keta,F,Fresh,1.0,no,"{'x': -123.94159108158456, 'y': 46.36020586411..."
...,...,...,...,...,...,...,...,...,...,...,...
219,474149bd-098f-4a5b-9c18-4c77bfa71a78,Lost Creek,12/14/2021,yes,36ebb70c-6c1f-4f80-ba86-a27bf9a47143,Oncorhynchus keta,Unk,Decomposed,1.0,yes,"{'x': -123.93921236381108, 'y': 46.35909084470..."
220,474149bd-098f-4a5b-9c18-4c77bfa71a78,Lost Creek,12/14/2021,yes,fa39e2cc-504c-4a38-b34b-57368cd7d746,Unk,Unk,Decomposed,2.0,yes,"{'x': -123.9380408773776, 'y': 46.358726501248..."
221,474149bd-098f-4a5b-9c18-4c77bfa71a78,Lost Creek,12/14/2021,yes,83bf224f-f75e-4044-b813-469bcbaac7a7,Unk,Unk,Decomposed,1.0,,"{'x': -123.9378167546076, 'y': 46.358710608907..."
222,474149bd-098f-4a5b-9c18-4c77bfa71a78,Lost Creek,12/14/2021,yes,0223f6fe-769b-4dfe-a261-f92bcda607ed,Oncorhynchus keta,M,Decomposed,1.0,yes,"{'x': -123.93637331650233, 'y': 46.35780280272..."


In [19]:
### Export data frames
## Use archive_dt_field function to convert Python date time into format Excel can read more easily
archive_dt_field(dfMetadataObserver)
archive_dt_field(dfMetadataObserverLiveFish)
archive_dt_field(dfMetadataObserverCarcasses)

## Create export path and write each joined frame to its own sheet of the Excel spreadsheet
export_path = os.path.join(out_workspace, 'WLP_Salmon_Spawning_Survey_' + year + '_' + timestamp + '.xlsx')
# The with-block saves and closes the workbook on exit; ExcelWriter.save() no longer exists in pandas 2.0+.
with pd.ExcelWriter(export_path) as writer:
    dfMetadataObserver.to_excel(writer, sheet_name='Metadata')
    dfMetadataObserverLiveFish.to_excel(writer, sheet_name='Live Fish')
    dfMetadataObserverCarcasses.to_excel(writer, sheet_name='Carcasses')