### Create the process for merging new or corrected site data

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import folium
# use these if you are using the pandas dataframe view, I prefer itables because it is interactive
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Render every DataFrame as an interactive itables table (searchable and sortable)
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

import itables.options as opt
opt.maxBytes = 0  # 0 removes the size limit, so large tables are shown in full


<IPython.core.display.Javascript object>

#### ** helpful links and snippets

In [None]:
# os.getcwd() # check what the working directory is

# keyboard shortcuts
# https://noteable.io/blog/jupyter-notebook-shortcuts-boost-productivity/

### set working directory to the github repo directory
This is where you change the path so that the notebook works on your computer too.

In [2]:
# Move into the data directory so the relative 'data/...' paths used below resolve.
# Set the HUDSON_ACCESS_DIR environment variable to point at your own checkout;
# the original author's path is kept as the fallback so existing runs are unchanged.
os.chdir(os.environ.get('HUDSON_ACCESS_DIR', '/Users/sarahodges/spatial/SAVI/hudson_access/data/published'))

### Import existing site points data

 - this is actually the **hap_site_points_fieldupdates_surveyintegration_8-2-23.geojson** data from sara eichner, with the new entries removed from the dataset
 - using this to create the process of site data updates

In [3]:
# Load the existing site points GeoJSON; this is the baseline the survey updates are merged into
current_site_points = gpd.read_file('data/hap_site_points_20230804.geojson')
# len(current_site_points.columns)
# Display the full table for inspection
current_site_points

site_id,act_codes,access_id,site_name,site_label,site_address,site_description,hours_info,open_close_date,fee,fee_amount,public_transit,public_transit_description,url_public,phone_public,email_public,site_manager,phone_site_manager,email_site_manager,access_type,access_type_other,accessibility_description,safety,use_limits,water_depth_est,program_YN,program_name,program_description,program_hours,program_id,program_phone,program_url,program_contact,amenities_description,restrooms,changing_station,food,drinking_water,walking_trails,equipment_rental,boat_access,boat_launch_YN,bike_path_accessible,bike_path_access_description,bike_parking_rack,picnic_area,playground,parking,parking_description,pets_allowed,wheelchair_access_amenities,wheelchair_access_restrooms,wheelchair_access_trails,SWIM_YN,informal_swimming,lifeguard_SWIM,safety_SWIM,showers_SWIM,FISH_YN,fish_species_FISH,walking_path_FISH,permit_FISH,HPBL_YN,difficulty_level_HPBL,distance_parking_to_launch_HPBL,boat_launch_type_HPBL,boat_storage_HPBL,trailer_parking_HPBL,safety_HPBL,MPBL_YN,difficulty_level_MPBL,boat_launch_type_MPBL,distance_parking_to_launch_MPBL,boat_cleaning_requirements_MPBL,boat_inspections_MPBL,boat_storage_MPBL,boat_storage_overnight_MPBL,haul_out_MPBL,navigational_notes_MPBL,trailer_parking_MPBL,pump_out_MPBL,safety_MPBL,site_name_photo_01,site_name_photo_02,site_name_photo_03,photo_credits,source,sq_acres,status,owner,owner_type,municipality,county,state,waterbody,natural_no,water_quality_monitoring,typology,CreationDate,Creator,EditDate,Editor,GlobalID,ObjectID,lat,lon,x,y,geometry
Loading... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
### the site score was taken out of the current site points - see published 20230629 if you want to retrieve it
#### create a site_score dataframe and remove from data
# site_score = current_site_points.copy()[['site_id', 'site_score']]
#site_score.head()
# site_points.columns.to_list()

### Make corrections to existing site points data

In [None]:
# Corrections applied to the existing site points before the survey data is merged in.
# The Series passed to .assign() are built from the unfiltered frame and are aligned
# back onto the filtered rows by index.
site_points = (
    current_site_points
    .query("site_id != 'remove dupe'")  # rows flagged as duplicates in the source data
    # site_score has already been removed from the 20230804 file, so only drop it
    # when present (a plain drop raises KeyError on a missing column)
    .drop(columns='site_score', errors='ignore')
    .assign(
        site_name=current_site_points["site_name"].where(current_site_points["site_name"] != "Fair Haven Rd & Fairwaters Lane", "Fair Haven Pier"),
        # site_id values are compared as strings everywhere else in this notebook,
        # so the replacement id is a string too (an int would never match on a join)
        site_id=current_site_points["site_id"].where(current_site_points["site_name"] != "Fair Haven Rd & Fairwaters Lane", "540232"),
        # literal text removal, not a regular expression
        site_description=current_site_points['site_description'].str.replace('This site is considered an historic landmark', '', regex=False),
        url_public=current_site_points['url_public'].str.split(",").str[0]  ### keep only the first url if there is more than one
    )
)

# remove the extra Bayswater Point State Park
# .copy() so the .loc assignment below edits an independent frame rather than a slice
site_points = site_points[site_points["site_id"] != "159127"].copy()
site_points.loc[site_points['site_id'] == '154', 'url_public'] = "https://www.nycgovparks.org/parks/leon-s-kaiser-park"

# site_points['site_id'] = pd.to_numeric(site_points['site_id'], errors='coerce')
### the highest site_id (other than one that ends in 999 for a specific reason) is 900136

# Attribute-only copy (no geometry) used for the joins below
site_points_df = pd.DataFrame(site_points.drop(columns='geometry'))
site_points_df

## IMPORT survey data from summer site visits and process
Alyssa collected these over the summer (up to July 28) using the Survey123 survey.

In [None]:
## add new survey data
# Raw survey export; rows whose site_id is 'needs id' are treated as new sites further down
raw_surv_new = pd.read_csv('data/data_updates/Hudson_Access_Project_Data_Survey_0.csv')

#### Function to create new act_codes based on Alyssa's YN columns

 - if none are defined as Y, then act_codes == VISUAL
 - discussed with Sara E: we decided to define them all as FISH, but when I looked at them it didn't seem right

In [None]:
def create_new_act_codes(row):
    """Build the comma-separated act_codes string for one survey row.

    For each of FISH, SWIM, HPBL and MPBL (in that order) the code is included
    when the matching ``<code>_YN`` field equals ``'Y'``. If none of them is
    ``'Y'`` the result is ``'VISUAL'``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the keys ``FISH_YN``, ``SWIM_YN``, ``HPBL_YN``, ``MPBL_YN``.

    Returns
    -------
    str
        The selected codes joined with ``','`` (no spaces), or ``'VISUAL'``.
    """
    flagged = [
        code
        for code in ('FISH', 'SWIM', 'HPBL', 'MPBL')
        if row[f'{code}_YN'] == 'Y'
    ]
    return ','.join(flagged) if flagged else 'VISUAL'

#### Handle new sites

 - create dataframe of sites that aren't in the existing dataset
 - add source info
 - add site_id
 - check and update act_codes

In [None]:
# Survey rows marked 'needs id' are new sites: tag their source and give each a fresh site_id.
new_sites = (
    raw_surv_new
    .loc[raw_surv_new['site_id'] == 'needs id']
    .assign(source="Alyssa, summer 23 new site")
    .reset_index(drop=True)
    # ids continue from 900137: the highest existing site_id (other than one that
    # ends in 999 for a specific reason) is 900136
    .assign(site_id=lambda frame: frame.index + 900137)
)

In [None]:
# Derive act_codes for each new site from its *_YN columns (row-wise)
new_sites['act_codes'] = new_sites.apply(create_new_act_codes, axis=1)

new_sites

#### Handle updated sites

 - create dataframe of sites that are in the existing dataset
 - add source info
 - create dataframe of information in old hap_site_points to check if there are any issues
 - check and update act_codes

In [None]:
## Survey rows that already carry a site_id are updates to existing sites.
# act_codes is computed on the full survey frame and aligned back by index.
updated_sites = (
    raw_surv_new
    .loc[raw_surv_new['site_id'] != 'needs id']
    .assign(
        source="Alyssa, summer 23 updated site",
        act_codes=raw_surv_new.apply(create_new_act_codes, axis=1),
    )
    # .filter(['site_id', 'site_name'], axis="columns") # first look at the names to check
)

#updated_sites

# The old versions of the sites being updated; kept so they can be compared later
replaced_sites = pd.merge(
    site_points_df,
    updated_sites[['site_id']],
    how='inner',
    on='site_id',
)

# explored in the console
# len(updated_sites) = 18
# len(replaced_sites) = 19
### there is a dupe in the original data, I will make sure it is removed when I bind all the dfs together

### Create a global id - site id key for new sites to add activity points

In [None]:
### GlobalID -> site_id lookup, used to attach the survey's activity point locations
### to the current activity points
updated_temp = updated_sites.loc[:, ['GlobalID', 'site_id', 'site_name']].copy()
new_temp = new_sites.loc[:, ['GlobalID', 'site_id', 'site_name']].copy()
global_key = pd.concat(
    [updated_temp, new_temp],
    ignore_index=True,
)
global_key

## Create updated hap_site_points

 - use antijoin to remove the sites that have been updated
 - concat to combine old, updated, and new

In [None]:
# Anti-join: keep only the existing sites that have no survey update.
# The left merge (on the shared site_id column) tags every row with its origin;
# rows tagged 'left_only' had no match among the updated sites.
sites_no_update = (
    site_points_df
    .merge(updated_sites[['site_id']], how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
)

### sanity check for the anti-join: this difference should be 0
len(site_points_df) - len(replaced_sites) - len(sites_no_update)

In [None]:
# Stack the untouched existing sites with the survey-updated versions of the rest
new_hap_site_points_temp = pd.concat([sites_no_update, updated_sites], ignore_index=True)

#### checks

# Row count after replacing the updated sites
len(new_hap_site_points_temp)
# 750, good, it removed the dupe in the og data

In [None]:
# Append the brand-new survey sites to get the full updated site list
new_hap_site_points = pd.concat([new_hap_site_points_temp, new_sites], ignore_index=True)
len(new_hap_site_points)
# 763 = correct

### Checks to ensure that the new data will work with the website

 - act_codes have no spaces
 - all sites have act_codes
 - all sites have site_id

In [None]:
### remove spaces around the commas in act_codes
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, which would leave the spaces in place.
new_hap_site_points['act_codes'] = new_hap_site_points['act_codes'].str.replace(r'\s*,\s*', ',', regex=True)


In [None]:
### check for spaces or NAs - will need to figure out what to do about VISUAL and STE
# Lists every distinct act_codes value so stray spaces or missing values are easy to spot

new_hap_site_points['act_codes'].unique()

In [None]:
### count the NA site_ids
# Every site needs a site_id, so any non-zero count here needs fixing

new_hap_site_points['site_id'].isna().sum()

In [None]:
# Display the combined site table for a final visual check
new_hap_site_points

## Create spatial dataframe

 - make map; red points are new or updated

In [None]:
# Build point geometries from the x / y columns; coordinates are declared as EPSG:4326
hap_site_points = gpd.GeoDataFrame(
    new_hap_site_points,
    geometry=gpd.points_from_xy(
        new_hap_site_points['x'],
        new_hap_site_points['y'],
    ),
    crs="EPSG:4326",
)

In [None]:
# Centre the map on the mean site coordinate (y is used as latitude, x as longitude)
m = folium.Map(
    location=[hap_site_points['y'].mean(), hap_site_points['x'].mean()],
    zoom_start=10,
)

# One circle marker per site; the popup shows the site name and its activity codes.
# Sites with a Creator value get a red outline, all others blue.
# NOTE(review): presumably only survey (new / updated) rows carry a Creator — confirm.
for site in hap_site_points.itertuples():
    outline = 'red' if pd.notna(site.Creator) else 'blue'
    marker = folium.CircleMarker(
        location=[site.y, site.x],
        radius=4,
        popup=f"Site Name: {site.site_name}<br>Activity Codes: {site.act_codes}",
        color=outline,  # outline colour only; the fill stays blue
        fill=True,
        fill_color='blue',
    )
    marker.add_to(m)

m

### write out the geojson

In [None]:
# hap_site_points.to_file('data/hap_site_points_20230804.geojson', driver='GeoJSON')

### Import activity points to update with new survey data

In [None]:
current_act_points = gpd.read_file('data/data_updates/correcting_activity_points_20230814/hap_activity_points_20230815.geojson')

# Attribute-only view of the current activity points: the geometry column is dropped
# and only the listed columns are kept, with the existing lon / lat columns used as-is.
act_points = current_act_points.drop(columns='geometry').loc[
    :,
    ['act_id', 'access_id', 'site_id', 'site_name', 'activity', 'access_name', 'lon', 'lat'],
]
act_points

In [None]:
## add new survey data
# Activity point locations recorded in the survey; joined to sites below via ParentGlobalID
raw_surv_act_new = pd.read_csv('data/data_updates/activity_loc_point_1.csv')

In [None]:
# Display the raw survey activity points for inspection
raw_surv_act_new

In [None]:
# Attach site_id / site_name to each surveyed activity point through the GlobalID key
survey_acts = raw_surv_act_new.merge(
    global_key,
    how='left',
    left_on='ParentGlobalID',
    right_on='GlobalID',
)

# Fill in defaults, assign access ids, and reshape to the activity-point column layout.
# NOTE(review): the activity fallback reads an `act_codes` column, which global_key
# (GlobalID, site_id, site_name) does not supply — presumably it comes from the
# activity CSV itself; confirm, otherwise rows with a missing act_code raise KeyError.
survey_acts = (
    survey_acts
    .assign(
        activity=survey_acts.apply(lambda rec: rec['act_code'] if pd.notna(rec['act_code']) else rec['act_codes'], axis=1),
        access_name=survey_acts.apply(lambda rec: rec['access_name'] if pd.notna(rec['access_name']) else rec['site_name'], axis=1),
        access_id=survey_acts.index + 1800,  # new access ids start at 1800
    )
    .rename(columns={'x': 'lon', 'y': 'lat'})
    .loc[:, ['access_id', 'site_id', 'site_name', 'activity', 'access_name', 'lon', 'lat']]
    # act_id is "<access_id>_<activity>"
    .assign(act_id=lambda frame: frame.apply(lambda rec: f"{rec['access_id']}_{rec['activity']}", axis=1))
)

survey_acts

### Check for existing activity points that are being updated and remove them from current list

In [None]:
# Full join of the survey activity points with the current activity points, keeping
# only the (site_id, activity) pairs found in both.
# These are removed from the current points and replaced by the survey versions.
updated_activity_points = (
    survey_acts
    .merge(current_act_points, how='outer', on=['site_id', 'activity'], indicator=True)
    .loc[lambda joined: joined['_merge'] == 'both', ['site_id', 'activity']]
)

updated_activity_points

In [None]:
## Full join against the list of activities to replace; rows tagged 'left_only'
## are the current activity points that have no survey replacement, so they are kept.
new_act_points_temp = (
    act_points
    .merge(updated_activity_points, how='outer', on=['site_id', 'activity'], indicator=True)
    .loc[lambda joined: joined['_merge'] == 'left_only']
    .drop(columns='_merge')
)
new_act_points_temp

In [None]:
### Combine the retained current points with the survey points
new_act_points = pd.concat(
    [new_act_points_temp, survey_acts],
    ignore_index=True,
)

# Where access_name is missing, fall back to the site name
new_act_points['access_name'] = new_act_points.apply(
    lambda rec: rec['access_name'] if pd.notna(rec['access_name']) else rec['site_name'],
    axis=1,
)

new_act_points

## make geospatial activity points

In [None]:
# Build point geometries from lon / lat (EPSG:4326), then drop the two coordinate
# columns so the location is carried by the geometry column only.
hap_act_points = gpd.GeoDataFrame(
    new_act_points,
    geometry=gpd.points_from_xy(new_act_points['lon'], new_act_points['lat']),
    crs="EPSG:4326",
).drop(columns=['lon', 'lat'])

hap_act_points

In [None]:
# Map of all activity points.
# The lon / lat columns were dropped when hap_act_points was built, so the
# coordinates are taken from the point geometry (x = longitude, y = latitude).
act_m = folium.Map(
    location=[
        hap_act_points.geometry.y.mean(),
        hap_act_points.geometry.x.mean()
    ],
    zoom_start=10,
)

# One circle marker per activity point; the popup shows the site name and activity code
for row in hap_act_points.itertuples():
    popup_text = f"Site Name: {row.site_name}<br>Activity Codes: {row.activity}"
    # color = 'blue' if pd.isna(row.Creator) else 'red'

    folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius=4,
        popup=popup_text,
        color='blue',  # Change the color of the circle outline
        fill=True,
        fill_color='blue'
    ).add_to(act_m)

# act_m

## check for sites with no activity points

In [None]:
# Outer-join sites to activity points on site_id; the site's own name is renamed to
# site_name_site so it does not collide with the activity points' site_name.
site_activity_join = pd.merge(
    hap_site_points[['site_id', 'site_name', 'source']].rename(columns={'site_name': 'site_name_site'}),
    hap_act_points,
    how='outer',
    on='site_id',
)

# Rows with no act_id after the join are sites that have no activity point
no_acts = site_activity_join.loc[site_activity_join['act_id'].isna()]

no_acts
### there are 17

## write out act points

In [None]:
# Write the updated activity points out as GeoJSON
hap_act_points.to_file('data/hap_act_points_20230815_2.geojson', driver='GeoJSON')