# Cleaning East River Stem Geolocation Shapefiles
**Author:** 'Marshall Worsham' <br>
**Creation Date:** '09/21/2020' <br>
**Revision Date:** '12/22/2020' <br>

---

## Table of Contents

1 - [Front matter](#front)<br>
2 - [Import stem geolocation shapefiles](#import)<br>

---

## Front matter<a id='front'></a>

This notebook contains markdown and code for post-processing point shapefiles generated from Trimble Geo7X GPS acquisitions in the East River domain. The input is a set of shapefiles containing tree geolocation points, one set for each site in the watershed where stem geolocations were acquired from 2018–2020.

The script appends the `Site` name and `subdirectory` to each shapefile name, then selects all projected point shapefiles, groups them by `Site` name, and merges points from the same site. It then filters out undesired points (e.g., plot corners and plot edges).

The script was developed in `Python 3.8.2` on a 2014 MacBook Pro running macOS 10.14.6.


### Libraries<a id='libraries'></a>

In [None]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import math
import re
from matplotlib import pyplot as plt
from os.path import join, getsize
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

### Define the working directories and list contents

In [None]:
# Root of the forest inventory dataset under /Volumes/GoogleDrive
directory = os.path.join(
    '/Volumes', 'GoogleDrive', 'My Drive', 'Research', 'RMBL',
    'Working_Files', 'Forest_Inventory_Dataset'
)

# Source, scratch and output subdirectories of the dataset root
source_dir = os.path.join(directory, 'Source')
scratch_dir = os.path.join(directory, 'Scratch')
out_dir = os.path.join(directory, 'Output')

# Directory holding the stem geolocation files, merged by plot
stem_dir = os.path.join(scratch_dir, 'GPS_Data_2021_MERGEDBYPLOT')

# List the contents of the stem directory
os.listdir(stem_dir)

### Set pandas view to max rows and columns

In [None]:
# Remove the display limits so dataframes are shown with every row and column
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

## Import stem geolocation shapefiles<a id='import'></a>


In [None]:
# import the temp geodataframes from the append operations
# NOTE: os.listdir returns entries in arbitrary order, and the positions in
# appd_gpdf follow that order.
stem_files = os.listdir(stem_dir)

# Keep only the shapefiles themselves (not their sidecar files)
stem_sf = [s for s in stem_files if s.endswith('.shp')]

# Full path to each shapefile; joined with a path separator so that the
# directory and file name are not run together
stem_paths = [os.path.join(stem_dir, s) for s in stem_sf]

# Read every shapefile into a geodataframe, in the same order as stem_sf
appd_gpdf = [gpd.read_file(path) for path in stem_paths]

In [None]:
# Set a figure frame
plt.figure(figsize = (30,15))

# Subplot grid: four columns and at least five rows; extra rows are added
# only when there are more geodataframes than a 5x4 grid can hold
ncols = 4
nrows = max(5, math.ceil(len(appd_gpdf) / ncols))

# Plot the dataframes in sequence on the figure frame
for i, gpdf in enumerate(appd_gpdf):

    # create subplot axes in the grid (subplot positions are 1-based)
    ax = plt.subplot(nrows, ncols, i + 1) # nrows, ncols, axes position

    # set title from the Site value at index label 0
    ax.set_title(gpdf.Site[0])

    # plot the points on these axes
    gpdf.plot(ax=ax)

# trailing semicolon suppresses the notebook's echo of the return value
plt.tight_layout();

## Some functions to interact

In [None]:
# print the shapefile names in the order they appear in stem_files
print(list(filter(lambda fname: fname.endswith('.shp'), stem_files)))

In [None]:
# # function to interact with dataframe
# def fh_interact(df):
#     '''
#     outputs sliders that show rows and cols of df
#     '''
#     def peek(row = 1, col = 0):
#         return df.iloc[row: row+10, col: col+10]
#     interact(peek, row = (0, len(df), 5), col = (0, len(df.columns) - 6))
#     print('({} rows, {} columns total'.format(df.shape[0], df.shape[1]))

## Define a function to generate new points based on geotag association notes in field notebooks and GPS units
For some trees that were growing very closely together, we did not geotag each tree individually with the GPS unit in order to save battery and complete the inventory. Instead, we left directional notes in the `Other` or `Other2` field. These comments had the format, e.g. "6030 0.4m NW". This indicated that for a given geotagged tree, #6030 was an ungeotagged associated tree standing 0.4m to the northwest of the geotagged tree. We define a function below that uses those notes to construct geolocation points for the trees that were associated through these notes.

In [None]:
def makenewpoint(associationdf, reftag, targtag, direction, distance):
    '''
    Creates new point geometries for stems that weren't geotagged but have directional and distance references in notes.
    Inputs:
        - associationdf: geodataframe storing geometry and association data; must have a 'Tag_Number' column
        - reftag: an integer indicating the reference (geotagged) tree tag number
        - targtag: an integer indicating the target (untagged associated) tree tag number
        - direction: string indicating cardinal direction from reference to target (e,n,w,s,ne,nw,sw,se; case and
          surrounding whitespace are ignored) noted in 'Other' or 'Other2' field, or a number giving the angle in
          degrees measured counterclockwise from east
        - distance: float indicating distance from reference to target noted in 'Other' or 'Other2' field
    Returns:
        - geodataframe with columns 'Tag_Number', 'Comment', 'Geotag_Association' and 'geometry', holding one
          generated point for each row of associationdf whose 'Tag_Number' equals reftag
    Raises:
        - ValueError if direction is a string that is not one of the eight recognized cardinal directions
    '''

    # Conversion from degrees to radians
    rad = np.pi/180

    # Cardinal directions as angles in degrees from east-facing origin
    cardinal_degrees = {
        'e': 0,
        'ne': 45,
        'n': 90,
        'nw': 135,
        'w': 180,
        'sw': 225,
        's': 270,
        'se': 315,
    }

    # Define radian direction based on direction input
    if isinstance(direction, str):
        key = direction.strip().lower()
        if key not in cardinal_degrees:
            # Fail with a clear message instead of leaving the angle undefined
            raise ValueError(
                'Unrecognized direction {!r} for target tag {}; expected one of {}'.format(
                    direction, targtag, sorted(cardinal_degrees)))
        raddir = rad * cardinal_degrees[key]
    else:
        # Numeric input is taken as an angle in degrees
        raddir = rad * direction

    # Find x,y coordinates of reference tag number
    refcoords = associationdf[associationdf['Tag_Number'] == reftag].geometry
    x = refcoords.x
    y = refcoords.y

    # Calculate coordinates of target tag number
    x_prime = distance * np.cos(raddir) + x
    y_prime = distance * np.sin(raddir) + y

    # The new points are offsets from coordinates in associationdf, so they share its CRS;
    # fall back to the previously hard-coded CRS only when the input carries none
    crs = getattr(associationdf, 'crs', None)
    if crs is None:
        crs = 'epsg:32612'

    # Create a new temp gpdf from new geometries
    new_gdf = gpd.GeoDataFrame(geometry = gpd.points_from_xy(x_prime, y_prime, crs = crs))
    new_gdf['Tag_Number'] = targtag
    new_gdf['Comment'] = 'Point generated post-campaign from geotag association note'
    new_gdf['Geotag_Association'] = reftag
    new_gdf = new_gdf[['Tag_Number', 'Comment', 'Geotag_Association', 'geometry']]

    return new_gdf

---
## Cleaning function


In [None]:
def cleansf(geodf, invsheet_id):
    '''
    Cleans one site's stem geolocation points using that site's inventory datasheet.
    Reads the inventory sheet as CSV, merges it with the points on tag number, generates points for stems that
    have a geotag association (reference tag, direction and distance) via makenewpoint, appends those points to
    the GPS points, and keeps one row per tag number.
    Inputs:
        - geodf: geodataframe of GPS points; the tag number is read from its 'Other' field
        - invsheet_id: string ID of the Google Sheets inventory datasheet, inserted into the CSV export URL
    Returns:
        - geodataframe in the CRS of geodf with columns 'Site', 'Tag_Number', 'Sp_Code', the GPS attribute
          fields, 'Geotag_Association', 'Comment' and 'geometry', with at most one row per 'Tag_Number'
    '''

    # Prep inventory datasheet for merge
    invsheet = f'https://docs.google.com/spreadsheets/d/{invsheet_id}/export?format=csv'
    invdata = pd.read_csv(invsheet)
    invdata['Tag_Number'] = pd.to_numeric(invdata['Tag_Number'], downcast='integer')

    # Prep points geodataframe for merge
    gcrs = geodf.crs
    # Work on a copy so that adding 'Tag_Number' below cannot alter the caller's geodataframe
    geodf = pd.DataFrame(geodf, copy=True)
    geodf['Tag_Number'] = pd.to_numeric(geodf['Other'], downcast='integer')

    # Merge the inventory and points dataframes
    merged = invdata.merge(geodf, on='Tag_Number', how='left')
    merged['Geotag_Association_Ref'] = pd.to_numeric(merged['Geotag_Association_Ref'], downcast='integer')
    merged = gpd.GeoDataFrame(merged, crs = gcrs, geometry = merged['geometry'])

    # Rows of the inventory that reference a geotagged tree
    assoc = merged[merged['Geotag_Association_Ref'].notna()]
    reftags = pd.to_numeric(assoc['Geotag_Association_Ref'], downcast='integer')
    targtags = pd.to_numeric(assoc['Tag_Number'], downcast='integer')
    dirs = assoc['Geotag_Association_Dir']
    dists = assoc['Geotag_Association_Dist']

    # Generate a point for each association that has BOTH a direction and a distance;
    # with either one missing the computed coordinates would be NaN
    newpoints_ls = []
    for targtag, reftag, dirc, dist in zip(targtags, reftags, dirs, dists):
        if pd.notnull(dirc) and pd.notnull(dist):
            newpoints_ls.append(makenewpoint(merged, reftag, targtag, dirc, dist))

    # Stack the GPS points and the generated points (pd.concat replaces DataFrame.append)
    geodfclean = pd.concat([geodf, *newpoints_ls], ignore_index=True)
    geodfclean['Site'] = invdata['Site_Name'][0]
    geodfclean['Sp_Code'] = np.nan

    # These columns only exist if at least one generated point was appended
    for col in ('Comment', 'Geotag_Association'):
        if col not in geodfclean:
            geodfclean[col] = np.nan

    geodfclean = geodfclean[[
        'Site',
        'Tag_Number',
        'Sp_Code',
        'Latitude',
        'Longitude',
        'GNSS_Heigh',
        'Horz_Prec',
        'Vert_Prec',
        'Std_Dev',
        'GPS_Time',
        'GPS_Date',
        'Rcvr_Type',
        'Corr_Type',
        'Max_PDOP',
        'Max_HDOP',
        'Geotag_Association',
        'Comment',
        'geometry'
        ]]

    # Keep one row per tag number: the one with the smallest 'Horz_Prec'. Missing values sort last,
    # so a GPS point is preferred over a generated point. Sorting must precede the de-duplication
    # for 'first' to mean the smallest; sort_index then restores the original row order.
    ranked = geodfclean.sort_values(['Tag_Number', 'Horz_Prec'])
    geodfclean = ranked.drop_duplicates(subset='Tag_Number', keep='first').sort_index()
    geodfclean = geodfclean.reset_index(drop=True)

    # Normalize empty and 'None' strings to missing values, then rebuild the geodataframe
    geodfclean = geodfclean.replace(['', 'None'], np.nan)
    geodfclean = gpd.GeoDataFrame(geodfclean, crs=gcrs, geometry=geodfclean['geometry'])
    geodfclean['Tag_Number'] = pd.to_numeric(geodfclean['Tag_Number'], downcast='integer')
    geodfclean['Geotag_Association'] = pd.to_numeric(geodfclean['Geotag_Association'], downcast='integer')

    return geodfclean

In [None]:
def getshid(url):
    '''
    Extracts the sheet ID from a Google Sheets URL.
    Inputs:
        - url: string URL whose sixth '/'-separated segment is the sheet ID, optionally followed by a '?' query
    Returns:
        - string sheet ID with any query string removed
    '''
    # Segments: 'https:', '', host, 'spreadsheets', 'd', <id>[?query], ...
    id_segment = url.split('/')[5]

    # Drop everything from the first '?' onward
    sheet_id, _, _ = id_segment.partition('?')
    return sheet_id

In [None]:
# Google Sheets ID for each inventory datasheet, extracted from its URL with getshid.
# These IDs are collected into `invsheets` and passed to cleansf as `invsheet_id`.
# NOTE(review): the variable names look like site codes - confirm against the sheets.
cccvn1 = getshid('https://docs.google.com/spreadsheets/d/1beSW4hYnmxxC2X3yPkKJsqP3gg33RH6qbB9DvVVcnUE?authuser=worsham%40berkeley.edu&usp=drive_fs')
cccvn2 = getshid('https://docs.google.com/spreadsheets/d/1KqNXUVWMN76Wu3emK11ZC0JWEGafcARoyVFIXHWx7EI?authuser=worsham%40berkeley.edu&usp=drive_fs')
cccvs1 = getshid('https://docs.google.com/spreadsheets/d/1KpHlGMiAbLpT8QIccMEAgAQ2vkRZq_5d9i0jRm-S7OE?authuser=worsham%40berkeley.edu&usp=drive_fs')
ccemn1 = getshid('https://docs.google.com/spreadsheets/d/1CyY3OfzBoFVHCu1QhchbKn7Y8g1uVMQJOQMFriynvSM?authuser=worsham%40berkeley.edu&usp=drive_fs')
sgnes1 = getshid('https://docs.google.com/spreadsheets/d/1veFBxhR0wD4Qu07ZYyNNQlzKg5J9YTNL8E1VML8u_gw?authuser=worsham%40berkeley.edu&usp=drive_fs')
sgnes3 = getshid('https://docs.google.com/spreadsheets/d/1v5oJcULvbW-IYfvfLVFuCzUIT5mTbwtfYNAorns1E5M?authuser=worsham%40berkeley.edu&usp=drive_fs')
srpvg1 = getshid('https://docs.google.com/spreadsheets/d/1zc-1B8T91BEd0tc1SvgUGFot-Dj6M02p1ZLoXYCPw2o?authuser=worsham%40berkeley.edu&usp=drive_fs')
xxpln1 = getshid('https://docs.google.com/spreadsheets/d/1VQ8bGxJACttKfQ0V4Hbg3HIIybRA5Pw8yBXUpD2Kv28?authuser=worsham%40berkeley.edu&usp=drive_fs')
xxpln2 = getshid('https://docs.google.com/spreadsheets/d/1q-wH_h-WbNHHqALjGv3KzaTKHfLuYkkTHR9uzMSPtcU?authuser=worsham%40berkeley.edu&usp=drive_fs')
sgnes2 = getshid('https://docs.google.com/spreadsheets/d/1s3MBQ5UMkMlyjV1YVecahvDHZ_Oyy0EyCYSveYkg_ig?authuser=worsham%40berkeley.edu&usp=drive_fs')
ccuc1 = getshid('https://docs.google.com/spreadsheets/d/1cNTpoD0S0X1rnptilEfLWDyYxaEhrwBuIbD7XlPC3L4?authuser=worsham%40berkeley.edu&usp=drive_fs')
erbme1 = getshid('https://docs.google.com/spreadsheets/d/1FDIBHXM0Zg_X_vkc_Ryc7v4P7L-7mtdbRI53ucaPgaw?authuser=worsham%40berkeley.edu&usp=drive_fs')
sgswr1 = getshid('https://docs.google.com/spreadsheets/d/1EcrhpG2qMAwvTYdAH9J9Gqriuuiwe1PBAUAyLXnAgts?authuser=worsham%40berkeley.edu&usp=drive_fs')
wgwgm1 = getshid('https://docs.google.com/spreadsheets/d/1mlF02kwTQLeCzf9jnSpFL48cxiQdEg7elwr4tsAOPXU?authuser=worsham%40berkeley.edu&usp=drive_fs')
erapl1 = getshid('https://docs.google.com/spreadsheets/d/1CxcYtqh7jplLEr9fx7u2jOIAiTkbebrX8_M3330Abxk?authuser=worsham%40berkeley.edu&usp=drive_fs')
xxcar3 = getshid('https://docs.google.com/spreadsheets/d/1Oxz-exmF11akp2IEkO3rG1ZyIggHDrSyznZFKEpa2HI?authuser=worsham%40berkeley.edu&usp=drive_fs')
xxcar1 = getshid('https://docs.google.com/spreadsheets/d/1NJVvhj4gnI5e3hdVvwuSiC7NyIWXGwVUgZGc3Gv9Fko?authuser=worsham%40berkeley.edu&usp=drive_fs')

In [None]:
# All observations in 2021
#invsheets = [xxpln1, sgnes2, cccvs1, xxpln2, srpvg1, sgnes3, ergt1, ccuc1, xxcar3, sgnes1, cccvn2, xxcar1, cccvn1, ccemn1, erbme1, sgswr1, wgwgm1, erapl1]
#idxs = list(range(0, len(invsheets)))

In [None]:
# All observations from new sites in 2021
# Inventory sheet IDs; entry k pairs with entry k of idxs below
invsheets = [
    cccvs1, xxpln2, srpvg1, sgnes3, xxcar3,
    sgnes1, cccvn2, xxcar1, cccvn1, ccemn1,
]

# Positions within appd_gpdf of the geodataframes to clean
idxs = [2, 3, 4, 5, 8, 9, 10, 11, 12, 13]

# Pull out the selected geodataframes, in the order given by idxs
geodfs = [appd_gpdf[position] for position in idxs]

In [None]:
# Observations missed in 2020 and taken in 2021
# invsheets = [xxpln1, sgnes2, ccuc1, sgswr1, wgwgm1]
# idxs = [0,1,7,15,16]
# geodfs = [appd_gpdf[i] for i in idxs]

In [None]:
# Display the geodataframe at position 10 of appd_gpdf
appd_gpdf[10]

In [None]:
# Clean each selected geodataframe against its inventory sheet,
# printing its appd_gpdf position first as a progress marker
clean_list = []
for position, sheet_id in enumerate(invsheets):
    print(idxs[position])
    clean_list.append(cleansf(geodfs[position], sheet_id))

In [None]:
# Display the cleaned geodataframe at position 4 of clean_list
clean_list[4]

In [None]:
# Plot the geometries of the cleaned geodataframe at position 4 of clean_list
clean_list[4].plot()

In [None]:
# Print the (rows, columns) shape of each geodataframe before and after cleaning
for position, before in enumerate(geodfs):
    after = clean_list[position]
    print('original:', before.shape)
    print('post:', after.shape)

---
## Assemble cleaned dataframes into list and export shapefiles

In [None]:
# Check CRS match between each consecutive pair of cleaned geodataframes
[left.crs == right.crs for left, right in zip(clean_list, clean_list[1:])]

In [None]:
# Write each cleaned geodataframe to a shapefile named after its site
for cleaned in clean_list:
    site_name = cleaned['Site'][0]
    outpath = os.path.join(
        out_dir,
        'Kueppers_EastRiver_Stem_Geolocations_WGS84UTM13N',
        site_name + '_Stem_Geolocations_WGS84UTM13N.shp'
    )
    cleaned.to_file(outpath)

In [None]:
# Write each cleaned geodataframe to a CSV named after its site
for cleaned in clean_list:
    site_name = cleaned['Site'][0]
    outpath = os.path.join(
        source_dir,
        'TreeCoords',
        site_name + '_Stem_Geolocations_WGS84UTM13N.csv'
    )
    cleaned.to_csv(outpath)

In [None]:
# Re-read every exported shapefile and write its contents to a CSV of the same base name
shp_dir = os.path.join(out_dir, 'Kueppers_EastRiver_Stem_Geolocations_WGS84UTM13N')
for i in os.listdir(shp_dir):
    # Match on the extension, not a substring, so that sidecar files whose names
    # merely contain '.shp' (e.g. '*.shp.xml') are not read as shapefiles
    if i.endswith('.shp'):
        gdf = gpd.read_file(os.path.join(shp_dir, i))
        outpath = os.path.join(source_dir, 'TreeCoords', os.path.splitext(i)[0] + '.csv')
        gdf.to_csv(outpath)