In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import scipy.stats as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.collections import PatchCollection
import dask.dataframe as dd
import os


multiprocessing.cpu_count()

16

Read in the data and clean it up

In [2]:
# path to file
home = os.path.expanduser('~')
path = f'{home}/UrbanForest/all_clean_LAcounty_sunset.hdf'

# read the hdf
la = pd.read_hdf(path, key='data')

# select desired columns
cols=['ID', 'LATITUDE', 'LONGITUDE', 'DBH_LO', 'DBH_HI', 'CREATED',
      'UPDATED', 'SOURCE', 'Name_matched', 'Zone']
la = la[cols]

# drop NAs
la.dropna(how='any', axis=0, subset=['DBH_LO', 'DBH_HI'], inplace=True)

# capitalize genus names
la['Name_matched'] = la.Name_matched.str.capitalize()

# convert DBH to cm
la['dbh_low']  = 2.54 * la.DBH_LO
la['dbh_high'] = 2.54 * la.DBH_HI
la.drop(['DBH_LO', 'DBH_HI'], axis=1, inplace=True)

# Change date fields to dateTime type
la['created'] = pd.to_datetime(la.CREATED)
la['updated'] = pd.to_datetime(la.UPDATED)
la.drop(['CREATED', 'UPDATED'], axis=1, inplace=True)


 We will first use allometric equations from :

 McPherson, E. Gregory; van Doorn, Natalie S.; Peper, Paula J. 2016. Urban tree database.
 Fort Collins, CO: Forest Service Research Data Archive. Updated 21 January 2020.
 https://doi.org/10.2737/RDS-2016-0005

 'Apps min' and 'Apps max' give the input range (cm) that the authors feel 
  that the equations are reliable
 'InlEmp' and 'SoCalC' are Climate zones where the eqs are different.
  SoCalC reference city is Santa Monica, InlEmp is Claremont,
  see Table 1, p16 for further Climate zone details.  
  
  After reading the equations and coefficients, we will get rid of trees that only occur a few times, and trees that we o not have equations for.

In [3]:
# The equations
def mcpherson_eqs():
    '''returns dict of equations from table 3 (p24) of McPherson 2020
    functions use np so as to be vectorized'''

    eq_dict = {'lin'        : (lambda a, b, c, d, e, x, mse: a + b * (x)), 
                'quad'      : (lambda a, b, c, d, e, x, mse: a + b * x + c * x**2),
                'cub'      : (lambda a, b, c, d, e, x, mse: a + b * x + c * x**2 + d * x**3),
                'quart'     : (lambda a, b, c, d, e, x, mse:a + b * x + c *x**2 + d * x**3 + e * x**4), 
                'loglogw1' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1) + (mse/2)))),
                'loglogw2' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1)) + (np.sqrt(x) + (mse/2)))),
                'loglogw3' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1)) + (x) + (mse/2))),
                'loglogw4' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1)) + (x**2) + (mse/2))),
                'expow1'    : (lambda a, b, c, d, e, x, mse: np.exp(a+ b * (x) + (mse/2))),
                'expow2'    : (lambda a, b, c, d, e, x, mse: np.exp(a + b * (x) + np.sqrt(x) + (mse/2))),
                'expow3'    : (lambda a, b, c, d, e, x, mse: np.exp(a + b * (x) + (x) + (mse/2))),
                'expow4'    : (lambda a, b, c, d, e, x, mse: np.exp(a + b * (x) + (x**2) + (mse/2)))}

    return(eq_dict)

eq_dict = mcpherson_eqs()

# The cooeficients
coef_df = pd.read_csv('TS6_Growth_coefficients.csvx',
usecols=['Region', 'Scientific Name', 'Independent variable', 'Predicts component ', 'EqName', 'Units of predicted components',
'EqName', 'a', 'b', 'c', 'd', 'e', 'Apps min', 'Apps max'])

# Find all the trees with over 100 occurances in the dataset
trees = la.Name_matched.value_counts()
trees = list(trees.where(trees > 100).dropna().index)

# drop trees we do not have equations for
trees = [s for s in trees if s in coef_df['Scientific Name'].unique()]
la = la.loc[la.Name_matched.isin(trees)]

la.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 664155 entries, 0 to 1089845
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ID            664155 non-null  int64         
 1   LATITUDE      664155 non-null  float64       
 2   LONGITUDE     664155 non-null  float64       
 3   SOURCE        664155 non-null  object        
 4   Name_matched  664155 non-null  object        
 5   Zone          663384 non-null  float64       
 6   dbh_low       664155 non-null  float64       
 7   dbh_high      664155 non-null  float64       
 8   created       28472 non-null   datetime64[ns]
 9   updated       28472 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(5), int64(1), object(2)
memory usage: 55.7+ MB


The USGS lidar data is hosted on amazon, so we will nned the AWS client to access it.

In [4]:
#!curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
#!unzip awscli-bundle.zip 
#!./awscli-bundle/install -b ~/bin/aws

Let's make a tmp directory too

In [5]:
# make a tmp directory

#! mkdir ~/tmp

# make a variable for its path
tmp = f'{home}/tmp'

# make a variable with the path to aws cli
aws = '~/bin/aws'

Download the top level ept json for the ```USGS_LPC_CA_LosAngeles_2016_LAS_2018``` dataset.

In [6]:
import subprocess
cmd = f'{aws} s3 cp s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json {tmp} --no-sign-request'
subprocess.run(cmd, shell=True, capture_output=True)

CompletedProcess(args='~/bin/aws s3 cp s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json /home/jovyan/tmp --no-sign-request', returncode=0, stdout=b'Completed 2.4 KiB/2.4 KiB (11.3 KiB/s) with 1 file(s) remaining\rdownload: s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json to ../tmp/ept.json\n', stderr=b'')

We will load ```ept.json``` and extract usefull information

In [7]:
import json
with open(f'{tmp}/ept.json') as f:
    meta = json.load(f)
    
bounds = meta['bounds']
bounds_conf = meta['boundsConforming']
srs    = meta['srs']
span   = meta['span']
schema  = meta['schema']
srs

{'authority': 'EPSG',
 'horizontal': '3857',
 'wkt': 'PROJCS["WGS 84 / Pseudo-Mercator",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Mercator_1SP"],PARAMETER["central_meridian",0],PARAMETER["scale_factor",1],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["X",EAST],AXIS["Y",NORTH],EXTENSION["PROJ4","+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs"],AUTHORITY["EPSG","3857"]]'}

The above output tells us the data is in EPSG:3857.  There is only a horizontal code present.  Lets reduce the srs to a more sueful form for later

In [8]:
srs = meta['srs']['authority'] + ':' + meta['srs']['horizontal']

Because the srs is EPSG:3857 (Pseudo-Mercator) I don't think it necesaary to reproject.  We do need to use the scale and offset values to find absolute position according to ``read_value * scale + offset```

In [9]:
def bag_scale_offset(name, schema):
    '''Retruns scale and offset for the spatial dimension given by name'''
    for thing in schema:
        if thing['name'] == name:
            return(thing['scale'], thing['offset'])
        
x_scale, x_offset = bag_scale_offset('X', schema)
y_scale, y_offset = bag_scale_offset('Y', schema)
z_scale, z_offset = bag_scale_offset('Z', schema)

def rescale(lon, lat, elev=None):
    '''Returns point rescaled to the ept coords'''
    x = lon * x_scale + x_offset
    y = lat * y_scale + y_offset
    if elev:
        z = elev * z_scale + z_offset
        return(x, y, z)
    return(x, y)



In [10]:
lon_min = -118.467730111
scaled = lon_min * x_scale
print(scaled)
sc_off = scaled + x_offset
sc_off

-1.18467730111


-13168196.184677301

Now it should be possible to define a bounding box around a tree to query the pointcloud at that local.

In [11]:
# For now we will ad 0.00007 degrees in each direction, this is jus a guess based on 5th decimal place ~ 1.1m
# also not setting z max and min for the moment
def make_scaled_bbox(lat, lon, bounds=None):
    '''Returns a bbox in ept coords.
    If present bounds is of form [xmin, ymin, zmin, xmax, ymax, zmax]'''
    
    buf = 0.00007
    xmin = lon - buf
    ymin = lat - buf
    xmax = lon + buf
    ymax = lat + buf
    xmin, ymin = rescale(xmin, ymin)
    xmax, ymax = rescale(xmax, ymax)

    # make sure no bbox is out of the ept bbox
    if bounds:
        xmin = max(xmin, bounds[0])
        ymin = max(ymin, bounds[1])
        xmax = min(xmax, bounds[3])
        ymax = min(ymax, bounds[4])
    
    return([xmin, xmax], [ymin, ymax])

def make_bbox(lat, lon):
    buf = 0.00007
    xmin = lon - buf
    ymin = lat - buf
    xmax = lon + buf
    ymax = lat + buf
    return([xmin, xmax], [ymin, ymax])

    

We will performa quick sanity check using the first entry of the LA data

In [12]:
minx = -118.467730111
miny = 34.010635655
maxx = -118.467445511
maxy = 34.010856703
minx, miny = rescale(minx, miny)
maxx, maxy = rescale(-118.291042065, 34.065091929)
temp_box = ([minx, maxx], [miny, maxy])
print(temp_box)
tbbox = ([-13168200.284677301, -13168197.28291042], [4037456.3401063564, 4037459.3406509194])

([-13168196.184677301, -13168196.18291042], [4037456.3401063564, 4037456.3406509194])


In [13]:
#pip install pdal

Now lets try to get the point cloud within the bbox using PDALs ept reader

In [14]:
import pdal
from string import Template
from tqdm import tqdm
from dask import delayed
from dask import compute
from dask.diagnostics import ProgressBar

In [15]:
id = 1986759
lat = la.loc[la.ID==id]['LATITUDE'].values[0]
lon = la.loc[la.ID==id]['LONGITUDE'].values[0]
lon

-118.066793605

In [16]:

cmd = f'{aws} s3 cp s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json {tmp} --no-sign-request'
subprocess.run(cmd, shell=True, capture_output=True)

print('ept.json read from s3 ...')

with open(f'{tmp}/ept.json') as f:
    meta = json.load(f)

    
print('metadata bagged ...')
bounds = meta['bounds']
bounds_conf = meta['boundsConforming']
srs = meta['srs']['authority'] + ':' + meta['srs']['horizontal']

span   = meta['span']
schema  = meta['schema']

x_scale, x_offset = bag_scale_offset('X', schema)
y_scale, y_offset = bag_scale_offset('Y', schema)
z_scale, z_offset = bag_scale_offset('Z', schema)


scaled_bbox = make_scaled_bbox(lat, lon, bounds=bounds_conf)

def bbox_geojson(lat, lon, filename):
    '''makes wgs84 bbox as geojson for comparison in gis'''
    [xmin, xmax], [ymin, ymax] = make_bbox(lat, lon)
    gjson = {'coordinates' : [[[xmin, ymin], [xmin, ymax], [xmax, ymax], [xmax, ymin]]],
            'type' : 'Polygon'}
    with open(filename, 'w') as of:
        json.dump(gjson, of)

bbox_geojson(lat, lon, 'xxx.json')        
print('geojson written ...')

# make and validate pipeline
t = Template('''
{
    "pipeline": [
        {
"bounds": "([-10425171.940, -10425171.000], [5164494.710, 5164595.200])",
"filename": "https://s3-us-west-2.amazonaws.com/usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json",
"type": "readers.ept",
"tag": "readdata"
        },
        {
            "out_srs": "EPSG:4326",
            "tag": "reprojectwgs84",
            "type": "filters.reprojection"
        },
        {
            "filename": "xxx.laz",
            "inputs": [ "reprojectwgs84" ],
            "tag": "writerslas",
            "type": "writers.las"
        },
        {
            "filename": "xxx.tif",
            "gdalopts": "tiled=yes,     compress=deflate",
            "inputs": [ "writerslas" ],
            "nodata": -9999,
            "output_type": "idw",
            "resolution": 1,
            "type": "writers.gdal",
            "window_size": 6
        }
    ]
}''')

pipe = t.substitute(scaled_bbox=scaled_bbox)
print('pipeline json written ...')
pipeline = pdal.Pipeline(pipe)
print('pipeline string truned into pipeline ...')
pipeline.validate()
print('pipeline validated ...')

 # execuite pipeline
count = pipeline.execute()
print('pipeline executed ...')
S = pipeline.arrays[0]
metadata = pipeline.metadata
log = pipeline.log

print(S.shape)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-b101fd3df172>", line 2, in <module>
    subprocess.run(cmd, shell=True, capture_output=True)
  File "/opt/conda/lib/python3.8/subprocess.py", line 491, in run
    stdout, stderr = process.communicate(input, timeout=timeout)
  File "/opt/conda/lib/python3.8/subprocess.py", line 1024, in communicate
    stdout, stderr = self._communicate(input, endtime, timeout)
  File "/opt/conda/lib/python3.8/subprocess.py", line 1866, in _communicate
    ready = selector.select(timeout)
  File "/opt/conda/lib/python3.8/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/IPython/core/inte

TypeError: object of type 'NoneType' has no len()

In [None]:
scaled_bbox

In [None]:
meta

In [15]:
@delayed
def row_bounds_ept_query(i):
    # get lat lon of first entry
    row = la.iloc[i]
    ident = row['ID']
    lat = row['LATITUDE']
    lon = row['LONGITUDE']

    # make bbox in the ept coord system
    scaled_bbox = make_scaled_bbox(lat, lon, bounds=bounds_conf)
    
    # sanity check will raise erros if scalled bbox is not in the ept bounds
    assert (scaled_bbox[0][0] > bounds[0]) & (scaled_bbox[0][1] < bounds[3])
    assert (scaled_bbox[1][0] > bounds[1]) & (scaled_bbox[1][1] < bounds[4])
    
    # make and validate pipeline
    t = Template('''
    [
        {
            "type": "readers.ept",
            "filename": "https://s3-us-west-2.amazonaws.com/usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json",
            "bounds": "${scaled_bbox}",
            "resolution": 1
        }
    ]''')

    pipe = t.substitute(scaled_bbox=tbbox)
    pipeline = pdal.Pipeline(pipe)
    pipeline.validate()
    
    # execuite pipeline
    count = pipeline.execute()
    S = pipeline.arrays[0]
    metadata = pipeline.metadata
    log = pipeline.log
    
    # do stuff
    sh = S.shape
    if sh[0] > 0:
        print(f'{ident} is a numpy structured array of shape {sh}.')
        return(S)
        
results = []
for i in range(1):
    results.append(row_bounds_ept_query(i+10))
    
with ProgressBar():
    S = compute(*results)

IndentationError: unexpected indent (<ipython-input-15-5c063dddf927>, line 18)