In [2]:
import multiprocessing
import pandas as pd
import numpy as np
import scipy.stats as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.collections import PatchCollection
import dask.dataframe as dd
from dask.diagnostics import ProgressBar


multiprocessing.cpu_count()

16

Read in the data and clean it up

In [11]:
# path to file
path = '/home/jovyan/work/UrbanForest/all_clean_LAcounty_sunset.hdf'

# read the hdf
la = pd.read_hdf(path, key='data')

# select desired columns
cols=['ID', 'LATITUDE', 'LONGITUDE', 'DBH_LO', 'DBH_HI', 'CREATED',
      'UPDATED', 'SOURCE', 'Name_matched', 'Zone']
la = la[cols]

# drop NAs
la.dropna(how='any', axis=0, subset=['DBH_LO', 'DBH_HI'], inplace=True)

# capitalize genus names
la['Name_matched'] = la.Name_matched.str.capitalize()

# convert DBH to cm
la['dbh_low']  = 2.54 * la.DBH_LO
la['dbh_high'] = 2.54 * la.DBH_HI
la.drop(['DBH_LO', 'DBH_HI'], axis=1, inplace=True)

# Change date fields to dateTime type
la['created'] = pd.to_datetime(la.CREATED)
la['updated'] = pd.to_datetime(la.UPDATED)
la.drop(['CREATED', 'UPDATED'], axis=1, inplace=True)


 We will first use allometric equations from :

 McPherson, E. Gregory; van Doorn, Natalie S.; Peper, Paula J. 2016. Urban tree database.
 Fort Collins, CO: Forest Service Research Data Archive. Updated 21 January 2020.
 https://doi.org/10.2737/RDS-2016-0005

 'Apps min' and 'Apps max' give the input range (cm) that the authors feel 
  that the equations are reliable
 'InlEmp' and 'SoCalC' are Climate zones where the eqs are different.
  SoCalC reference city is Santa Monica, InlEmp is Claremont,
  see Table 1, p16 for further Climate zone details.  
  
  After reading the equations and coefficients, we will get rid of trees that only occur a few times, and trees that we o not have equations for.

In [15]:
# The equations
def mcpherson_eqs():
    '''returns dict of equations from table 3 (p24) of McPherson 2020
    functions use np so as to be vectorized'''

    eq_dict = {'lin'        : (lambda a, b, c, d, e, x, mse: a + b * (x)), 
                'quad'      : (lambda a, b, c, d, e, x, mse: a + b * x + c * x**2),
                'cub'      : (lambda a, b, c, d, e, x, mse: a + b * x + c * x**2 + d * x**3),
                'quart'     : (lambda a, b, c, d, e, x, mse:a + b * x + c *x**2 + d * x**3 + e * x**4), 
                'loglogw1' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1) + (mse/2)))),
                'loglogw2' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1)) + (np.sqrt(x) + (mse/2)))),
                'loglogw3' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1)) + (x) + (mse/2))),
                'loglogw4' : (lambda a, b, c, d, e, x, mse: np.exp(a + b * np.log(np.log(x + 1)) + (x**2) + (mse/2))),
                'expow1'    : (lambda a, b, c, d, e, x, mse: np.exp(a+ b * (x) + (mse/2))),
                'expow2'    : (lambda a, b, c, d, e, x, mse: np.exp(a + b * (x) + np.sqrt(x) + (mse/2))),
                'expow3'    : (lambda a, b, c, d, e, x, mse: np.exp(a + b * (x) + (x) + (mse/2))),
                'expow4'    : (lambda a, b, c, d, e, x, mse: np.exp(a + b * (x) + (x**2) + (mse/2)))}

    return(eq_dict)

eq_dict = mcpherson_eqs()

# The cooeficients
coef_df = pd.read_csv('TS6_Growth_coefficients.csvx',
usecols=['Region', 'Scientific Name', 'Independent variable', 'Predicts component ', 'EqName', 'Units of predicted components',
'EqName', 'a', 'b', 'c', 'd', 'e', 'Apps min', 'Apps max'])

# Find all the trees with over 100 occurances in the dataset
trees = la.Name_matched.value_counts()
trees = list(trees.where(trees > 100).dropna().index)

# drop trees we do not have equations for
trees = [s for s in trees if s in coef_df['Scientific Name'].unique()]
la = la.loc[la.Name_matched.isin(trees)]

la.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 664155 entries, 0 to 1089845
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ID            664155 non-null  int64         
 1   LATITUDE      664155 non-null  float64       
 2   LONGITUDE     664155 non-null  float64       
 3   SOURCE        664155 non-null  object        
 4   Name_matched  664155 non-null  object        
 5   Zone          663384 non-null  float64       
 6   dbh_low       664155 non-null  float64       
 7   dbh_high      664155 non-null  float64       
 8   created       28472 non-null   datetime64[ns]
 9   updated       28472 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(5), int64(1), object(2)
memory usage: 55.7+ MB


The USGS lidar data is hosted on amazon, so we will nned the AWS client to access it.

In [22]:
!curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
!unzip awscli-bundle.zip 
!./awscli-bundle/install -b ~/bin/aws

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 16.8M  100 16.8M    0     0  15.3M      0  0:00:01  0:00:01 --:--:-- 15.3M
Archive:  awscli-bundle.zip
  inflating: awscli-bundle/install   
  inflating: awscli-bundle/packages/rsa-3.4.2.tar.gz  
  inflating: awscli-bundle/packages/docutils-0.15.2.tar.gz  
  inflating: awscli-bundle/packages/futures-3.3.0.tar.gz  
  inflating: awscli-bundle/packages/PyYAML-5.2.tar.gz  
  inflating: awscli-bundle/packages/colorama-0.4.1.tar.gz  
  inflating: awscli-bundle/packages/urllib3-1.25.7.tar.gz  
  inflating: awscli-bundle/packages/six-1.15.0.tar.gz  
  inflating: awscli-bundle/packages/botocore-1.19.57.tar.gz  
  inflating: awscli-bundle/packages/jmespath-0.10.0.tar.gz  
  inflating: awscli-bundle/packages/virtualenv-16.7.8.tar.gz  
  inflating: awscli-bundle/packages/colorama-0.4.3.tar.gz  
  inflating: awscli-bundle/packages/awscli-1

Let's make a tmp directory too

In [28]:
# make a tmp directory
! mkdir /home/jovyan/work/tmp

# make a variable for its path
tmp = '/home/jovyan/work/tmp'

# make a variable with the path to aws cli
aws = '/home/jovyan/bin/aws'

Download the top level ept json for the ```USGS_LPC_CA_LosAngeles_2016_LAS_2018``` dataset.

In [29]:
import subprocess
cmd = f'{aws} s3 cp s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json {tmp} --no-sign-request'
subprocess.run(cmd, shell=True, capture_output=True)

CompletedProcess(args='/home/jovyan/bin/aws s3 cp s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json /home/jovyan/work/tmp --no-sign-request', returncode=0, stdout=b'Completed 2.4 KiB/2.4 KiB (8.9 KiB/s) with 1 file(s) remaining\rdownload: s3://usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018/ept.json to ../tmp/ept.json\n', stderr=b'')

We will load ```ept.json``` and extract usefull information

In [32]:
import json
with open(f'{tmp}/ept.json') as f:
    meta = json.load(f)

In [53]:
bounds = meta['bounds']
srs    = meta['srs']
span   = meta['span']
schema  = meta['schema']
srs

{'authority': 'EPSG',
 'horizontal': '3857',
 'wkt': 'PROJCS["WGS 84 / Pseudo-Mercator",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Mercator_1SP"],PARAMETER["central_meridian",0],PARAMETER["scale_factor",1],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["X",EAST],AXIS["Y",NORTH],EXTENSION["PROJ4","+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs"],AUTHORITY["EPSG","3857"]]'}

The above output tells us the data is in EPSG:3857.  There is only a horizontal code present.  Lets reduce the srs to a more sueful form for later

In [54]:
srs = meta['srs']['authority'] + ':' + meta['srs']['horizontal']

Because the srs is EPSG:3857 (Pseudo-Mercator) I don't think it necesaary to reproject.  We do need to use the scale and offset values to find absolute position according to ``read_value * scale + offset```

In [77]:
def bag_scale_offset(name, schema):
    '''Retruns scale and offset for the spatial dimension given by name'''
    for thing in schema:
        if thing['name'] == name:
            return(thing['scale'], thing['offset'])
        
x_scale, x_offset = bag_scale_offset('X', schema)
y_scale, y_offset = bag_scale_offset('Y', schema)
z_scale, z_offset = bag_scale_offset('Z', schema)

def rescale(lon, lat, elev=None):
    '''Returns point rescaled to the ept coords'''
    x = lon * x_scale + x_offset
    y = lat * y_scale + y_offset
    if elev:
        z = elev * z_scale + z_offset
        return(x, y, z)
    return(x, y)

In [51]:
la.head()

Unnamed: 0,ID,LATITUDE,LONGITUDE,SOURCE,Name_matched,Zone,dbh_low,dbh_high,created,updated
0,712088,34.086805,-118.384081,A Plus,Schinus molle,22.0,7.62,15.24,2018-09-19,2018-09-19
4,712126,34.086579,-118.383939,A Plus,Olea europaea,22.0,30.48,45.72,2018-09-19,2018-09-19
5,712134,34.086542,-118.383912,A Plus,Olea europaea,22.0,30.48,45.72,2018-09-19,2018-09-19
8,712154,34.086874,-118.383809,A Plus,Olea europaea,22.0,30.48,45.72,2018-09-19,2018-09-19
11,712164,34.086845,-118.383785,A Plus,Olea europaea,22.0,30.48,45.72,2018-09-19,2018-09-19


Now it should be possible to define a bounding box around a tree to query the pointcloud at that local.

In [78]:
# For now we will ad 0.00007 degrees in each direction, this is jus a guess based on 5th decimal place ~ 1.1m
# also not setting z max and min for the moment
def make_scaled_bbox(lat, lon):
    buf = 0.00007
    xmin = lon - buf
    ymin = lat - buf
    xmax = lon + buf
    ymax = lat + buf
    xmin, ymin = rescale(xmin, ymin)
    xmax, ymax = rescale(xmax, ymax)
    return([[xmin, xmax], [ymin, ymax]])

def make_bbox(lat, lon):
    buf = 0.00007
    xmin = lon - buf
    ymin = lat - buf
    xmax = lon + buf
    ymax = lat + buf
    return([[xmin, xmax], [ymin, ymax]])

    

We will performa quick sanity check using the first entry of the LA data

In [88]:
# get lat lon of first entry
row = la.loc[la.ID==712088]
lat = row['LATITUDE'].values[0]
lon = row['LONGITUDE'].values[0]
print(f'lat is {lat}\nlon is {lon}')

# make bbox in EPSG:3857
bbox = make_bbox(lat, lon)
print(f'bbox is {bbox}')

# make bbox in the ept coord system
scaled_bbox = make_scaled_bbox(lat, lon)
print(f'scaled bbox is {scaled_bbox}')

# print the bounds of the ept
print(f'ept bounds are {bounds}')

# sanity check will raise erros if scalled bbox is not in the ept bounds
assert (scaled_bbox[0][0] > bounds[0]) & (scaled_bbox[0][1] < bounds[3])
assert (scaled_bbox[1][0] > bounds[1]) & (scaled_bbox[1][1] < bounds[4])
print('Seems fine.')

lat is 34.086805413050996
lon is -118.38408051393
bbox is [[-118.38415051393, -118.38401051393001], [34.086735413050995, 34.086875413051]]
scaled bbox is [[-13168196.183841506, -13168196.183840105], [4037456.340867354, 4037456.3408687543]]
ept bounds are [-13271559, 3934093, -102275, -13064831, 4140821, 104453]
Seems fine.


In [None]:
!conda install -c --yes conda-forge python-pdal 

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.2
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - python-pdal


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.12.5  |       ha878542_0         137 KB  conda-forge
    certifi-2020.12.5          |   py37h89c1867_1         143 KB  conda-forge
    libgcc-ng-9.3.0            |      h2828fa1_18         7.8 MB  conda-forge
    libstdcxx-ng-9.3.0         |      h6de172a_18         4.0 MB  conda-forge
    openssl-1.1.1i             |       h7f98852_0         2.1 MB  conda-forge
    python-pdal-2.3.7          |   py37h2527ec5_0         262 KB  conda-forge
    ------------------------------------------------------------
                  

Now lets try to get the point cloud within the bbox using PDALs ept reader

In [95]:
import pdal
from string import Template

t =  Template('''
    {
        "pipeline": [
        {
        "bounds": "(${scaled_bbox})",
        "filename": "https://s3-us-west-2.amazonaws.com/usgs-lidar-public/USGS_LPC_CA_LosAngeles_2016_LAS_2018",
        "type": "readers.ept",
        "tag": "readdata"
        }
        ]
    }''')



pipe = t.substitute(scaled_bbox=scaled_bbox)

pipeline = pdal.Pipeline(pipe)
pipeline.validate()
count = pipeline.execute()
S = pipeline.arrays[0]
metadata = pipeline.metadata
log = pipeline.log
sh = S.shape
print(f'S is a numpy structured array of shape {sh}.')

ModuleNotFoundError: No module named 'pdal'