In [2]:
import multiprocessing
import pandas as pd
import numpy as np
import scipy.stats as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.collections import PatchCollection
import dask.dataframe as dd
from dask.diagnostics import ProgressBar


# Report how many CPUs the host exposes (displayed as the cell output)
multiprocessing.cpu_count()

16

Read in the data and clean it up

In [11]:
# path to file
path = '/home/jovyan/work/UrbanForest/all_clean_LAcounty_sunset.hdf'

# columns kept from the raw table
cols = ['ID', 'LATITUDE', 'LONGITUDE', 'DBH_LO', 'DBH_HI', 'CREATED',
        'UPDATED', 'SOURCE', 'Name_matched', 'Zone']

# Build the cleaned frame in one chain. Every step returns a new object, so
# nothing is mutated in place on a column-subset of the raw frame (the
# pattern that triggers SettingWithCopyWarning).
la = (
    pd.read_hdf(path, key='data')[cols]
    # drop rows where either DBH bound is missing
    .dropna(how='any', axis=0, subset=['DBH_LO', 'DBH_HI'])
    .assign(
        # first letter upper-case, remainder lower-case
        Name_matched=lambda df: df.Name_matched.str.capitalize(),
        # convert DBH to cm (the 2.54 factor implies the raw values are inches)
        dbh_low=lambda df: 2.54 * df.DBH_LO,
        dbh_high=lambda df: 2.54 * df.DBH_HI,
        # change date fields to datetime type
        created=lambda df: pd.to_datetime(df.CREATED),
        updated=lambda df: pd.to_datetime(df.UPDATED),
    )
    # the raw columns are superseded by the derived ones above
    .drop(columns=['DBH_LO', 'DBH_HI', 'CREATED', 'UPDATED'])
)


 We will first use allometric equations from:

 McPherson, E. Gregory; van Doorn, Natalie S.; Peper, Paula J. 2016. Urban tree database.
 Fort Collins, CO: Forest Service Research Data Archive. Updated 21 January 2020.
 https://doi.org/10.2737/RDS-2016-0005

 'Apps min' and 'Apps max' give the input range (cm) within which the authors
  consider the equations reliable.
 'InlEmp' and 'SoCalC' are climate zones for which the equations differ.
  The SoCalC reference city is Santa Monica and the InlEmp reference city is Claremont;
  see Table 1, p16 for further climate zone details.
  
  After reading the equations and coefficients, we will get rid of trees that only occur a few times, and trees that we do not have equations for.

In [15]:
# The equations
def mcpherson_eqs():
    '''Build the equation forms from table 3 (p24) of McPherson 2020.

    Returns a dict keyed by equation name. Every value is a callable taking
    the same argument list ``(a, b, c, d, e, x, mse)``; arguments that a
    given form does not need are accepted and ignored. The math goes through
    numpy so the functions are vectorized over ``x``.
    '''

    def _loglog(x):
        # shared log(log(x + 1)) term of the loglogw2..4 forms
        return np.log(np.log(x + 1))

    # --- polynomial forms ---
    def lin(a, b, c, d, e, x, mse):
        return a + b * x

    def quad(a, b, c, d, e, x, mse):
        return a + b * x + c * x**2

    def cub(a, b, c, d, e, x, mse):
        return a + b * x + c * x**2 + d * x**3

    def quart(a, b, c, d, e, x, mse):
        return a + b * x + c * x**2 + d * x**3 + e * x**4

    # --- log-log forms ---
    # NOTE(review): in loglogw1 the mse/2 term sits inside the outer log,
    # unlike loglogw2..4 -- confirm against the published table.
    def loglogw1(a, b, c, d, e, x, mse):
        inner = np.log(x + 1) + (mse / 2)
        return np.exp(a + b * np.log(inner))

    def loglogw2(a, b, c, d, e, x, mse):
        tail = np.sqrt(x) + (mse / 2)
        return np.exp(a + b * _loglog(x) + tail)

    def loglogw3(a, b, c, d, e, x, mse):
        return np.exp(a + b * _loglog(x) + x + (mse / 2))

    def loglogw4(a, b, c, d, e, x, mse):
        return np.exp(a + b * _loglog(x) + x**2 + (mse / 2))

    # --- exponential forms ---
    def expow1(a, b, c, d, e, x, mse):
        return np.exp(a + b * x + (mse / 2))

    def expow2(a, b, c, d, e, x, mse):
        return np.exp(a + b * x + np.sqrt(x) + (mse / 2))

    def expow3(a, b, c, d, e, x, mse):
        return np.exp(a + b * x + x + (mse / 2))

    def expow4(a, b, c, d, e, x, mse):
        return np.exp(a + b * x + x**2 + (mse / 2))

    forms = (lin, quad, cub, quart,
             loglogw1, loglogw2, loglogw3, loglogw4,
             expow1, expow2, expow3, expow4)

    # each function is registered under its own name
    return {form.__name__: form for form in forms}

eq_dict = mcpherson_eqs()

# The coefficients
# NOTE(review): the '.csvx' extension and the trailing space in
# 'Predicts component ' are kept exactly as written -- presumably they match
# the file on disk and its header; verify.
coef_df = pd.read_csv('TS6_Growth_coefficients.csvx',
                      usecols=['Region', 'Scientific Name', 'Independent variable',
                               'Predicts component ', 'EqName',
                               'Units of predicted components',
                               'a', 'b', 'c', 'd', 'e', 'Apps min', 'Apps max'])

# Find all the trees with over 100 occurrences in the dataset
trees = la.Name_matched.value_counts()
trees = list(trees[trees > 100].index)

# drop trees we do not have equations for; the species lookup is built once
# instead of being recomputed for every candidate name
known_species = set(coef_df['Scientific Name'].unique())
trees = [s for s in trees if s in known_species]
la = la.loc[la.Name_matched.isin(trees)]

la.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 664155 entries, 0 to 1089845
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ID            664155 non-null  int64         
 1   LATITUDE      664155 non-null  float64       
 2   LONGITUDE     664155 non-null  float64       
 3   SOURCE        664155 non-null  object        
 4   Name_matched  664155 non-null  object        
 5   Zone          663384 non-null  float64       
 6   dbh_low       664155 non-null  float64       
 7   dbh_high      664155 non-null  float64       
 8   created       28472 non-null   datetime64[ns]
 9   updated       28472 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(5), int64(1), object(2)
memory usage: 55.7+ MB
