In [None]:
import geopandas as gpd
from geopy.distance import great_circle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from shapely.geometry import MultiPoint, Point
from sklearn.cluster import DBSCAN
import warnings

# Set default settings.
%matplotlib inline
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [None]:
# Street tree records are fetched from the data.cityofnewyork.us download URL on every run.
trees_csv_url = 'https://data.cityofnewyork.us/api/views/uvpi-gqnh/rows.csv?accessType=DOWNLOAD'
df = pd.read_csv(trees_csv_url)

# Geographic data on New York City is read from local shapefiles.
boroughs_shapefile = './data/data_raw/nyc_boroughs/nyc_geo.shp'
streets_shapefile = './data/data_raw/nyc_streets/geo_export_f311eef5-5969-43eb-a781-39fa54d09b70.shp'
nyc = gpd.read_file(boroughs_shapefile)
nyc_streets = gpd.read_file(streets_shapefile)

In [None]:
# Show the first record to get a feel for the available columns.
df.iloc[:1]

In [None]:
# Lazy EDA: profile the columns from 'tree_id' through 'bbl' (label slice, both ends included).
profiled_columns = df.loc[:, 'tree_id':'bbl']
prfl = ProfileReport(profiled_columns, title='Fast EDA', explorative=True)
prfl.to_notebook_iframe()

In [None]:
# Keep a standalone HTML copy of the profiling report.
report_path = 'eda_trees_report.html'
prfl.to_file(report_path)

# Inspecting specific columns

### Tree diameter

In [None]:
# Summary statistics of the tree diameter column.
df.loc[:, 'tree_dbh'].describe()

In [None]:
# Histogram of tree diameters: 450 bins over the full value range, with the x-axis cut off at 100.
diameter_axes = df['tree_dbh'].plot.hist(bins=450,
                                         figsize=(10, 6),
                                         title='Diameter at breast height of tree',
                                         xlim=(0, 100))
diameter_axes.set_xlabel('Diameter')

### Tree species

In [None]:
# Summary of the Latin species names (count, unique values, most frequent entry).
df.loc[:, 'spc_latin'].describe()

In [None]:
# Is the species missing for at least one tree?
df['spc_latin'].isnull().any()

In [None]:
# Number of trees without a species entry.
int(df['spc_latin'].isna().sum())

In [None]:
# Number of trees without a health entry.
int(df['health'].isna().sum())

In [None]:
# All records whose species entry is missing.
species_is_missing = df['spc_latin'].isna()
df[species_is_missing]

In [None]:
# Records that lack a species entry but still carry a health rating.
df.loc[df['spc_latin'].isna() & df['health'].notna()]

In [None]:
# Finding: Trees with a missing species value are almost all dead trees or stumps, with only a few (n = 5) exceptions.

# Inspecting missing values

In [None]:
# Replace NaN values in features with entries signalling this tree is either dead or a stump.
# The filled columns are assigned back to df instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form acts on an intermediate object, which pandas warns about and
# which leaves df untouched once copy-on-write is active.
# NOTE(review): newer pandas releases appear to parse the literal string 'None' as NaN in read_csv by
# default; if that applies to this environment, 'None' answers would be labelled 'Dead|Stump' here
# instead of being mapped to 'Alive' below — confirm against the installed pandas version.
columns_missing_for_dead_trees = ['steward', 'guards', 'sidewalk']
df[columns_missing_for_dead_trees] = df[columns_missing_for_dead_trees].fillna('Dead|Stump')

# In 'steward', collapse the spectrum of answers into the single label 'Alive'; the 'Dead|Stump'
# filler set above is left as it is.
df['steward'] = df['steward'].replace(['1or2', '3or4', '4orMore', 'None'], 'Alive')

# Replace NaN values in the target with entries signalling this tree is either dead or a stump.
df['health'] = df['health'].fillna('Dead|Stump')

# Geographic analysis

In [None]:
# Bring geographic data on boroughs and street trees in NYC into a common format (EPSG:4326).
# points_from_xy creates all point geometries in one vectorised call instead of one Point per row, and
# the CRS is passed as an authority string because the {'init': ...} dictionary form is deprecated.
trees = gpd.GeoDataFrame(df,
                         geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
                         crs='EPSG:4326')

# to_crs() already records the target CRS on the frame it returns, so no manual override is needed.
nyc = nyc.to_crs(epsg=4326)

In [None]:
# Count the trees of every borough and attach the counts to the borough polygons.
trees_grouped = trees.groupby('borough')['tree_id'].count()
trees_per_borough = nyc.merge(trees_grouped, left_on='bname', right_index=True)

# Slim table of borough name and tree count, ordered from fewest to most trees.
tree_counts_borough = trees_per_borough.loc[:, ['bname', 'tree_id']].sort_values(by='tree_id')

# Question: Which borough of New York has the most street trees?

In [None]:
# One figure with two panels: a choropleth map on the left and a bar chart on the right.
fig, axs = plt.subplots(ncols=2, figsize=(15,5))
fig.suptitle('Count of street trees per borough in New York', fontsize=18)
fig.subplots_adjust(wspace=0.3)
map_ax, bar_ax = axs

# Left panel: choropleth of the count of street trees per NYC borough, drawn without axes.
fig1 = trees_per_borough.plot(ax=map_ax, column='tree_id', alpha=0.7, cmap='Greens',
                              edgecolor='k', linewidth=0.3)
map_ax.axis('off')

# Write each borough's name at the centroid of its polygon.
for borough_name, borough_shape in zip(trees_per_borough['bname'], trees_per_borough['geometry']):
    label_position = borough_shape.centroid
    map_ax.text(label_position.x,
                label_position.y,
                borough_name,
                fontsize=13,
                bbox=dict(boxstyle='square, pad=0.3', edgecolor='k', facecolor='white'))

# Right panel: horizontal bar chart of the number of trees per NYC borough.
fig2 = tree_counts_borough['tree_id'].plot.barh(ax=bar_ax, color='green', width=0.6,
                                                edgecolor='k', linewidth=0.6)
fig2.set_yticklabels(tree_counts_borough['bname'])
fig2.spines['left'].set_linewidth(3)
fig2.tick_params(axis='both', which='major', labelsize=13)

# Question: Which borough has the most diverse street tree flora?

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(15,5))

fig.suptitle('Count of street tree species per borough in New York', fontsize=18)

fig.subplots_adjust(wspace=0.3)

# Number of trees for every (borough, species) combination that occurs in the data.
# Only 'tree_id' is counted, so the remaining columns are not aggregated needlessly.
trees_species_grouped = trees.groupby(['borough', 'spc_common'])[['tree_id']] \
                             .count() \
                             .sort_values(['borough', 'tree_id'],
                                          ascending=False)

# Determine the number of street tree species per borough: one row per species remains for every
# borough, so the group size is the species count. The boroughs are taken from the data rather than
# from a hard-coded list, which would raise a KeyError as soon as one of them is absent.
# The index name is removed so that the bar chart below does not pick it up as an axis label.
tree_species_count_per_borough = trees_species_grouped.groupby(level='borough') \
                                                      .size() \
                                                      .sort_values() \
                                                      .rename('species_count') \
                                                      .rename_axis(None)

trees_species_grouped_per_borough = nyc.merge(tree_species_count_per_borough,
                                              left_on='bname',
                                              right_index=True)

# Plot choropleth of count of street tree species of each NYC borough.
fig3 = trees_species_grouped_per_borough.plot(ax=axs[0],
                                              column='species_count',
                                              alpha=0.7,
                                              cmap='summer_r',
                                              edgecolor='k',
                                              linewidth=0.3)
axs[0].axis('off')

# Add names of NYC Boroughs to the center of each polygon. The frame plotted in this cell is used,
# so the labels do not depend on a variable created by another cell.
for borough in trees_species_grouped_per_borough.itertuples(index=True, name='bname'):
    axs[0].text(borough.geometry.centroid.x,
                borough.geometry.centroid.y,
                borough.bname,
                fontsize=13,
                bbox={
                  'boxstyle': 'square, pad=0.3',
                  'edgecolor': 'k',
                  'facecolor': 'white'
                  })

# Create a horizontal barchart for the number of street tree species per borough in NYC.
fig4 = tree_species_count_per_borough.plot(ax=axs[1],
                                           kind='barh',
                                           color='lightskyblue',
                                           width=0.6,
                                           edgecolor='k',
                                           linewidth=0.6)

# Set other subplot settings.
fig4.set_yticklabels(tree_species_count_per_borough.index, fontdict={'fontsize': 13}, rotation=360)
fig4.spines['left'].set_linewidth(3)
fig4.tick_params(axis='both', which='major', labelsize=13)

# Question: In which condition are most of New York's street trees?

In [None]:
# Count the street trees in every health condition, most frequent condition first.
trees_health_count = trees.groupby('health')['tree_id'].count().sort_values(ascending=False)

# Count the street trees per borough whose condition is 'Fair', 'Dead|Stump' or 'Poor'.
unhealthy_conditions = ['Fair', 'Dead|Stump', 'Poor']
trees_grouped_unhealthy = trees.loc[trees['health'].isin(unhealthy_conditions)] \
                               .groupby('borough')['tree_id'] \
                               .count() \
                               .sort_values()

fig, axs = plt.subplots(ncols=2, figsize=(15,5))
fig.subplots_adjust(wspace=0.2)

# Left panel: count of street trees in their respective health condition.
fig5 = trees_health_count.plot.bar(ax=axs[0], color='navy', width=0.6, edgecolor='k', linewidth=0.6)
fig5.set_ylabel('Number of trees', fontdict={'fontsize': 13}, labelpad=20)

# Right panel: count of trees in an unhealthy condition per borough.
fig6 = trees_grouped_unhealthy.plot.bar(ax=axs[1], color='grey', width=0.6, edgecolor='k', linewidth=0.6)
fig6.set_ylabel('')

# Both panels share the remaining settings: title, horizontal tick labels, no x-label, bold baseline.
panels = ((fig5, 'Health condition of street trees in New York', trees_health_count.index),
          (fig6, 'Count of unhealthy street trees per borough', trees_grouped_unhealthy.index))
for panel, panel_title, tick_labels in panels:
    panel.set_title(panel_title, fontsize=18, pad=20)
    panel.set_xticklabels(tick_labels, fontdict={'fontsize': 13}, rotation=360)
    panel.set_xlabel('')
    panel.spines['bottom'].set_linewidth(3)
    panel.tick_params(axis='both', which='major', labelsize=13)

# Question: Which street trees might need to be removed?

In [None]:
# Filter for dead / stump trees.
dead_trees = trees.loc[trees['health'] == 'Dead|Stump']
dead_trees_locations = dead_trees[['latitude', 'longitude']].values

# Find clusters of dead / stump street trees in New York.
# The coordinates are converted to radians for the haversine metric, so eps has to be an angle as well:
# a distance of 0.5 divided by 6371.0 (presumably kilometres and the earth's mean radius in kilometres).
distance_earth_radius = 0.5 / 6371.0
db = DBSCAN(eps=distance_earth_radius,
            min_samples=1,
            algorithm='ball_tree',
            metric='haversine',
            n_jobs=-1).fit(np.radians(dead_trees_locations))

# Number of distinct labels. With min_samples=1 every point is a core point, so no noise label (-1) is
# expected; if min_samples is raised, this count would include the noise label.
n_clusters = len(set(db.labels_))

# Collect the members of every cluster by iterating over the labels that actually occur and skipping
# the noise label, rather than assuming the labels are exactly 0 .. n_clusters - 1. That assumption
# would produce an empty cluster as soon as a noise label exists.
cluster_labels = [label for label in np.unique(db.labels_) if label != -1]
clusters = pd.Series([dead_trees_locations[db.labels_ == label] for label in cluster_labels])

def get_centermost_point(cluster):
    """
    Finds the member of a cluster that lies closest to the centroid of that cluster.

    Method adapted from author: https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/.

    :param cluster: Non-empty sequence of coordinate pairs in WGS 84, e.g. a 2-D numpy array with one
        row per point. The pairs are handed to geopy's great_circle unchanged, so they are expected in
        the order that function reads tuples in (latitude, longitude).

    :return: tuple: The coordinates of the centermost point, in the same order as the input pairs.

    :raises ValueError: If the cluster contains no points.
    """

    if len(cluster) == 0:
        raise ValueError('cluster must contain at least one point')

    # Build the geometry once and read both centroid coordinates from it.
    cluster_centroid = MultiPoint(cluster).centroid
    centroid = (cluster_centroid.x, cluster_centroid.y)

    # The centermost point is the member with the smallest great-circle distance to the centroid.
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)

    return tuple(centermost_point)

# Representative point of every cluster: the member closest to the cluster centroid.
clusters_centers = clusters.map(get_centermost_point)

# Plot clusters on a single, axis-free canvas.
fig, ax = plt.subplots(figsize=(10, 7))
fig.canvas.header_visible = False
ax.axis('off')
ax.set_title('{} dead / stump street tree clusters in New York'.format(n_clusters),
             fontdict={'fontsize': 25})

# Boolean mask that is True for every sample DBSCAN marked as a core sample.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# One colour per label, spread evenly over the tab20 colour map; the noise label (-1) is drawn in black.
colorspectrum = [plt.cm.tab20(each) for each in np.linspace(0, 1, n_clusters)]
for k, col in zip(set(db.labels_), colorspectrum):
    if k == -1:
        col = [0, 0, 0, 1]

    class_member_mask = (db.labels_ == k)

    # Core samples get the larger marker, the remaining members the smaller one.
    # The location columns are (latitude, longitude), hence x = column 1 and y = column 0.
    for is_core, marker_size in ((True, 2), (False, 1)):
        xy = dead_trees_locations[class_member_mask & (core_samples_mask == is_core)]
        ax.plot(xy[:, 1], xy[:, 0], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=marker_size)

# Plot boroughs of NYC for context as well, each labelled at the centroid of its polygon.
nyc.plot(alpha=0.2, ax=ax, color='grey', edgecolor='k')
for borough_name, borough_shape in zip(nyc['bname'], nyc['geometry']):
    label_position = borough_shape.centroid
    ax.text(label_position.x,
            label_position.y,
            borough_name,
            bbox={'boxstyle': 'square, pad=0.3',
                  'edgecolor': 'k',
                  'facecolor': 'white'})

# Prepare dataset for next step

In [None]:
# Remove irrelevant columns.
# drop() returns a new frame that is bound to df again, and errors='ignore' skips columns that are no
# longer present, so re-running this cell does not fail with a KeyError.
irrelevant_columns = ['block_id',
                      'created_at',
                      'tree_dbh',
                      'stump_diam',
                      'curb_loc',
                      'spc_latin',
                      'spc_common',
                      'user_type',
                      'problems',
                      'address',
                      'postcode',
                      'zip_city',
                      'community board',
                      'borocode',
                      'borough',
                      'cncldist',
                      'st_assem',
                      'st_senate',
                      'nta',
                      'nta_name',
                      'boro_ct',
                      'state',
                      'x_sp',
                      'y_sp',
                      'council district',
                      'census tract',
                      'bin',
                      'bbl']
df = df.drop(columns=irrelevant_columns, errors='ignore')

In [None]:
# Store the reduced dataset as a gzip-compressed CSV file without the index column.
output_path = './data/data_eda/data.csv.gz'
df.to_csv(output_path, index=False, compression='gzip')