In [1]:
# Import third-party packages.
import geopandas as gpd
from geopy.distance import great_circle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from shapely.geometry import MultiPoint, Point
from sklearn.cluster import DBSCAN

# Render matplotlib figures as interactive widgets (ipympl backend).
%matplotlib widget

# Change display settings for dataframes: passing None removes the column limit,
# so wide frames are shown in full instead of being truncated.
pd.set_option('display.max_columns', None)

  import pandas.util.testing as tm


In [2]:
# Read in data on street trees.
# NOTE(review): 'Unnamed: 0' looks like an index column left over from an earlier
# export — confirm; it is not used anywhere below and is therefore dropped.
df_2015 = pd.read_csv('./data/data_raw/nyc_trees/nyc_tree_census_2015.csv.gz', compression='gzip')
df_2015 = df_2015.drop(columns='Unnamed: 0')
df = df_2015.copy()

# Read in geographic data on New York City.
nyc = gpd.read_file('./data/data_raw/nyc_boroughs/nyc_geo.shp')
nyc_streets = gpd.read_file('./data/data_raw/nyc_streets/geo_export_f311eef5-5969-43eb-a781-39fa54d09b70.shp')

# Bring geographic data on street trees into a new format: one point per tree,
# built from its longitude (x) and latitude (y). The CRS is given as an authority
# string because the "{'init': 'epsg:4326'}" dictionary form is deprecated and
# triggers a warning on every assignment.
trees = gpd.GeoDataFrame(df,
                         geometry=[Point(coordinates) for coordinates in zip(df['longitude'], df['latitude'])],
                         crs='EPSG:4326')

# Reproject the borough polygons into the CRS of the trees. `to_crs` already tags
# the result with the target CRS, so no manual `.crs` assignment is needed.
nyc = nyc.to_crs(epsg=4326)

  return _prepare_from_string(" ".join(pjargs))


In [None]:
# Show a single row to get a feel for the available columns.
df.head(n=1)

In [None]:
# Lazy EDA: build an automated profile of the whole frame and embed it in the notebook.
prfl = ProfileReport(
    df,
    explorative=True,
    title="Fast EDA",
)
prfl.to_notebook_iframe()

Finding: The dataset contains many variables that might not have a large influence on the target.

# Inspecting specific columns

### Tree diameter

In [None]:
# Summary statistics of the tree diameter column.
df.loc[:, 'tree_dbh'].describe()

In [None]:
# Histogram of tree diameters; the x-axis is cut off at 100 to keep the bulk of the data readable.
dbh_hist_ax = df['tree_dbh'].plot.hist(bins=450,
                                       title='Diameter at breast height of tree',
                                       xlim=(0, 100))
dbh_hist_ax.set_xlabel('Diameter')

### Tree species

In [None]:
# Summary of the Latin species names (count, unique values, most frequent entry).
df.loc[:, 'spc_latin'].describe()

In [None]:
# Is the species missing for at least one tree?
df['spc_latin'].isnull().any()

In [None]:
# Number of trees without a species entry.
len(df[df['spc_latin'].isna()])

In [None]:
# Number of trees without a health entry.
len(df[df['health'].isna()])

In [None]:
# Show the trees whose species is missing.
species_missing = df['spc_latin'].isna()
df.loc[species_missing]

In [None]:
# Show the trees whose species is missing although a health value was recorded.
df.loc[df['spc_latin'].isna() & df['health'].notna()]

In [None]:
# Finding: Trees with a missing species value are mostly dead trees or stumps, with few (n = 5) exceptions.

# Inspecting missing values

In [3]:
# Label written wherever a missing value signals that the tree is dead or a stump.
DEAD_OR_STUMP = 'Dead|Stump'

# The geographic cells below filter `trees` on health == 'Dead|Stump', so the
# GeoDataFrame built from `df` gets the same treatment explicitly instead of
# relying on both frames sharing memory.
for frame in (df, trees):
    # Replace NaN values in the features and in the target ('health') with
    # entries signalling this tree is either dead or a stump. The result is
    # assigned back to the column: calling fillna(inplace=True) on a selected
    # column is a chained in-place update, which pandas does not guarantee to
    # write through to the frame (and never does with Copy-on-Write enabled).
    for column in ('steward', 'guards', 'sidewalk', 'health'):
        frame[column] = frame[column].fillna(DEAD_OR_STUMP)

    # In 'steward', replace spectrum of answers to yes (= alive) or no (= dead/stump).
    # NOTE(review): recent pandas versions may parse the literal text 'None' as
    # NaN while reading the CSV; if so, those answers were already turned into
    # 'Dead|Stump' above — confirm against the loader.
    frame['steward'] = frame['steward'].replace(['1or2', '3or4', '4orMore', 'None'], 'Alive')

# Geographic analysis

In [None]:
# Count the trees in every borough and attach the counts to the borough polygons.
trees_grouped = trees.groupby('borough')['tree_id'].count()
trees_per_borough = nyc.merge(trees_grouped, right_index=True, left_on='bname')

# Plain table of borough name and tree count, smallest count first.
tree_counts_borough = trees_per_borough[['bname', 'tree_id']].sort_values(by='tree_id')

# Question: How many street trees exist in New York?

In [None]:
# Horizontal bar chart of the tree count per borough.
ax = tree_counts_borough.plot.barh(color='green')
ax.set(title='Number of street trees in New York per borough',
       xlabel='Number of trees')
# The bars are positioned by the frame's index, so label them with the borough names.
ax.set_yticklabels(tree_counts_borough['bname'])
ax.get_legend().remove()

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
ax.axis('off')
ax.set_title('Trees per borough in New York',
             fontdict={'fontsize': 25})

# Plot trees per borough: the fill colour encodes the number of trees.
trees_per_borough.plot(column='tree_id', alpha=0.7, ax=ax, cmap='Greens', edgecolor='k')

# Label every borough at the centroid of its polygon.
for borough in trees_per_borough.itertuples(index=True, name='bname'):
    centroid = borough.geometry.centroid
    ax.text(centroid.x,
            centroid.y,
            borough.bname,
            bbox=dict(boxstyle='square, pad=0.3',
                      edgecolor='k',
                      facecolor='white'))

# Plot all trees additionally.
trees.plot(ax=ax, alpha=0.2, color='k', markersize=0.1)

# Add a legend. The colour scale is tied to the range of the plotted counts;
# a mappable without a norm would label the bar from 0 to 1 instead of with
# tree counts. `ax` is passed explicitly because a stand-alone mappable does
# not belong to any axes the figure could take the space from.
count_norm = plt.Normalize(vmin=trees_per_borough['tree_id'].min(),
                           vmax=trees_per_borough['tree_id'].max())
legend_bar = fig.colorbar(plt.cm.ScalarMappable(norm=count_norm, cmap='Greens'),
                          ax=ax, fraction=0.03, pad=0.05)

# Question: Which street trees exist?

In [None]:
# Count the trees of every species within each borough.
trees_species_grouped = (trees.groupby(['borough', 'spc_common'])[['tree_id']]
                         .count()
                         .sort_values(['borough', 'tree_id'], ascending=False))

# Find the count of tree species per borough. The grouped frame holds one row
# per (borough, species) pair, so the number of rows per borough is its species
# count; this works for whichever boroughs occur in the data.
tree_species_count_per_borough = (trees_species_grouped
                                  .groupby(level='borough')
                                  .size()
                                  .sort_values()
                                  .rename('species_count')
                                  .rename_axis(None))
trees_species_grouped_per_borough = nyc.merge(tree_species_count_per_borough, left_on='bname', right_index=True)

# Draw on a fresh figure: with an interactive backend a Series plot would
# otherwise land on the axes of a figure that is still open.
fig, ax = plt.subplots()
tree_species_count_per_borough.plot(kind='barh', ax=ax)
ax.set_title('Diversity of street tree flora per borough in New York')
# The bars are already labelled by the Series index (borough names, sorted by
# count). Overwriting them with the names in the row order of the merged
# borough frame would attach the wrong borough to the bars.
ax.set_xlabel('Number of species')

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))
ax.axis('off')
ax.set_title('Diversity of street tree flora per borough in New York',
             fontdict={'fontsize': 25})

# Plot species diversity per borough: the fill colour encodes the number of species.
trees_species_grouped_per_borough.plot(column='species_count', alpha=0.7, ax=ax, cmap='summer_r', edgecolor='k')

# Label every borough at the centroid of its polygon.
for borough in trees_per_borough.itertuples(index=True, name='bname'):
    centroid = borough.geometry.centroid
    ax.text(centroid.x,
            centroid.y,
            borough.bname,
            bbox=dict(boxstyle='square, pad=0.3',
                      edgecolor='k',
                      facecolor='white'))

# Add a legend. The colour scale is tied to the range of the plotted species
# counts; a mappable without a norm would label the bar from 0 to 1. `ax` is
# passed explicitly because a stand-alone mappable does not belong to any axes.
species_norm = plt.Normalize(vmin=trees_species_grouped_per_borough['species_count'].min(),
                             vmax=trees_species_grouped_per_borough['species_count'].max())
legend_bar = fig.colorbar(plt.cm.ScalarMappable(norm=species_norm, cmap='summer_r'),
                          ax=ax, fraction=0.03, pad=0.05)

# Question: In which condition are New York's street trees?

In [None]:
# Count the street trees per health condition across the whole city.
trees_health_count = trees.groupby('health')['tree_id'].count().sort_values()

# Draw on a fresh figure: with an interactive backend a Series plot would
# otherwise land on the axes of a figure that is still open.
fig, ax = plt.subplots()
trees_health_count.plot(kind='barh', ax=ax, color='grey')
ax.set_title('Health condition of street trees in New York')
# The bars are labelled by the Series index (the health conditions) already.
ax.set_xlabel('Number of trees')
ax.set_ylabel('')

# Question: Which street trees might need to be removed?

In [None]:
# Find unhealthy street trees per borough. Every health state listed here
# counts as unhealthy; the selection is computed once and reused for both the
# per-borough counts and the point overlay.
UNHEALTHY_STATES = ['Fair', 'Dead|Stump', 'Poor']
unhealthy_trees = trees.loc[trees['health'].isin(UNHEALTHY_STATES)]
trees_grouped_unhealthy = nyc.merge(unhealthy_trees.groupby('borough')['tree_id'].count(),
                                    left_on='bname', right_index=True)

fig, ax = plt.subplots(figsize=(20, 20))
ax.axis('off')
ax.set_title('Unhealthy trees per borough in New York',
             fontdict={'fontsize': 25})

# Plot unhealthy trees per borough: the fill colour encodes their number.
trees_grouped_unhealthy.plot(column='tree_id', alpha=0.7, ax=ax, cmap='Greys', edgecolor='k')

# Label every borough at the centroid of its polygon.
for borough in trees_per_borough.itertuples(index=True, name='bname'):
    centroid = borough.geometry.centroid
    ax.text(centroid.x,
            centroid.y,
            borough.bname,
            bbox=dict(boxstyle='square, pad=0.3',
                      edgecolor='k',
                      facecolor='white'))

# Plot all unhealthy trees.
unhealthy_trees.plot(ax=ax, alpha=0.2, color='k', markersize=0.1)

# Add a legend. The colour scale is tied to the range of the plotted counts;
# a mappable without a norm would label the bar from 0 to 1. `ax` is passed
# explicitly because a stand-alone mappable does not belong to any axes.
unhealthy_norm = plt.Normalize(vmin=trees_grouped_unhealthy['tree_id'].min(),
                               vmax=trees_grouped_unhealthy['tree_id'].max())
legend_bar = fig.colorbar(plt.cm.ScalarMappable(norm=unhealthy_norm, cmap='Greys'),
                          ax=ax, fraction=0.03, pad=0.05)

In [11]:
# Filter for dead / stump trees and keep their coordinates as (latitude, longitude) rows.
dead_trees = trees.loc[trees['health'] == 'Dead|Stump']
dead_trees_locations = dead_trees[['latitude', 'longitude']].to_numpy()

# Find clusters of dead / stump street trees in New York.
# The haversine metric works on radians, so the coordinates are converted below
# and the neighbourhood size is given as an angle as well.
# NOTE(review): 0.5 / 6371.0 presumably is 0.5 km over the Earth's radius in km — confirm.
distance_earth_radius = 0.5 / 6371.0
db = DBSCAN(metric='haversine',
            algorithm='ball_tree',
            eps=distance_earth_radius,
            min_samples=1,
            n_jobs=-1)
db.fit(np.radians(dead_trees_locations))

# One entry per distinct label; each cluster holds the coordinates of its members.
n_clusters = len(np.unique(db.labels_))
clusters = pd.Series([dead_trees_locations[db.labels_ == label] for label in range(n_clusters)])

def get_centermost_point(cluster):
    """
    Finds the point of a cluster that lies closest to the cluster's centroid.

    Method adapted from author mentioned in source[1].

    :param cluster: numpy.ndarray: Pairs of geographic coordinates in WGS 84,
        one row per point, ordered (latitude, longitude) like the rows of
        `dead_trees_locations`.

    :return: tuple: Coordinates of the centermost point, in the same order as
        the rows of `cluster`.
    """

    # Build the geometry once. The rows are handed over as (x, y) pairs, so for
    # (latitude, longitude) rows `x` carries the latitude and `y` the longitude;
    # the centroid tuple therefore has the same order as the cluster rows.
    cluster_centroid = MultiPoint(cluster).centroid
    centroid = (cluster_centroid.x, cluster_centroid.y)

    # Pick the member with the smallest great-circle distance (in metres) to the centroid.
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)

    return tuple(centermost_point)

# Representative (centermost) point of every cluster.
clusters_centers = clusters.map(get_centermost_point)

# Plot clusters.
fig, ax = plt.subplots(figsize=(10, 7))
fig.canvas.header_visible = False
ax.axis('off')
ax.set_title(f'{n_clusters} dead / stump street tree clusters in New York',
             fontdict={'fontsize': 25})

# Boolean mask marking the samples DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# One colour per cluster, spread evenly over the tab20 colormap.
colorspectrum = [plt.cm.tab20(each) for each in np.linspace(0, 1, n_clusters)]
for k, col in zip(set(db.labels_), colorspectrum):
    if k == -1:
        # Label -1 is drawn in black.
        col = [0, 0, 0, 1]

    class_member_mask = (db.labels_ == k)

    # Core samples get the larger marker, the remaining members the smaller one.
    # Columns are (latitude, longitude), hence longitude goes on the x-axis.
    for member_mask, size in ((core_samples_mask, 2), (~core_samples_mask, 1)):
        xy = dead_trees_locations[class_member_mask & member_mask]
        ax.plot(xy[:, 1], xy[:, 0], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=size)

# Plot boroughs of NYC for context as well, each labelled at its centroid.
nyc.plot(alpha=0.2, ax=ax, color='grey', edgecolor='k')
for borough in nyc.itertuples(index=True, name='bname'):
    centroid = borough.geometry.centroid
    ax.text(centroid.x,
            centroid.y,
            borough.bname,
            bbox=dict(boxstyle='square, pad=0.3',
                      edgecolor='k',
                      facecolor='white'))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Prepare dataset for next step

In [4]:
# Remove irrelevant columns: identifiers, free-text fields, and administrative
# or geographic codes are listed once and dropped from the frame in place.
columns_to_drop = [
    'block_id', 'created_at', 'tree_dbh', 'stump_diam', 'curb_loc',
    'spc_latin', 'spc_common', 'user_type', 'problems', 'address',
    'postcode', 'zip_city', 'community board', 'borocode', 'borough',
    'cncldist', 'st_assem', 'st_senate', 'nta', 'nta_name',
    'boro_ct', 'state', 'x_sp', 'y_sp', 'council district',
    'census tract', 'bin', 'bbl',
]
df.drop(columns=columns_to_drop, inplace=True)

In [5]:
# Persist the prepared dataset as a gzip-compressed CSV without the index column.
df.to_csv(
    './data/data_eda/data.csv.gz',
    index=False,
    compression='gzip',
)