TODO:
    - Use changes over time (1995 → 2005 → 2015 censuses) for trees in the same area as a feature?
    - Add here: a test run with a random forest (rdf) to assess feature importance.

In [None]:
# Import standard-library packages.
import datetime
from pathlib import Path

# Import third-party packages.
import geopandas as gpd
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Change display settings for dataframes: never truncate the list of columns
# when a wide frame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Read in data on street trees (one gzip-compressed CSV per census year).
def _read_census(path):
    """Load one tree-census CSV and discard its 'Unnamed: 0' column."""
    return pd.read_csv(path, compression='gzip').drop(columns='Unnamed: 0')


df_1995 = _read_census('./data/nyc_trees/nyc_tree_census_1995.csv.gz')
df_2005 = _read_census('./data/nyc_trees/nyc_tree_census_2005.csv.gz')
df_2015 = _read_census('./data/nyc_trees/nyc_tree_census_2015.csv.gz')

# Read in geographic data on New York City.
nyc = gpd.read_file('./data/nyc/nyc_geo.shp')

In [None]:
# Work on a copy of the 2015 census so that df_2015 itself is not modified by
# the preprocessing steps below.
df = df_2015.copy()

## Imputing or aggregating existing features

In [None]:
# Replace NaN values in two features and in the target ('health') with an entry
# signalling this tree is either dead or a stump.
#
# The result is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series and, with pandas Copy-on-Write, no longer updates `df`.
#
# NOTE(review): pandas >= 2.0 parses the literal string 'None' as NaN in
# read_csv by default. If the source CSV stores 'None' in 'steward'/'guards',
# those rows would be labelled 'Dead|Stump' here — confirm how the file is read.
missing_label = 'Dead|Stump'
imputed_columns = ['steward', 'guards', 'health']
df[imputed_columns] = df[imputed_columns].fillna(missing_label)

# In 'steward', replace spectrum of answers to yes (= alive) or no (= dead/stump).
df['steward'] = df['steward'].replace(['1or2', '3or4', '4orMore', 'None'], 'Alive')

## Creating a new feature `n_neighbors`: the number of trees in a tree's proximity

In [None]:
# Turn geographic coordinates into point geometries (x = longitude,
# y = latitude) and declare them as WGS-84. `gpd.points_from_xy` builds all
# points in one vectorised call and needs no separate shapely import.
trees = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
    crs='EPSG:4326',
)

# Radius of the circle drawn around each tree, in units of the projected CRS.
buffer_radius = 10

# Buffering needs a projected CRS, so project to EPSG:3857, draw the circles,
# and bring only the circles back to WGS-84. `trees` itself stays in WGS-84,
# which avoids a needless round-trip reprojection of the points; `to_crs`
# already records the new CRS, so no manual `.crs` override is required.
# NOTE(review): EPSG:3857 (Web Mercator) units are only approximately metres
# away from the equator, so the real-world radius is smaller than 10 m at New
# York's latitude — consider a local projected CRS if exact distances matter.
buffers = trees.to_crs(epsg=3857).buffer(buffer_radius).to_crs(epsg=4326)

In [None]:
# Turn the GeoSeries for the circles into a dataframe for the spatial join.
# The polygons are stored twice: as the active geometry used by the join and
# as a plain 'circles' column that is counted per tree afterwards.
circles = gpd.GeoDataFrame({'circles': buffers}, geometry=buffers)

# Spatial join to find neighboring trees: every (tree point, circle) pair in
# which the point intersects the circle becomes one row. Each tree therefore
# also matches its own circle.
# `predicate=` replaces the `op=` keyword, which was deprecated and later
# removed from geopandas.
neighbors = gpd.sjoin(trees, circles, how='right', predicate='intersects')

# Drop the helper index column added by the join. With how='right' the result
# keeps the right-hand index and stores the left-hand one as 'index_left';
# errors='ignore' keeps this step working whichever helper column is present.
neighbors = neighbors.drop(columns=['index_left', 'index_right'], errors='ignore')

In [None]:
# Count neighboring trees per tree: the number of circles each tree point
# falls into.
n_neighbors = neighbors.groupby('tree_id').count()

# Add tree counts to data of each tree. The counts are indexed by tree_id, so
# join df's 'tree_id' column against that index. (Passing both `on=` and
# `right_index=True` to pd.merge raises a MergeError in current pandas.)
df_new = df.join(
    n_neighbors['circles'].rename('n_neighbors'),
    on='tree_id',
    how='inner',
)

# Encode the feature on the number of neighboring trees.
# Bins are right-inclusive: 1 -> "no neighbor", 2-3 -> "two or three
# neighbors", 4 and above -> "more than three neighbors".
# NOTE(review): the count appears to include the tree's own circle (which is
# why 1 means "no neighbor"), so 2-3 would correspond to one or two other
# trees — confirm whether the labels or the bin edges should be shifted.
df_new['n_neighbors'] = pd.cut(
    df_new['n_neighbors'],
    bins=[0, 1, 3, np.inf],
    labels=['no neighbor', 'two or three neighbors', 'more than three neighbors'],
).astype(object)

## Feature selection

In [None]:
# Columns kept for modelling: the tree identifier, the candidate features and
# the target ('health').
feature_columns = [
    'steward', 'guards',
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
    'n_neighbors',
]
selected_columns = ['tree_id', *feature_columns, 'health']
df_sel = df_new[selected_columns]

# One-hot encode the categorical features and the target.
df_sel_enc = pd.get_dummies(df_sel)

In [None]:
# Search for correlations between features (and target) categories.
corrmat = df_sel_enc.corr()

# Draw on an explicit Axes so the heatmap can be titled and stands alone when
# the notebook is skimmed.
fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(corrmat, annot=False, ax=ax)
ax.set_title('Pairwise correlation of the one-hot encoded features and target')
plt.show()

In [None]:
# Separate the features from the target before splitting.
# NOTE(review): `x` and `y` were not defined anywhere in the notebook, so this
# cell failed on a fresh kernel. The definitions below are an assumption:
# features = all one-hot encoded columns except the identifier and the encoded
# target; target = the original 'health' label — confirm this is intended.
target_columns = [col for col in df_sel_enc.columns if col.startswith('health_')]
x = df_sel_enc.drop(columns=['tree_id'] + target_columns)
y = df_sel['health']

# Conduct stratified splitting of the dataset. A fixed random_state makes the
# split, and therefore the reported accuracy, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=0
)

In [None]:
# Model-based feature selection: fit a random forest on the training data and
# keep the features whose importance reaches the median importance.
rdf_clf_sm = SelectFromModel(
    RandomForestClassifier(random_state=0),
    threshold='median',
)
x_train_fs = rdf_clf_sm.fit(x_train, y_train).transform(x_train)
x_test_fs = rdf_clf_sm.transform(x_test)

# Train a fresh random forest on the pruned feature set and evaluate it on the
# held-out data.
rdf_clf_n = RandomForestClassifier(random_state=0)
rdf_clf_n.fit(x_train_fs, y_train)
y_pred_fs = rdf_clf_n.predict(x_test_fs)
accuracy = rdf_clf_n.score(x_test_fs, y_test)
print("Accuracy score:", accuracy)

## Export feature / target dataframe as .csv file

In [None]:
# Write the encoded feature/target frame to disk. The output directory is
# created first so the export does not fail when it does not exist yet.
output_path = Path('./data_preprocessed/features.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_sel_enc.to_csv(output_path, index=False)