In [1]:
# Import standard-library and third-party packages.
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from shapely.geometry import Point, Polygon

%matplotlib inline

  import pandas.util.testing as tm


In [54]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [None]:
# Adjust pandas DataFrame display settings (uncomment to show up to 1000 rows).
# pd.options.display.max_rows = 1000

In [7]:
# Load the 1995, 2005 and 2015 NYC tree-census extracts, discarding the
# 'Unnamed: 0' column that each CSV carries.
df_1995 = (
    pd.read_csv('./data/nyc_trees/nyc_tree_census_1995.csv.gz', compression='gzip')
    .drop(columns='Unnamed: 0')
)
df_2005 = (
    pd.read_csv('./data/nyc_trees/nyc_tree_census_2005.csv.gz', compression='gzip')
    .drop(columns='Unnamed: 0')
)
df_2015 = (
    pd.read_csv('./data/nyc_trees/nyc_tree_census_2015.csv.gz', compression='gzip')
    .drop(columns='Unnamed: 0')
)

# Load the geographic data on New York City from its shapefile.
nyc = gpd.read_file('./data/nyc/nyc_geo.shp')

In [None]:
# Apply seaborn's white-grid theme first, then enlarge the default figure
# size through matplotlib's rc settings.
sns.set(style='whitegrid')
sns.mpl.rc('figure', figsize=(20, 20))

# Draw the NYC geographic data on an explicitly created axes.
fig, ax = plt.subplots()
nyc.plot(ax=ax)

In [31]:
# Work on an independent (deep) copy so the loaded 2015 census stays untouched.
test = df_2015.copy(deep=True)

In [33]:
# Summarise column dtypes and non-null counts of the working copy.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683788 entries, 0 to 683787
Data columns (total 45 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   tree_id           683788 non-null  int64  
 1   block_id          683788 non-null  int64  
 2   created_at        683788 non-null  object 
 3   tree_dbh          683788 non-null  int64  
 4   stump_diam        683788 non-null  int64  
 5   curb_loc          683788 non-null  object 
 6   status            683788 non-null  object 
 7   health            683788 non-null  object 
 8   spc_latin         652169 non-null  object 
 9   spc_common        652169 non-null  object 
 10  steward           652173 non-null  object 
 11  guards            652172 non-null  object 
 12  sidewalk          652172 non-null  object 
 13  user_type         683788 non-null  object 
 14  problems          652124 non-null  object 
 15  root_stone        683788 non-null  object 
 16  root_grate        68

In [32]:
# Replace NaN values with entries signalling this tree is either dead or a stump.
# The filled columns are assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form acts
# on a temporary object and does not update `test` under pandas Copy-on-Write.
dead_or_stump_cols = ['health', 'steward', 'guards', 'sidewalk']
test[dead_or_stump_cols] = test[dead_or_stump_cols].fillna('Dead|Stump')

In [29]:
# Show the trees recorded as alive whose health is rated poor.
test.loc[(test['status'] == "Alive") & (test['health'] == "Poor")]

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,boro_ct,state,latitude,longitude,x_sp,y_sp,council district,census tract,bin,bbl
11,203726,302371,09/05/2015,8,0,OnCurb,Alive,Poor,Platanus x acerifolia,London planetree,...,4010500,New York,40.781735,-73.912020,1.008615e+06,224096.2740,22.0,105.0,4019059.0,4.008710e+09
73,209441,503960,09/08/2015,5,0,OnCurb,Alive,Poor,Ulmus americana,American elm,...,2021502,New York,40.846376,-73.917632,1.007039e+06,247645.6780,14.0,21502.0,2008808.0,2.028760e+09
85,208832,220174,09/08/2015,3,0,OnCurb,Alive,Poor,Platanus x acerifolia,London planetree,...,3019300,New York,40.692278,-73.961423,9.949480e+05,191494.4858,35.0,193.0,3054898.0,3.019090e+09
88,179477,223106,08/27/2015,3,0,OnCurb,Alive,Poor,Platanus x acerifolia,London planetree,...,3015300,New York,40.668231,-73.980730,9.895956e+05,182731.6671,39.0,153.0,3021747.0,3.009950e+09
140,161242,339804,08/20/2015,3,0,OnCurb,Alive,Poor,Gleditsia triacanthos var. inermis,honeylocust,...,4097203,New York,40.596836,-73.772454,1.047442e+06,156802.1034,31.0,97203.0,4302129.0,4.159550e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683668,178647,105528,08/26/2015,6,0,OnCurb,Alive,Poor,Ginkgo biloba,ginkgo,...,1007200,New York,40.745513,-73.980250,9.897226e+05,210887.6111,2.0,72.0,1018474.0,1.008888e+09
683675,205784,106342,09/06/2015,12,0,OnCurb,Alive,Poor,Ginkgo biloba,ginkgo,...,1015900,New York,40.780235,-73.983527,9.888122e+05,223537.8197,6.0,159.0,1030533.0,1.011640e+09
683685,201872,338313,09/04/2015,4,0,OnCurb,Alive,Poor,Ginkgo biloba,ginkgo,...,4093800,New York,40.583521,-73.822990,1.033417e+06,151918.5643,32.0,94201.0,4533171.0,4.161570e+09
683726,186629,505216,08/29/2015,3,0,OnCurb,Alive,Poor,Prunus,cherry,...,2039500,New York,40.848699,-73.896057,1.013007e+06,248498.5792,15.0,395.0,2115722.0,2.030440e+09


In [39]:
# Keep the identifier, the stewardship/guard/problem indicators, and the
# health and status columns.
# NOTE(review): 'health', 'steward' and 'guards' carry the 'Dead|Stump'
# placeholder written by the NaN-filling cell. If those NaNs occur only for
# dead trees and stumps, these columns presumably encode 'status' directly —
# confirm this is intended before using them as predictors.
features = test.loc[:, [
    'tree_id',
    'steward',
    'guards',
    'root_stone',
    'root_grate',
    'root_other',
    'trunk_wire',
    'trnk_light',
    'trnk_other',
    'brch_light',
    'brch_shoe',
    'brch_other',
    'health',
    'status',
]]

# One-hot encode the selected columns into indicator columns.
df_enc = pd.get_dummies(data=features)

In [45]:
# Split the encoded frame into predictors and targets by column name rather
# than by position: positional slices silently move columns between x and y
# whenever the number of dummy columns changes.
target_cols = [name for name in df_enc.columns if name.startswith('status_')]
feature_cols = [name for name in df_enc.columns
                if name != 'tree_id' and name not in target_cols]

x = df_enc[feature_cols]
y = df_enc[target_cols]

# Hold out 40% of the rows; the fixed seed makes the split reproducible,
# matching the seeded classifier used later in the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

In [48]:
# Train a seeded random forest on the training split (fit returns the
# estimator itself), then predict the held-out rows.
rdf_clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred = rdf_clf.predict(x_test)

# With 2-D indicator targets this is subset accuracy: a row counts as correct
# only when every target column is predicted correctly.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9795295339212331


In [49]:
# Rank the predictors by the fitted forest's importance scores, highest first.
(
    pd.Series(data=rdf_clf.feature_importances_, index=x.columns)
    .sort_values(ascending=False)
)

steward_Dead|Stump    0.276052
health_Dead|Stump     0.248792
guards_Dead|Stump     0.242644
steward_None          0.053296
health_Good           0.052016
guards_None           0.048147
steward_1or2          0.030552
health_Fair           0.019858
guards_Helpful        0.011755
guards_Harmful        0.004851
steward_3or4          0.002938
root_stone_No         0.001861
health_Poor           0.001749
guards_Unsure         0.001715
root_stone_Yes        0.001178
brch_light_No         0.000814
brch_other_Yes        0.000734
trnk_other_Yes        0.000368
brch_light_Yes        0.000349
brch_other_No         0.000170
root_other_Yes        0.000067
trnk_other_No         0.000048
root_other_No         0.000045
trunk_wire_Yes        0.000000
root_grate_No         0.000000
root_grate_Yes        0.000000
brch_shoe_No          0.000000
brch_shoe_Yes         0.000000
trnk_light_Yes        0.000000
steward_4orMore       0.000000
trunk_wire_No         0.000000
trnk_light_No         0.000000
dtype: f

In [50]:
# Cross-validate the forest on the training split using cross_val_score's
# default fold count and the estimator's default scorer.
accuracy_scores = cross_val_score(estimator=rdf_clf, X=x_train, y=y_train)

In [51]:
# Display the per-fold scores returned by cross_val_score.
accuracy_scores

array([0.97972092, 0.97984279, 0.97917225, 0.97985473, 0.9795013 ])

In [55]:
# Evaluate each target column of the multi-output prediction separately.
category_names = list(y.columns)

for col, category in enumerate(category_names):
    # Slice the true and predicted values for this target once.
    y_true_col = y_test.iloc[:, col]
    y_pred_col = y_pred[:, col]

    # No `labels` argument is passed: the metrics then score the labels
    # present in this column, instead of labels taken from the whole 2-D
    # prediction matrix. zero_division=0 reports 0 for a class that is never
    # predicted without emitting an undefined-metric warning.
    result = classification_report(y_true_col, y_pred_col, zero_division=0)
    print("Report on", category, ":")
    print(result)
    # average=None returns one F1 value per class present in the column.
    print("F1-score of positive classes:", f1_score(y_true_col, y_pred_col, average=None, zero_division=0))
    print("F1-score (micro):", f1_score(y_true_col, y_pred_col, average='micro', zero_division=0))
    print("F1-score (macro):", f1_score(y_true_col, y_pred_col, average='macro', zero_division=0))
    print("F1-score (weighted):", f1_score(y_true_col, y_pred_col, average='weighted', zero_division=0))
    print("Accuracy score:", accuracy_score(y_true_col, y_pred_col))

Report on status_Alive :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12734
           1       1.00      1.00      1.00    260782

    accuracy                           1.00    273516
   macro avg       1.00      1.00      1.00    273516
weighted avg       1.00      1.00      1.00    273516

F1-score of positive classes: [1. 1.]
F1-score (micro): 1.0
F1-score (macro): 1.0
F1-score (weighted): 1.0
Accuracy score: 1.0


  _warn_prf(average, modifier, msg_start, len(result))


Report on status_Dead :
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    267917
           1       0.00      0.00      0.00      5599

    accuracy                           0.98    273516
   macro avg       0.49      0.50      0.49    273516
weighted avg       0.96      0.98      0.97    273516

F1-score of positive classes: [0.98965892 0.        ]
F1-score (micro): 0.9795295339212331
F1-score (macro): 0.49482946181706694
F1-score (weighted): 0.9694001442083324
Accuracy score: 0.9795295339212331
Report on status_Stump :
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    266381
           1       0.56      1.00      0.72      7135

    accuracy                           0.98    273516
   macro avg       0.78      0.99      0.85    273516
weighted avg       0.99      0.98      0.98    273516

F1-score of positive classes: [0.989379   0.71820424]
F1-score (micro): 0.9795295339212331
F1

In [66]:
# Upper bound for max_features: the rounded square root of the feature count.
# int(round(...)) is used because round() on a NumPy scalar may return a plain
# Python int, which has no .astype method.
max_features_stop = int(round(np.sqrt(len(x.columns))))

# Hyper-parameter grid for the random forest.
# NOTE(review): with a step of 6 the max_features range yields only [1] for
# 32 feature columns (sqrt(32) rounds to 6) — confirm the step is intended.
parameter_grid = dict(n_estimators=list(range(5000, 6001, 1000)),
                      criterion=['gini', 'entropy'],
                      max_features=list(range(1, max_features_stop, 6)))

# Exhaustive 5-fold grid search over the grid, logging progress.
gcv_rdf_clf = GridSearchCV(estimator=rdf_clf, param_grid=parameter_grid, cv=5, verbose=1)

In [None]:
# Nested cross-validation: each outer fold runs the full grid search on its
# training portion and scores the refit best estimator on the held-out part.
# This multiplies the grid-search cost by the number of outer folds.
accuracy_scores = cross_val_score(estimator=gcv_rdf_clf, X=x_train, y=y_train)
accuracy_scores

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
