In [None]:
# Multiclass classification

# nonlinear classifier - the RandomForestClassifier. It requires little hyperparameter tuning.

# No need for imputing the missing data --> data description.

# Problem: You can't see every tree.. predict the health of the tree.

In [1]:
# Import third-party packages.
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

%matplotlib inline

In [2]:
# Read in data on trees.
df_1995 = pd.read_csv('./data/nyc_trees/nyc_tree_census_1995.csv.gz', compression='gzip')
df_2005 = pd.read_csv('./data/nyc_trees/nyc_tree_census_2005.csv.gz', compression='gzip')
df_2015 = pd.read_csv('./data/nyc_trees/nyc_tree_census_2015.csv.gz', compression='gzip')
df_1995.drop('Unnamed: 0', axis=1, inplace=True)
df_2005.drop('Unnamed: 0', axis=1, inplace=True)
df_2015.drop('Unnamed: 0', axis=1, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
test = df_2015.copy()

In [4]:
df_sel = test[['tree_id',
               'steward',
               'guards',
               'root_stone',
               'root_grate',
               'root_other',
               'trunk_wire',
               'trnk_light',
               'trnk_other',
               'brch_light',
               'brch_shoe',
               'brch_other',
               'health']]

# Replace NaN values in features with entries signalling this tree is either dead or a stump.
df_sel['steward'].fillna('Dead|Stump', inplace=True)
df_sel['guards'].fillna('Dead|Stump', inplace=True)

# In 'steward', replace spectrum of answers to yes (= alive) or no (= dead/stump).
df_sel['steward'].replace(['1or2', '3or4', '4orMore', 'None'], 'Alive', inplace=True)

# Replace NaN values in the target with entries signalling this tree is either dead or a stump.
df_sel['health'].fillna('Dead|Stump', inplace=True)

# One-hot encoding the categorical features and the target.
df_sel_enc = pd.get_dummies(df_sel)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [5]:
x = df_sel_enc[list(df_sel_enc.columns)[1:26]]
y = df_sel_enc[list(df_sel_enc.columns)[26:]]

# Split into train, test, and validation datasets.
x_train_vali, x_test, y_train_vali, y_test = train_test_split(x, y, random_state=0)
x_train, x_vali, y_train, y_vali = train_test_split(x_train_vali, y_train_vali, random_state=0)

In [6]:
rdf_clf = RandomForestClassifier()
rdf_clf.fit(x_train, y_train)
y_pred = rdf_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.8152058825249932


In [20]:
# Training & pruning with model-based feature selection.
rdf_clf = SelectFromModel(RandomForestClassifier(random_state=0))
rdf_clf.fit(x_train, y_train)
x_train_fs = rdf_clf.transform(x_train)
x_test_fs = rdf_clf.transform(x_test)
rdf_clf_n = RandomForestClassifier(random_state=0).fit(x_train_fs, y_train)
accuracy = rdf_clf_n.score(x_test_fs, y_test)
print("Accuracy score:", accuracy)

Accuracy score: 0.819248071039562


In [22]:
print(classification_report(y_test, y_pred, target_names=y.columns))
print("F1-score of positive classes:", f1_score(y_test, y_pred, labels=np.unique(y_pred), average=None))
print("F1-score (micro):", f1_score(y_test, y_pred, labels=np.unique(y_pred), average='micro'))
print("F1-score (macro):", f1_score(y_test, y_pred, labels=np.unique(y_pred), average='macro'))
print("F1-score (weighted):", f1_score(y_test, y_pred, labels=np.unique(y_pred), average='weighted'))
print("Old accuracy score:", accuracy_score(y_test, y_pred))

                   precision    recall  f1-score   support

health_Dead|Stump       1.00      1.00      1.00      8000
      health_Fair       0.34      0.00      0.00     24284
      health_Good       0.82      0.99      0.90    132049
      health_Poor       0.00      0.00      0.00      6614

        micro avg       0.82      0.82      0.82    170947
        macro avg       0.54      0.50      0.47    170947
     weighted avg       0.73      0.82      0.74    170947
      samples avg       0.82      0.82      0.82    170947

F1-score of positive classes: [0.9999375  0.00164325]
F1-score (micro): 0.39756079422919616
F1-score (macro): 0.5007903732213483
F1-score (weighted): 0.24902133131668303
Old accuracy score: 0.8152058825249932


In [None]:
rdf_clf_n.predict([probe])

In [None]:
parameter_grid = dict(n_estimators=list(range(5000, 6001, 1000)),
                      criterion=['gini','entropy'],
                      max_features=list(range(1, round(np.sqrt(len(x.columns))).astype(int), 6)))
gcv_rdf_clf = GridSearchCV(estimator=rdf_clf, param_grid=parameter_grid, cv=5, verbose=1)

In [None]:
accuracy_scores = cross_val_score(gcv_rdf_clf, x_train, y_train)
accuracy_scores