In [None]:
# Model choice: a nonlinear classifier, the RandomForestClassifier, which requires little hyperparameter tuning.

# No need to impute the missing data; see the data description.

In [None]:
# Import standard-library and third-party packages.
import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from shapely.geometry import Point, Polygon

%matplotlib inline

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [None]:
# Adjust matrix display settings.
# pd.options.display.max_rows = 1000

In [None]:
# Load the three tree-census snapshots from gzip-compressed CSV files,
# discarding the 'Unnamed: 0' column of each as part of the same expression.
df_1995 = pd.read_csv(
    './data/nyc_trees/nyc_tree_census_1995.csv.gz', compression='gzip'
).drop(columns='Unnamed: 0')
df_2005 = pd.read_csv(
    './data/nyc_trees/nyc_tree_census_2005.csv.gz', compression='gzip'
).drop(columns='Unnamed: 0')
df_2015 = pd.read_csv(
    './data/nyc_trees/nyc_tree_census_2015.csv.gz', compression='gzip'
).drop(columns='Unnamed: 0')

# Load the New York City geometry from its shapefile.
nyc = gpd.read_file('./data/nyc/nyc_geo.shp')

In [None]:
# Use seaborn's whitegrid theme and a 20x20 inch default figure size.
sns.set(style='whitegrid')
plt.rc('figure', figsize=(20, 20))

# Draw the NYC geometries on a freshly created set of axes.
fig, ax = plt.subplots()
nyc.plot(ax=ax)

In [None]:
# Work on an independent deep copy so the loaded 2015 frame stays untouched.
test = df_2015.copy(deep=True)

In [None]:
# Print the column names, dtypes and non-null counts of the working copy.
test.info()

In [None]:
# Replace NaN values with entries signalling this tree is either dead or a stump.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas and leaves `test` unchanged under Copy-on-Write.
for column in ('health', 'steward', 'guards', 'sidewalk'):
    test[column] = test[column].fillna('Dead|Stump')

In [None]:
# Show the rows whose status is "Alive" and whose health is "Poor".
is_alive = test['status'] == "Alive"
is_poor = test['health'] == "Poor"
test[is_alive & is_poor]

In [None]:
# Columns used for modelling: the tree identifier, the candidate predictors,
# and the two outcome columns ('health' and 'status').
features = test.loc[:, [
    'tree_id',
    'steward', 'guards',
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
    'health', 'status',
]]

# Expand every categorical column into one indicator column per category.
df_enc = pd.get_dummies(features)

In [None]:
# Positional split of the encoded frame: columns 1-29 are the predictors and
# columns 30 onward are the targets; column 0 is left out of both.
# NOTE(review): the boundary is hard-coded and depends on how many indicator
# columns get_dummies produced - confirm it separates predictors from targets.
x = df_enc.iloc[:, 1:30]
y = df_enc.iloc[:, 30:]

# First hold out a test set, then carve a validation set out of the remainder.
x_train_vali, x_test, y_train_vali, y_test = train_test_split(
    x, y, random_state=0
)
x_train, x_vali, y_train, y_vali = train_test_split(
    x_train_vali, y_train_vali, random_state=0
)

In [None]:
# Recursive feature elimination around a seeded random forest: fit on the
# training split, predict the test split, and print the accuracy score.
base_forest = RandomForestClassifier(random_state=0)
rdf_clf = RFE(base_forest)
rdf_clf.fit(x_train, y_train)
y_pred = rdf_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# Rank the features kept by RFE by their importance in the final forest.
# RFE itself exposes no `feature_importances_`; they belong to the fitted inner
# estimator (`estimator_`), which was trained only on the columns flagged in
# `support_`, so the index is restricted to those columns.
selected_columns = x.columns[rdf_clf.support_]
pd.Series(
    rdf_clf.estimator_.feature_importances_, index=selected_columns
).sort_values(ascending=False)

In [None]:
# Collapse every stewardship level into 'Alive' and mark missing entries as
# 'Dead|Stump'. The result is assigned back to the column because calling
# replace/fillna with inplace=True on a column selection is deprecated chained
# assignment and does not modify `test` under pandas Copy-on-Write.
test['steward'] = (
    test['steward']
    .replace(['1or2', '3or4', '4orMore', 'None'], 'Alive')
    .fillna('Dead|Stump')
)

In [None]:
# Merge the 'Stump' and 'Dead' statuses into one 'Dead|Stump' category.
# Assigned back to the column: replace(inplace=True) on a column selection is
# deprecated chained assignment and is a no-op under pandas Copy-on-Write.
test['status'] = test['status'].replace(['Stump', 'Dead'], 'Dead|Stump')

In [None]:
# Mark missing health entries as 'Dead|Stump'. Assigned back to the column:
# fillna(inplace=True) on a column selection is deprecated chained assignment
# and is a no-op under pandas Copy-on-Write.
test['health'] = test['health'].fillna('Dead|Stump')

In [None]:
# Mark missing guard and sidewalk entries as 'Dead|Stump'. Assigned back to the
# columns: fillna(inplace=True) on a column selection is deprecated chained
# assignment and is a no-op under pandas Copy-on-Write.
for column in ('guards', 'sidewalk'):
    test[column] = test[column].fillna('Dead|Stump')

In [None]:
# Count how many rows fall into each health category.
test['health'].value_counts()

In [None]:
# Show the test-split rows whose "health_Dead|Stump" indicator equals 1.
dead_stump_mask = x_test["health_Dead|Stump"] == 1
x_test.loc[dead_stump_mask]

In [None]:
# Inspect the second-to-last row of the test-split predictors.
x_test.iloc[-2]

In [None]:
# Inspect the full record at position 241930 of the working copy.
# NOTE(review): hard-coded position - presumably the source row of the probe
# inspected nearby; confirm.
test.iloc[241930]

In [None]:
# Keep the second-to-last test row as a single sample for a spot prediction.
probe = x_test.iloc[-2]

In [None]:
# Predict the targets for the single probe sample. The Series is turned into a
# one-row DataFrame so the column names seen during fitting are preserved;
# wrapping it in a plain list would drop them.
rdf_clf.predict(probe.to_frame().T)

In [None]:
# Show the (rows, columns) dimensions of the target frame.
y.shape

In [None]:
# Cross-validate the classifier on the training split with the default folds
# and scoring.
accuracy_scores = cross_val_score(estimator=rdf_clf, X=x_train, y=y_train)

In [None]:
# Display the per-fold cross-validation scores.
accuracy_scores

In [None]:
category_names = list(y.columns)

# The label set passed to every F1 call comes from all predictions at once,
# so it is computed a single time up front.
predicted_labels = np.unique(y_pred)

# Heading / averaging-mode pairs for the F1 variants printed per target column.
f1_variants = (
    ("F1-score of positive classes:", None),
    ("F1-score (micro):", 'micro'),
    ("F1-score (macro):", 'macro'),
    ("F1-score (weighted):", 'weighted'),
)

# Print a classification report, the F1 variants and the accuracy for each
# target column in turn.
for col, category in enumerate(category_names):
    column_true = y_test.iloc[:, col]
    column_pred = y_pred[:, col]
    result = classification_report(column_true, column_pred)
    print("Report on", category, ":")
    print(result)
    for heading, averaging in f1_variants:
        print(heading, f1_score(column_true, column_pred, labels=predicted_labels, average=averaging))
    print("Accuracy score:", accuracy_score(column_true, column_pred))

In [None]:
# Hyperparameter grid for the random forest wrapped inside the RFE selector.
# GridSearchCV sets parameters on `rdf_clf` itself, which is an RFE instance,
# so the forest's parameters are addressed with the `estimator__` prefix;
# unprefixed names are rejected as invalid parameters of RFE.
#
# int(...) replaces `.astype(int)`, which raises AttributeError whenever
# round() hands back a built-in int instead of a NumPy scalar.
# NOTE(review): with a step of 6 the max_features range holds only the value 1
# whenever the stop is 7 or less - confirm the step is intended.
max_features_stop = int(round(np.sqrt(len(x.columns))))
parameter_grid = {
    'estimator__n_estimators': list(range(5000, 6001, 1000)),
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_features': list(range(1, max_features_stop, 6)),
}
gcv_rdf_clf = GridSearchCV(estimator=rdf_clf, param_grid=parameter_grid, cv=5, verbose=1)

In [None]:
# Nested cross-validation: the grid search is itself cross-validated on the
# training split, and the per-fold scores are displayed.
accuracy_scores = cross_val_score(estimator=gcv_rdf_clf, X=x_train, y=y_train)
accuracy_scores

In [None]:
# Second feature set: the same columns as before minus 'status', leaving
# 'health' as the only outcome column.
features = test.loc[:, [
    'tree_id',
    'steward', 'guards',
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
    'health',
]]

# Expand every categorical column into one indicator column per category.
df_enc = pd.get_dummies(features)

# TODO: how can the features be made more fine-grained to predict the full spectrum of perceived health?

In [None]:
# Positional split of the encoded frame: columns 1-25 are the predictors and
# columns 26 onward are the targets; column 0 is left out of both.
# NOTE(review): the boundary is hard-coded and depends on how many indicator
# columns get_dummies produced - confirm it separates predictors from targets.
x = df_enc.iloc[:, 1:26]
y = df_enc.iloc[:, 26:]

# Hold out 40% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0
)

In [None]:
# Fit a plain random forest on the training split, predict the test split and
# print the accuracy score. The forest is seeded with random_state=0, matching
# the earlier model and the data splits, so reruns give the same result.
rdf_clf = RandomForestClassifier(random_state=0)
rdf_clf.fit(x_train, y_train)
y_pred = rdf_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [None]:
category_names = list(y.columns)

# The label set passed to every F1 call comes from all predictions at once,
# so it is computed a single time up front.
predicted_labels = np.unique(y_pred)

# Heading / averaging-mode pairs for the F1 variants printed per target column.
f1_variants = (
    ("F1-score of positive classes:", None),
    ("F1-score (micro):", 'micro'),
    ("F1-score (macro):", 'macro'),
    ("F1-score (weighted):", 'weighted'),
)

# Print a classification report, the F1 variants and the accuracy for each
# target column in turn.
for col, category in enumerate(category_names):
    column_true = y_test.iloc[:, col]
    column_pred = y_pred[:, col]
    result = classification_report(column_true, column_pred)
    print("Report on", category, ":")
    print(result)
    for heading, averaging in f1_variants:
        print(heading, f1_score(column_true, column_pred, labels=predicted_labels, average=averaging))
    print("Accuracy score:", accuracy_score(column_true, column_pred))