In [7]:
import pandas as pd
import csv
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter




In [8]:
data = pd.read_csv('data/parsed_data.csv')

# Prepare Data

In [9]:
# Encode tree species as their frequency count rather than one hot encoding, since there are 100s of speicies
# Loss of info, but it's a tradeoff
data.spc_latin = data.spc_latin.map(data.spc_latin.value_counts()) 

In [10]:
# Encode borough as number
borough_dict = {"Manhattan":1, "Brooklyn": 2, "Queens": 3, "Bronx":4, "Staten Island": 5}
# data.borough = data.borough.map(borough_dict) 
data["borough"] = data["borough"].map(borough_dict)

In [11]:
data

Unnamed: 0.1,Unnamed: 0,borough,zipcode,spc_latin,tree_diameter,wires,sidew_crack_raise,latBin,lonBin,lonDistance,latDistance,avg_health_round,avg_health
0,0,1.0,10001,86428,4,0,0,40.7485,-73.9855,5.730000e-04,0.000090,3.0,3.000000
1,1,1.0,10001,10486,10,0,0,40.7485,-73.9855,1.920000e-04,0.000247,3.0,3.000000
2,2,1.0,10001,86428,4,0,0,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
3,3,1.0,10001,86428,4,0,0,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
4,4,1.0,10001,86428,3,0,0,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008854,544364,3.0,11694,161433,16,0,1,40.5810,-73.8530,1.771800e-04,0.000005,3.0,3.000000
1008855,544365,3.0,11694,169398,11,1,1,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545
1008856,544366,3.0,11694,169398,14,1,1,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545
1008857,544367,3.0,11694,169398,14,1,0,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545


In [12]:
# Scale/Standardize data
# TODO don't need to normalize the categorical data
# TODO do we even need this? just one col is actually continuous that we are using in the end (tree diameter)
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(data) 
# data_scaled = pd.DataFrame(scaler.transform(data),columns = ??? )
# data_scaled

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[['zipcode', 'spc_latin', 'tree_diameter', 'latBin', 'lonBin', 'wires', 'sidew_crack_raise']], data.avg_health_round, test_size=0.20, random_state=0)
print(X_train)

        zipcode  spc_latin  tree_diameter   latBin   lonBin  wires  \
458540    11435     169398             16  40.6860 -73.8040      0   
756669    11226      86428              9  40.6495 -73.9495      0   
700308    11209        637              1  40.6270 -74.0240      0   
570760    10312      86428              4  40.5295 -74.1625      1   
530301    10306       1682              8  40.5740 -74.1215      0   
...         ...        ...            ...      ...      ...    ...   
963395    11422      15669              3  40.6510 -73.7295      1   
117952    10456      86428              3  40.8285 -73.9045      0   
435829    11427      86428              7  40.7305 -73.7435      0   
305711    11358     169398             22  40.7570 -73.7910      1   
985772    11433       5688              6  40.6940 -73.7870      0   

        sidew_crack_raise  
458540                  0  
756669                  1  
700308                  0  
570760                  0  
530301             

In [18]:
# Check how unbalanced dataset is
# data.avg_health.value_counts()
# data.avg_health_round.value_counts()
print(Counter(y_train))

Counter({2.0: 413649, 3.0: 370013, 1.0: 21089, 0.0: 2336})


In [19]:
# Try random oversampling
ros = RandomOverSampler()
X_train, y_train = ros.fit_resample(X_train, y_train)
print(Counter(y_train))

Counter({2.0: 413649, 3.0: 413649, 0.0: 413649, 1.0: 413649})


In [36]:
# Try random undersampling (this performed worse)
# rus = RandomUnderSampler()
# X_train, y_train = rus.fit_resample(X_train, y_train)
# print(Counter(y_train))

Counter({0.0: 2336, 1.0: 2336, 2.0: 2336, 3.0: 2336})


# Classification


In [126]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)


LogisticRegression()

In [127]:
predictions = log_reg.predict(X_test)
# print(sklearn.metrics.classification_report(predictions, y_test))
# print("Accuracy")
# print(log_reg.score(X_test, y_test))

# print("F1 Macro")
# print(sklearn.metrics.f1_score(predictions, y_test, average='macro'))

# print("F1 weighted: ") 
# print(sklearn.metrics.f1_score(predictions, y_test, average='weighted'))

# print('Precision per class')
# print(sklearn.metrics.precision_score(predictions, y_test, average=None))

# print('Recall per class')
# print(sklearn.metrics.recall_score(predictions, y_test, average=None))

# precision, recall, fscore, support = sklearn.metrics.precision_recall_fscore_support(y_test, predictions)
# print('precision: {}'.format(precision))
# print('recall: {}'.format(recall))
# print('fscore: {}'.format(fscore))
# print('support: {}'.format(support))

report = sklearn.metrics.classification_report(y_test, predictions)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.00      0.60      0.01       603
         1.0       0.02      0.11      0.04      5173
         2.0       0.56      0.36      0.43    103354
         3.0       0.00      0.00      0.00     92642

    accuracy                           0.19    201772
   macro avg       0.15      0.27      0.12    201772
weighted avg       0.29      0.19      0.22    201772



  _warn_prf(average, modifier, msg_start, len(result))


In [128]:
# Feature influence
print('Feature influence')
print(log_reg.coef_)
# print(np.std(X_train, 0)*log_ref.coef_)


Feature influence
[[ 8.93830071e-06 -1.19162890e-06  1.49081186e-08  3.18664141e-08
  -5.73179965e-08  7.92462037e-10  7.17089123e-10]
 [-6.62261236e-06  8.94516213e-07 -2.36067606e-08 -2.48173762e-08
   4.52883483e-08 -8.40244077e-10 -1.26547880e-09]
 [-6.92264075e-06  9.25343723e-07 -1.69611846e-08 -2.53794632e-08
   4.57695693e-08 -6.67470916e-10 -1.01184263e-09]
 [ 4.60695241e-06 -6.28231035e-07  2.56598265e-08  1.83304253e-08
  -3.37399210e-08  7.15252955e-10  1.56023230e-09]]


In [20]:
from sklearn.ensemble import RandomForestClassifier
random_forest=RandomForestClassifier(n_estimators=500)
random_forest.fit(X_train,y_train)

RandomForestClassifier(n_estimators=500)

In [21]:
# Eval
predictions = random_forest.predict(X_test)

# print("Accuracy")
# print(random_forest.score(X_test, y_test))

# print("F1 Macro")
# print(sklearn.metrics.f1_score(y_test, predictions,average='macro'))

# print("F1 weighted: ") 
# print(sklearn.metrics.f1_score(y_test, predictions, average='weighted'))

# print('Precision per class')
# print(sklearn.metrics.precision_score(y_test, predictions, average=None))

# print('Recall per class')
# print(sklearn.metrics.recall_score(y_test, predictions, average=None))

report = sklearn.metrics.classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         0.0       0.85      0.46      0.60       603
         1.0       0.73      0.55      0.63      5173
         2.0       0.80      0.80      0.80    103354
         3.0       0.78      0.79      0.78     92642

    accuracy                           0.79    201772
   macro avg       0.79      0.65      0.70    201772
weighted avg       0.79      0.79      0.79    201772



In [93]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100,500,1000], 'criterion': ['entropy', 'gini']}

grid_clf = GridSearchCV(random_forest, param_grid, scoring=['f1_weighted', 'f1_macro','accuracy'], refit='accuracy', cv=3)
grid_clf.fit(X_train, y_train)


GridSearchCV(cv=3,
             estimator=RandomForestClassifier(criterion='entropy',
                                              n_estimators=50),
             param_grid={'criterion': ['entropy', 'gini'],
                         'n_estimators': [100, 500, 1000]},
             refit='accuracy', scoring=['f1_weighted', 'f1_macro', 'accuracy'])

In [133]:
# Hyperparam grid search results and eval
grid_clf.cv_results_
grid_clf.best_estimator_
grid_clf.best_score_

import pickle
filename = 'grid_clf.best_estimator_.sav'
pickle.dump(grid_clf.best_estimator_, open(filename, 'wb'))

# predictions = grid_clf.best_estimator_.predict(X_train)
# predictions = random_forest.predict(X_train)
# # report = sklearn.metrics.classification_report(y_test, predictions)
# # print(report)
# grid_clf.best_estimator_.score(X_test, y_test)
# sklearn.metrics.accuracy_score(y_test, predictions)


In [132]:
# Feature influence
# From sklearn: The higher, the more important the feature. 
# The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. 
# It is also known as the Gini importance.

# This shows that lat and lon bin are the most important
# Zip code and species are least important
random_forest.feature_importances_

array([0.05849014, 0.09828617, 0.15330729, 0.33727531, 0.35264109])

# Regression
### (This did not work well)

In [127]:
X_train, X_test, y_train, y_test = train_test_split(data[['zipcode', 'spc_latin', 'tree_diameter', 'latBin', 'lonBin']], data.avg_health, test_size=0.20, random_state=0)

In [128]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_train, y_train)


LinearRegression()

In [129]:
predictions = lin_reg.predict(X_test)
print("Score")
print(lin_reg.score(X_test, y_test))

print('Feature influence')
print(lin_reg.coef_)
print(np.std(X_train, 0)*lin_reg.coef_)


Score
0.011350976452213057
Feature influence
[ 2.14129907e-05 -5.64659458e-07  4.75334829e-03  1.82898365e-01
 -3.07816016e-01]
zipcode          0.010920
spc_latin       -0.035764
tree_diameter    0.045405
latBin           0.016108
lonBin          -0.039221
dtype: float64


In [93]:
from sklearn.ensemble import RandomForestRegressor
random_forest_reg=RandomForestRegressor(n_estimators=100)
random_forest_reg.fit(X_train,y_train)

RandomForestRegressor()

In [109]:
print("Score")
random_forest_reg.score(X_test, y_test)


Score


0.4181915364123192