In [81]:
import pandas as pd
import csv
import numpy as np
import sklearn
import matplotlib.pyplot as plt


In [114]:
data = pd.read_csv('data/parsed_data.csv')


In [115]:
# Check how unbalanced dataset is
data.avg_health.value_counts()
# data.avg_health_round.value_counts()

3.000000    211722
2.000000    198186
2.500000     41560
2.666667     28065
2.750000     21896
             ...  
2.836735         1
2.342105         1
2.525000         1
2.022727         1
2.763158         1
Name: avg_health, Length: 882, dtype: int64

In [116]:
# Encode tree species as their frequency count rather than one hot encoding, since there are 100s of speicies
# Loss of info, but it's a tradeoff
data.spc_latin = data.spc_latin.map(data.spc_latin.value_counts()) 
data

Unnamed: 0.1,Unnamed: 0,borough,zipcode,spc_latin,tree_diameter,latBin,lonBin,lonDistance,latDistance,avg_health_round,avg_health
0,0,Manhattan,10001,86428,4,40.7485,-73.9855,5.730000e-04,0.000090,3.0,3.000000
1,1,Manhattan,10001,10486,10,40.7485,-73.9855,1.920000e-04,0.000247,3.0,3.000000
2,2,Manhattan,10001,86428,4,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
3,3,Manhattan,10001,86428,4,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
4,4,Manhattan,10001,86428,3,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
...,...,...,...,...,...,...,...,...,...,...,...
1008854,544364,Queens,11694,161433,16,40.5810,-73.8530,1.771800e-04,0.000005,3.0,3.000000
1008855,544365,Queens,11694,169398,11,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545
1008856,544366,Queens,11694,169398,14,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545
1008857,544367,Queens,11694,169398,14,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545


In [121]:
# Encode borough as number
borough_dict = {"Manhattan":1, "Brooklyn": 2, "Queens": 3, "Bronx":4, "Staten Island": 5}
# data.borough = data.borough.map(borough_dict) 
data["borough"] = data["borough"].map(borough_dict)

Unnamed: 0.1,Unnamed: 0,borough,zipcode,spc_latin,tree_diameter,latBin,lonBin,lonDistance,latDistance,avg_health_round,avg_health
0,0,1.0,10001,86428,4,40.7485,-73.9855,5.730000e-04,0.000090,3.0,3.000000
1,1,1.0,10001,10486,10,40.7485,-73.9855,1.920000e-04,0.000247,3.0,3.000000
2,2,1.0,10001,86428,4,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
3,3,1.0,10001,86428,4,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
4,4,1.0,10001,86428,3,40.7485,-73.9855,8.910000e-04,0.000042,3.0,3.000000
...,...,...,...,...,...,...,...,...,...,...,...
1008854,544364,3.0,11694,161433,16,40.5810,-73.8530,1.771800e-04,0.000005,3.0,3.000000
1008855,544365,3.0,11694,169398,11,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545
1008856,544366,3.0,11694,169398,14,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545
1008857,544367,3.0,11694,169398,14,40.5790,-73.8455,8.304000e-05,0.000279,2.0,2.454545


In [123]:
# Normalizing data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data) 
data_scaled = pd.DataFrame(scaler.transform(data),columns = data.columns)
data_scaled

Unnamed: 0.1,Unnamed: 0,borough,zipcode,spc_latin,tree_diameter,latBin,lonBin,lonDistance,latDistance,avg_health_round,avg_health
0,-1.711120,-1.732490,-1.885789,0.064052,-0.857056,0.581461,-0.506355,2.249844,-0.703413,1.022753,1.193242
1,-1.711114,-1.732490,-1.885789,-1.135102,-0.229356,0.581461,-0.506355,-0.099468,0.274502,1.022753,1.193242
2,-1.711107,-1.732490,-1.885789,0.064052,-0.857056,0.581461,-0.506355,4.210687,-1.002393,1.022753,1.193242
3,-1.711100,-1.732490,-1.885789,0.064052,-0.857056,0.581461,-0.506355,4.210687,-1.002393,1.022753,1.193242
4,-1.711093,-1.732490,-1.885789,0.064052,-0.961673,0.581461,-0.506355,4.210687,-1.002393,1.022753,1.193242
...,...,...,...,...,...,...,...,...,...,...,...
1008854,1.959050,0.028636,1.435117,1.248411,0.398344,-1.320512,0.533342,-0.190851,-1.232545,1.022753,1.193242
1008855,1.959057,0.028636,1.435117,1.374181,-0.124740,-1.343222,0.592193,-0.771335,0.476687,-0.761329,0.099594
1008856,1.959063,0.028636,1.435117,1.374181,0.189110,-1.343222,0.592193,-0.771335,0.476687,-0.761329,0.099594
1008857,1.959070,0.028636,1.435117,1.374181,0.189110,-1.343222,0.592193,-0.771335,0.476687,-0.761329,0.099594


# Classification


In [124]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[['zipcode', 'spc_latin', 'tree_diameter', 'latBin', 'lonBin']], data.avg_health_round, test_size=0.20, random_state=0)


In [125]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)


LogisticRegression()

In [126]:
predictions = log_reg.predict(X_test)
# print(sklearn.metrics.classification_report(predictions, y_test))
print("Accuracy")
print(log_reg.score(X_test, y_test))

print("F1 Macro")
print(sklearn.metrics.f1_score(predictions, y_test, average='macro'))

print("F1 weighted: ") 
print(sklearn.metrics.f1_score(predictions, y_test, average='weighted'))

print('Precision per class')
print(sklearn.metrics.precision_score(predictions, y_test, average=None))

print('Recall per class')
print(sklearn.metrics.recall_score(predictions, y_test, average=None))

Accuracy
0.5141595464187301
F1 Macro
0.22426943064215699
F1 weighted: 
0.5822163555792391
Precision per class
[0.         0.         0.85138456 0.16999849]
Recall per class
[0.         0.         0.51851459 0.49111264]


  _warn_prf(average, modifier, msg_start, len(result))


In [131]:
# Feature influence
print('Feature influence')
print(log_reg.coef_)
# print(np.std(X_train, 0)*log_ref.coef_)


Feature influence
[[-2.77116772e-04 -1.03889425e-06 -2.18172155e-07 -1.04088725e-06
   1.89083628e-06]
 [-9.02365651e-05  1.00639531e-06 -1.49545755e-07 -3.59859710e-07
   6.56527558e-07]
 [ 1.82790914e-04  8.17103382e-07 -1.00022108e-06  6.77838710e-07
  -1.22820187e-06]
 [ 1.84562424e-04 -7.84604447e-07  1.36793899e-06  7.22908248e-07
  -1.31916197e-06]]


In [88]:
from sklearn.ensemble import RandomForestClassifier
random_forest=RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train,y_train)

RandomForestClassifier()

In [89]:
# predictions = random_forest.predict(X_test)
random_forest.score(X_test, y_test)


0.7768322661221577

In [90]:
# Eval
print("Accuracy")
print(random_forest.score(X_test, y_test))

print("F1 Macro")
print(sklearn.metrics.f1_score(predictions, y_test, average='macro'))

print("F1 weighted: ") 
print(sklearn.metrics.f1_score(predictions, y_test, average='weighted'))

print('Precision per class')
print(sklearn.metrics.precision_score(predictions, y_test, average=None))

print('Recall per class')
print(sklearn.metrics.recall_score(predictions, y_test, average=None))

Accuracy
0.7768322661221577
F1 Macro
0.6645672309596108
F1 weighted: 
0.778141631357517
Precision per class
[0.38474295 0.46742702 0.80440041 0.76590531]
Recall per class
[0.79725086 0.75515303 0.78082912 0.77288819]


In [132]:
# Feature influence
# From sklearn: The higher, the more important the feature. 
# The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. 
# It is also known as the Gini importance.

# This shows that lat and lon bin are the most important
# Zip code and species are least important
random_forest.feature_importances_

array([0.05849014, 0.09828617, 0.15330729, 0.33727531, 0.35264109])

# Regression

In [127]:
X_train, X_test, y_train, y_test = train_test_split(data[['zipcode', 'spc_latin', 'tree_diameter', 'latBin', 'lonBin']], data.avg_health, test_size=0.20, random_state=0)

In [128]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_train, y_train)


LinearRegression()

In [129]:
predictions = lin_reg.predict(X_test)
print("Score")
print(lin_reg.score(X_test, y_test))

print('Feature influence')
print(lin_reg.coef_)
print(np.std(X_train, 0)*lin_reg.coef_)


Score
0.011350976452213057
Feature influence
[ 2.14129907e-05 -5.64659458e-07  4.75334829e-03  1.82898365e-01
 -3.07816016e-01]
zipcode          0.010920
spc_latin       -0.035764
tree_diameter    0.045405
latBin           0.016108
lonBin          -0.039221
dtype: float64


In [93]:
from sklearn.ensemble import RandomForestRegressor
random_forest_reg=RandomForestRegressor(n_estimators=100)
random_forest_reg.fit(X_train,y_train)

RandomForestRegressor()

In [109]:
print("Score")
random_forest_reg.score(X_test, y_test)


Score


0.4181915364123192