In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

In [None]:
# Read the street tree census CSV; the path is relative to the working directory.
# NOTE(review): the ordinal encoder for `steward` expects the literal string
# 'None'; newer pandas releases may parse 'None' as a missing value by default,
# so confirm those rows survive loading with the installed pandas version.
census_csv = '2015_Street_Tree_Census_-_Tree_Data.csv'
tree_df = pd.read_csv(census_csv)

In [None]:
# Remove columns that are not useful for modelling or that are collinear
# with columns that are kept.
columns_to_drop = [
    'borocode', 'x_sp', 'y_sp', 'state', 'nta_name', 'zip_city', 'address',
    'spc_latin', 'created_at', 'tree_id', 'block_id', 'user_type', 'bin',
    'bbl', 'council district', 'boro_ct', 'census tract', 'problems', 'status',
    'stump_diam', 'postcode', 'community board', 'cncldist', 'st_assem',
    'st_senate', 'nta', 'spc_common',
]
tree_df = tree_df.drop(columns=columns_to_drop)

In [None]:
# Keep only rows that have a health label, then discard any row that still has
# a missing value in any column (the original note says this removes dead trees).
has_health = tree_df['health'].notna()
tree_df = tree_df[has_health].dropna(how='any')

In [None]:
# Standardise the diameter column `tree_dbh`, the only continuous numerical
# feature that remains after the column drop.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split further down, so test rows influence the scaling statistics.
scaler = StandardScaler().fit(tree_df[['tree_dbh']])
tree_df['tree_dbh'] = scaler.transform(tree_df[['tree_dbh']])

In [None]:
# Ordinal encoding for categorical features with an inherent order.
# Each list names the levels from the lowest encoded value (0) upwards.
health = ['Poor', 'Fair', 'Good']
steward = ['None', '1or2', '3or4', '4orMore']
sidewalk = ['NoDamage', 'Damage']
curbloc = ['OnCurb', 'OffsetFromCurb']
yes_no = ['No', 'Yes']

# Binary flag columns that all share the No/Yes ordering.
yes_no_features = [
    'brch_other', 'brch_shoe', 'brch_light', 'trnk_other', 'trnk_light',
    'trunk_wire', 'root_other', 'root_grate', 'root_stone'
]

# Map every ordinal column to its ordered levels; the flag columns come last.
ordered_levels = {
    'health': health,
    'steward': steward,
    'sidewalk': sidewalk,
    'curb_loc': curbloc,
}
for flag in yes_no_features:
    ordered_levels[flag] = yes_no

for feat, levels in ordered_levels.items():
    enc = OrdinalEncoder(categories=[levels])
    tree_df[feat] = enc.fit_transform(tree_df[[feat]])

# One-hot encoding for categorical features without an inherent order.
# Each source column is replaced by one indicator column per category,
# named '<column>_<category>'.
enc = OneHotEncoder()
for nominal in ('guards', 'borough'):
    indicator_matrix = enc.fit_transform(tree_df[[nominal]])
    indicator_names = nominal + '_' + enc.categories_[0][:]
    tree_df[indicator_names] = indicator_matrix.toarray()
    tree_df = tree_df.drop(columns=[nominal])

In [None]:
# Separate the encoded `health` column (target, as a NumPy array) from the
# remaining columns (features, kept as a DataFrame).
x = tree_df.drop(columns=['health'])
y = tree_df['health'].to_numpy()

In [None]:
# Stratified 80/20 split into development and test sets, followed by a second
# stratified 80/20 split of the development set into training and validation.
split_opts = dict(test_size=0.2, random_state=42)
x_dev, x_test, y_dev, y_test = train_test_split(x, y, stratify=y, **split_opts)
x_train, x_val, y_train, y_val = train_test_split(
    x_dev, y_dev, stratify=y_dev, **split_opts
)

In [None]:
# Random over-sampling, applied to the training split only.
ros = RandomOverSampler(random_state=42)
oversampled = ros.fit_resample(x_train, y_train)
x_ros, y_ros = oversampled

In [None]:
# Random under-sampling, applied to the training split only.
rus = RandomUnderSampler(random_state=42)
undersampled = rus.fit_resample(x_train, y_train)
x_rus, y_rus = undersampled

In [None]:
# SMOTE over-sampling, applied to the training split only.
smote = SMOTE(random_state=42)
smote_resampled = smote.fit_resample(x_train, y_train)
x_smote, y_smote = smote_resampled

Logistic Regression

In [None]:
# model evaluation function
def lr_result(model, X_train, y_train, X_test, y_test, report = False):
    """Print train/test metrics for a fitted classifier and plot its test confusion matrix.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``score``.
    X_train, y_train : features and labels of the data the model was fitted on
        (this may be a resampled training set).
    X_test, y_test : features and labels of the held-out data.
    report : bool, default False
        When true, also print ``classification_report`` for the test predictions.

    Returns
    -------
    None
        Results are printed and the confusion matrix is shown as a heatmap.
    """
    # predict
    y_train_predict = model.predict(X_train)
    y_test_predict = model.predict(X_test)

    # Score the data that was passed in, not the notebook-level x_train/x_test:
    # with a resampled training set the globals have a different number of rows
    # than y_train, which makes model.score raise.
    print("Accuracy of Training data: ", model.score(X_train, y_train))
    print("Accuracy of Testing data: ", model.score(X_test, y_test))

    # These two lines report weighted F1, so label them as such.
    print("Weighted F1 of Training data: ", f1_score(y_train, y_train_predict, average='weighted'))
    print("Weighted F1 of Testing data: ", f1_score(y_test, y_test_predict, average='weighted'))

    # show result report
    if report:
        print(classification_report(y_test, y_test_predict))

    # Draw the confusion matrix on its own figure so it never lands on axes
    # left open by an earlier plot in the same cell.
    cf = confusion_matrix(y_test, y_test_predict)
    _, ax = plt.subplots()
    sns.heatmap(cf, annot = True, fmt = 'g', ax = ax)
    plt.show()

In [None]:
def feature_importance(x_train, model):
    """Plot one importance bar per feature for a fitted model or fitted search object.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; its column names label the bars.
    model : fitted search object or fitted estimator
        If it has ``best_estimator_`` that estimator is inspected, otherwise
        ``model`` itself is.

    Returns
    -------
    None
        The bar chart is shown immediately.

    Raises
    ------
    AttributeError
        If the estimator exposes neither ``feature_importances_`` nor ``coef_``.
    """
    features = x_train.columns.values
    estimator = getattr(model, 'best_estimator_', model)

    if hasattr(estimator, 'feature_importances_'):
        importances = estimator.feature_importances_
    elif hasattr(estimator, 'coef_'):
        # Linear models such as LogisticRegression have no feature_importances_;
        # use the absolute coefficient, averaged over the rows of coef_, as the
        # importance of each feature.
        importances = np.abs(np.atleast_2d(estimator.coef_)).mean(axis=0)
    else:
        raise AttributeError(
            type(estimator).__name__
            + " exposes neither feature_importances_ nor coef_"
        )

    plt.figure(figsize = (12, 6))
    ax = sns.barplot(x = features, y = importances)
    ax.tick_params(axis = 'x', rotation = 90)
    # Show now so that later plots in the same cell do not draw on these axes.
    plt.show()

In [None]:
# Base logistic regression model shared by all three searches.
lr = LogisticRegression(random_state = 2022)

# Candidate hyperparameter values sampled by the randomized search.
param_list = {'C': [.1, 1, 10, 50, 100],
              'tol': [.0001, .001, .01, .1, 1],
              'max_iter': [10, 20, 50, 100]}

# Settings common to every search: 5-fold CV, all cores, fixed seed.
search_kwargs = dict(param_distributions = param_list,
                     cv = 5,
                     n_jobs = -1,
                     random_state = 2022)

# One search per selection metric: accuracy, macro F1 and macro recall.
lr_rand_search_ac = RandomizedSearchCV(lr, scoring = 'accuracy', **search_kwargs)
lr_rand_search_f1 = RandomizedSearchCV(lr, scoring = 'f1_macro', **search_kwargs)
lr_rand_search_recall = RandomizedSearchCV(lr, scoring = 'recall_macro', **search_kwargs)

In [None]:
# Hyperparameter tuning on the plain stratified training split.
# For each selection metric: fit the search, plot feature importances, print
# the best parameters, then evaluate the best estimator on the test set.
# NOTE(review): evaluation uses x_test; the validation split (x_val, y_val)
# created earlier is not used here.
print('Random Stratified Sampling')
for metric_label, search in (
    ('Accuracy', lr_rand_search_ac),
    ('F1 Score', lr_rand_search_f1),
    ('Recall', lr_rand_search_recall),
):
    print(metric_label)
    search.fit(x_train, y_train)
    feature_importance(x_train, search)
    print('Best Parameters:', search.best_params_)
    lr_best = search.best_estimator_
    lr_result(lr_best, x_train, y_train, x_test, y_test)

In [None]:
# Hyperparameter tuning on the randomly over-sampled training data.
# For each selection metric: fit the search, plot feature importances, print
# the best parameters, then evaluate the best estimator on the test set.
print('Random Over Sampling')
for metric_label, search in (
    ('Accuracy', lr_rand_search_ac),
    ('F1 Score', lr_rand_search_f1),
    ('Recall', lr_rand_search_recall),
):
    print(metric_label)
    search.fit(x_ros, y_ros)
    feature_importance(x_ros, search)
    print('Best Parameters:', search.best_params_)
    lr_best = search.best_estimator_
    lr_result(lr_best, x_ros, y_ros, x_test, y_test)

In [None]:
# Hyperparameter tuning on the randomly under-sampled training data.
# For each selection metric: fit the search, plot feature importances, print
# the best parameters, then evaluate the best estimator on the test set.
print('Random Under Sampling')
for metric_label, search in (
    ('Accuracy', lr_rand_search_ac),
    ('F1 Score', lr_rand_search_f1),
    ('Recall', lr_rand_search_recall),
):
    print(metric_label)
    search.fit(x_rus, y_rus)
    feature_importance(x_rus, search)
    print('Best Parameters:', search.best_params_)
    lr_best = search.best_estimator_
    lr_result(lr_best, x_rus, y_rus, x_test, y_test)

In [None]:
# Hyperparameter tuning on the SMOTE-resampled training data.
# For each selection metric: fit the search, plot feature importances, print
# the best parameters, then evaluate the best estimator on the test set.
print('SMOTE Sampling')
for metric_label, search in (
    ('Accuracy', lr_rand_search_ac),
    ('F1 Score', lr_rand_search_f1),
    ('Recall', lr_rand_search_recall),
):
    print(metric_label)
    search.fit(x_smote, y_smote)
    feature_importance(x_smote, search)
    print('Best Parameters:', search.best_params_)
    lr_best = search.best_estimator_
    lr_result(lr_best, x_smote, y_smote, x_test, y_test)