# Preprocessing
From Kiyan

Hey everyone, just pushed a preprocessing script.
you guys can decide what to use for the particular models, but there is a stratified dev and test set. for those that want to use the imbalanced sampling sets, I sampled the training set NOT THE DEV SET. I would reccomed to try to use train, val and test for all the models instead of just the dev and test.

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# load in data with relative path
tree_df = pd.read_csv('2015_Street_Tree_Census_-_Tree_Data.csv')

In [3]:
# drop columns
tree_df = tree_df.drop(columns=[
    'borocode', 'x_sp', 'y_sp', 'state', 'nta_name', 'zip_city', 'address',
    'spc_latin', 'created_at', 'tree_id', 'block_id', 'user_type', 'bin',
    'bbl', 'council district', 'boro_ct', 'census tract', 'problems', 'status',
    'stump_diam', 'postcode', 'community board', 'cncldist', 'st_assem', 'st_senate', 
    'nta', 'spc_common'
])

# drop NA and dead tree values
tree_df = tree_df[tree_df['health'].notna()]
tree_df = tree_df.dropna(how='any')

In [4]:
# scale diameters 
scaler = StandardScaler()
tree_df['tree_dbh'] = scaler.fit_transform(tree_df[['tree_dbh']])

# Ordinal Encode
health = ['Poor', 'Fair', 'Good']
enc = OrdinalEncoder(categories=[health])
tree_df['health'] = enc.fit_transform(tree_df[['health']])

steward = ['None', '1or2', '3or4', '4orMore']
enc = OrdinalEncoder(categories=[steward])
tree_df['steward'] = enc.fit_transform(tree_df[['steward']])

sidewalk = ['NoDamage', 'Damage']
enc = OrdinalEncoder(categories=[sidewalk])
tree_df['sidewalk'] = enc.fit_transform(tree_df[['sidewalk']])

curbloc = ['OnCurb', 'OffsetFromCurb']
enc = OrdinalEncoder(categories=[curbloc])
tree_df['curb_loc'] = enc.fit_transform(tree_df[['curb_loc']])

yes_no = ['No', 'Yes']
enc = OrdinalEncoder(categories=[yes_no])

yes_no_features = [
    'brch_other', 'brch_shoe', 'brch_light', 'trnk_other', 'trnk_light',
    'trunk_wire', 'root_other', 'root_grate', 'root_stone'
]

for feat in yes_no_features:
    tree_df[feat] = enc.fit_transform(tree_df[[feat]])
    
# One Hot Encode
enc = OneHotEncoder()
guards = enc.fit_transform(tree_df[['guards']])
tree_df['guards_' + enc.categories_[0][:]] = guards.toarray()
tree_df = tree_df.drop(columns=['guards'])

guards = enc.fit_transform(tree_df[['borough']])
tree_df['borough_' + enc.categories_[0][:]] = guards.toarray()
tree_df = tree_df.drop(columns=['borough'])

In [5]:
# split to x and y sets
y = np.asarray(tree_df['health'])
x = tree_df.drop(columns=['health'])

In [6]:
# random sample (stratified) for dev, test, train and val
x_dev, x_test, y_dev, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify = y)
x_train, x_val, y_train, y_val = train_test_split(x_dev, y_dev, test_size=0.2, random_state=42, stratify = y_dev)

In [7]:
# over-sampling of training data
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x_train, y_train)

In [8]:
# under sampling of training data
rus = RandomUnderSampler(random_state=42)
x_rus, y_rus = rus.fit_resample(x_train, y_train)

In [9]:
# smote sampling of training data
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(x_train, y_train)

# Model: SVM

In [38]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, make_scorer, roc_auc_score, classification_report
from sklearn.svm import LinearSVC, SVC
import seaborn as sns
"""
A short cut to print the svm model result.
    Args:
        svm: the svm model.
        X_train: training data of X.
        y_train: training data of y.
        X_test: testing data of X.   
        y_test: testing data of y.
        report: boolean; print the classification report if true, default as "True".
        cf: boolean; print the confusion matrix if true, default as "True".
    Returns:
        Print the training and the testing accuracy.
        Return the classification report if report = True.
        Return the confusion matrix if report = True.
"""
def SVM_result(svm, X_train, y_train, X_test, y_test, report_test = True, report_train = False, cf = False):
    # fit the model
    svm.fit(X_train, y_train)
    # predict y
    y_train_predict = svm.predict(X_train)
    y_test_predict = svm.predict(X_test)    
    # show result report on testing data
    if (report_test):
        print("Result on test data:")
        print(classification_report(y_test, y_test_predict))
    # show result report on training data
    if (report_train):
        print("Result on training data:")
        print(classification_report(y_train, y_train_predict))
    # show confusion_matrix
    if (cf):
        cf = confusion_matrix(y_test, y_test_predict)
        sns.heatmap(cf, annot = True, fmt = 'g')

## Modeling and hyperparameter tuning for different sampling

In [47]:
import warnings
from scipy import stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

# Basic SVM models
warnings.filterwarnings('ignore')
# primal svm (for large data set) before hyperparameter tuning
svm = LinearSVC(tol=0.0001, dual = False, random_state = 2022) 
# random search for hyperparameter tuning
param_list = {'C': stats.uniform(0, 5), 
              'loss': ['hinge', 'squared_hinge']} # random list for hyperparameter tuning 
rand_search_ac = RandomizedSearchCV(svm, 
                                    param_distributions = param_list, 
                                    cv = 5,
                                    scoring = 'accuracy',
                                    random_state = 2022) 

rand_search_f1 = RandomizedSearchCV(svm, 
                                    param_distributions = param_list, 
                                    cv = 5,
                                    scoring = 'f1_macro',
                                    random_state = 2022) 

rand_search_recall = RandomizedSearchCV(svm, 
                                        param_distributions = param_list, 
                                        cv = 5,
                                        scoring = 'recall_macro',
                                        random_state = 2022) 

### SVM for original model

In [49]:
# Hyperparameter tuning random Sampling

# accuracy as scoring metric
rand_search_ac.fit(x_train, y_train)
print(f"Best parameters for tuning on accuray: {rand_search_ac.best_params_}")
SVM_result(rand_search_ac.best_estimator_, x_train, y_train, x_test, y_test)

# f1 as scoring metric
rand_search_f1.fit(x_train, y_train) 
print(f"Best parameters for tuning on f1: {rand_search_f1.best_params_}")
SVM_result(rand_search_f1.best_estimator_, x_train, y_train, x_test, y_test)

# macro-recall as scoring metric
rand_search_recall.fit(x_train, y_train) 
print(f"Best parameters for tuning on macro recall: {rand_search_recall.best_params_}")
SVM_result(rand_search_recall.best_estimator_, x_train, y_train, x_test, y_test)

Best parameters for tuning on accuray: {'C': 4.620282385385153, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      5363
         1.0       0.35      0.01      0.02     19301
         2.0       0.81      1.00      0.90    105770

    accuracy                           0.81    130434
   macro avg       0.39      0.34      0.31    130434
weighted avg       0.71      0.81      0.73    130434

Best parameters for tuning on f1: {'C': 4.620282385385153, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      5363
         1.0       0.35      0.01      0.02     19301
         2.0       0.81      1.00      0.90    105770

    accuracy                           0.81    130434
   macro avg       0.39      0.34      0.31    130434
weighted avg       0.71      0.81      0.73    130434

Best parameters for tun

### SVM after undersampling 

In [48]:
# Hyperparameter tuning under sampling

# accuracy as scoring metric
rand_search_ac.fit(x_rus, y_rus)
print(f"Best parameters for tuning on accuray: {rand_search_ac.best_params_}")
SVM_result(rand_search_ac.best_estimator_, x_rus, y_rus, x_test, y_test)

# f1 as scoring metric
rand_search_f1.fit(x_rus, y_rus) 
print(f"Best parameters for tuning on f1: {rand_search_f1.best_params_}")
SVM_result(rand_search_f1.best_estimator_, x_rus, y_rus, x_test, y_test)

# macro-recall as scoring metric
rand_search_recall.fit(x_rus, y_rus) 
print(f"Best parameters for tuning on macro recall: {rand_search_recall.best_params_}")
SVM_result(rand_search_recall.best_estimator_, x_rus, y_rus, x_test, y_test)

Best parameters for tuning on accuray: {'C': 0.04679306903882352, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.07      0.53      0.12      5363
         1.0       0.18      0.16      0.17     19301
         2.0       0.85      0.56      0.68    105770

    accuracy                           0.50    130434
   macro avg       0.36      0.42      0.32    130434
weighted avg       0.72      0.50      0.58    130434

Best parameters for tuning on f1: {'C': 0.04679306903882352, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.07      0.53      0.12      5363
         1.0       0.18      0.16      0.17     19301
         2.0       0.85      0.56      0.68    105770

    accuracy                           0.50    130434
   macro avg       0.36      0.42      0.32    130434
weighted avg       0.72      0.50      0.58    130434

Best parameters for

### SVM after oversampling 

In [50]:
# Hyperparameter tuning over sampling

# accuracy as scoring metric
rand_search_ac.fit(x_ros, y_ros)
print(f"Best parameters for tuning on accuray: {rand_search_ac.best_params_}")
SVM_result(rand_search_ac.best_estimator_, x_ros, y_ros, x_test, y_test)

# f1 as scoring metric
rand_search_f1.fit(x_ros, y_ros) 
print(f"Best parameters for tuning on f1: {rand_search_f1.best_params_}")
SVM_result(rand_search_f1.best_estimator_, x_ros, y_ros, x_test, y_test)

# macro-recall as scoring metric
rand_search_recall.fit(x_ros, y_ros) 
print(f"Best parameters for tuning on macro recall: {rand_search_recall.best_params_}")
SVM_result(rand_search_recall.best_estimator_, x_ros, y_ros, x_test, y_test)

Best parameters for tuning on accuray: {'C': 4.620282385385153, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.07      0.53      0.12      5363
         1.0       0.18      0.15      0.16     19301
         2.0       0.85      0.57      0.68    105770

    accuracy                           0.51    130434
   macro avg       0.37      0.42      0.32    130434
weighted avg       0.72      0.51      0.58    130434

Best parameters for tuning on f1: {'C': 2.5660378900685785, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.07      0.53      0.12      5363
         1.0       0.18      0.15      0.16     19301
         2.0       0.85      0.57      0.68    105770

    accuracy                           0.51    130434
   macro avg       0.37      0.42      0.32    130434
weighted avg       0.72      0.51      0.58    130434

Best parameters for tu

### SVM after smote

In [51]:
# Hyperparameter tuning after smote sampling

# accuracy as scoring metric
rand_search_ac.fit(x_smote, y_smote)
print(f"Best parameters for tuning on accuray: {rand_search_ac.best_params_}")
SVM_result(rand_search_ac.best_estimator_, x_smote, y_smote, x_test, y_test)

# f1 as scoring metric
rand_search_f1.fit(x_smote, y_smote) 
print(f"Best parameters for tuning on f1: {rand_search_f1.best_params_}")
SVM_result(rand_search_f1.best_estimator_, x_smote, y_smote, x_test, y_test)

# macro-recall as scoring metric
rand_search_recall.fit(x_smote, y_smote) 
print(f"Best parameters for tuning on macro recall: {rand_search_recall.best_params_}")
SVM_result(rand_search_recall.best_estimator_, x_smote, y_smote, x_test, y_test)

Best parameters for tuning on accuray: {'C': 0.04679306903882352, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.06      0.54      0.12      5363
         1.0       0.18      0.16      0.17     19301
         2.0       0.85      0.56      0.67    105770

    accuracy                           0.50    130434
   macro avg       0.37      0.42      0.32    130434
weighted avg       0.72      0.50      0.58    130434

Best parameters for tuning on f1: {'C': 1.5080409805315753, 'loss': 'squared_hinge'}
Result on test data:
              precision    recall  f1-score   support

         0.0       0.06      0.54      0.12      5363
         1.0       0.18      0.16      0.17     19301
         2.0       0.85      0.56      0.67    105770

    accuracy                           0.50    130434
   macro avg       0.37      0.42      0.32    130434
weighted avg       0.72      0.50      0.58    130434

Best parameters for 