# Preprocessing
From Kiyan

Hey everyone, just pushed a preprocessing script.
you guys can decide what to use for the particular models, but there is a stratified dev and test set. for those that want to use the imbalanced sampling sets, I sampled the training set NOT THE DEV SET. I would reccomed to try to use train, val and test for all the models instead of just the dev and test.

In [22]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [23]:
# load in data with relative path
tree_df = pd.read_csv('2015_Street_Tree_Census_-_Tree_Data.csv')

In [24]:
# drop columns
tree_df = tree_df.drop(columns=[
    'borocode', 'x_sp', 'y_sp', 'state', 'nta_name', 'zip_city', 'address',
    'spc_latin', 'created_at', 'tree_id', 'block_id', 'user_type', 'bin',
    'bbl', 'council district', 'boro_ct', 'census tract', 'problems', 'status',
    'stump_diam', 'postcode', 'community board', 'cncldist', 'st_assem', 'st_senate', 
    'nta', 'spc_common'
])

# drop NA and dead tree values
tree_df = tree_df[tree_df['health'].notna()]
tree_df = tree_df.dropna(how='any')

In [25]:
# scale diameters 
scaler = StandardScaler()
tree_df['tree_dbh'] = scaler.fit_transform(tree_df[['tree_dbh']])

# Ordinal Encode
health = ['Poor', 'Fair', 'Good']
enc = OrdinalEncoder(categories=[health])
tree_df['health'] = enc.fit_transform(tree_df[['health']])

steward = ['None', '1or2', '3or4', '4orMore']
enc = OrdinalEncoder(categories=[steward])
tree_df['steward'] = enc.fit_transform(tree_df[['steward']])

sidewalk = ['NoDamage', 'Damage']
enc = OrdinalEncoder(categories=[sidewalk])
tree_df['sidewalk'] = enc.fit_transform(tree_df[['sidewalk']])

curbloc = ['OnCurb', 'OffsetFromCurb']
enc = OrdinalEncoder(categories=[curbloc])
tree_df['curb_loc'] = enc.fit_transform(tree_df[['curb_loc']])

yes_no = ['No', 'Yes']
enc = OrdinalEncoder(categories=[yes_no])

yes_no_features = [
    'brch_other', 'brch_shoe', 'brch_light', 'trnk_other', 'trnk_light',
    'trunk_wire', 'root_other', 'root_grate', 'root_stone'
]

for feat in yes_no_features:
    tree_df[feat] = enc.fit_transform(tree_df[[feat]])
    
# One Hot Encode
enc = OneHotEncoder()
guards = enc.fit_transform(tree_df[['guards']])
tree_df['guards_' + enc.categories_[0][:]] = guards.toarray()
tree_df = tree_df.drop(columns=['guards'])

guards = enc.fit_transform(tree_df[['borough']])
tree_df['borough_' + enc.categories_[0][:]] = guards.toarray()
tree_df = tree_df.drop(columns=['borough'])

In [26]:
# split to x and y sets
y = np.asarray(tree_df['health'])
x = tree_df.drop(columns=['health'])

In [27]:
# random sample (stratified) for dev, test, train and val
x_dev, x_test, y_dev, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify = y)
x_train, x_val, y_train, y_val = train_test_split(x_dev, y_dev, test_size=0.2, random_state=42, stratify = y_dev)

In [28]:
# over-sampling of training data
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x_train, y_train)

In [29]:
# under sampling of training data
rus = RandomUnderSampler(random_state=42)
x_rus, y_rus = rus.fit_resample(x_train, y_train)

In [30]:
# smote sampling of training data
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(x_train, y_train)

# Model: SVM

In [31]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, make_scorer, roc_auc_score
from sklearn.svm import LinearSVC, SVC
import seaborn as sns
# print the training and testing accuracy of a SVM model and plot confusion matrix is cf = True
def SVM_result(svm, X_train, y_train, X_test, y_test, report = False, cf = False):
    # fit the model
    svm.fit(X_train, y_train)
    # predict y
    y_train_predict = svm.predict(X_train)
    y_test_predict = svm.predict(X_test)    
    # accuracy scores
    acc_predit_train = accuracy_score(y_train_predict,y_train)
    acc_predit_test = accuracy_score(y_test_predict,y_test)
    # print trainind and testing accuracy
    print("Accuracy of Training data: ", acc_predit_train)
    print("Accuracy of Testing data: ", acc_predit_test)
    # show result report
    if (report):
        print(classification_report(y_test, y_test_predict))
    # show confusion_matrix
    if (cf):
        cf = confusion_matrix(y_test, primal_svm.predict(X_test))
        sns.heatmap(cf, annot = True, fmt = 'g')

In [32]:
#np.unique(y_test, return_counts=True) #[  5363,  19301, 105770]
#len(y_test) # 130434
#(105770 / 130434) # baseline model 0.8109081987825261

## Modeling and hyperparameter tuning for different sampling

In [33]:
import warnings
warnings.filterwarnings('ignore')
# basic primal and dual svm before hyperparameter tuning
primal_svm = LinearSVC(tol=0.0001, dual = False) 
dual_svm = SVC(kernel = "linear")
# random search for hyperparameter tuning
rand_list = {"C": stats.uniform(0, 5)} # random list for hyperparameter tuning
rand_search = RandomizedSearchCV(primal_svm, 
                                 param_distributions = rand_list, 
                                 cv = 5, 
                                 random_state = 2022) 

### SVM for original model

In [38]:
# tree health: 2:good; 1:fair; 0:poor
from scipy import stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

rand_search.fit(x_train, y_train) 
primal_svm_best = rand_search.best_estimator_
print("Primal SVM for original model:")
print(primal_svm_best)
SVM_result(primal_svm_best, x_train, y_train, x_test, y_test)

Primal SVM for original model:
LinearSVC(C=3.6056746458234272, dual=False)
Accuracy of Training data:  0.8108522525803329
Accuracy of Testing data:  0.8109081987825261


### SVM after undersampling 

In [39]:
# Hyperparameter tuning
rand_search.fit(x_rus, y_rus) 
primal_svm_best = rand_search.best_estimator_
print("Primal SVM after undersampling:")
print(primal_svm_best)
SVM_result(primal_svm_best, x_rus, y_rus, x_test, y_test)

Primal SVM after undersampling:
LinearSVC(C=0.04679306903882352, dual=False)
Accuracy of Training data:  0.41818640874749946
Accuracy of Testing data:  0.502269346949415


### SVM after oversampling 

In [40]:
print("Primal SVM after oversampling:")
rand_list = {"C": stats.uniform(0, 5)}
rand_search = RandomizedSearchCV(primal_svm, 
                                 param_distributions = rand_list, 
                                 cv = 5, 
                                 random_state = 2022) 
rand_search.fit(x_ros, y_ros)
primal_svm_best = rand_search.best_estimator_
print("Primal SVM for original model:")
print(primal_svm_best)
SVM_result(primal_svm, x_ros, y_ros, x_test, y_test, cf = False)

Primal SVM after oversampling:
Primal SVM for original model:
LinearSVC(C=0.24987009088815237, dual=False)
Accuracy of Training data:  0.41854698051682654
Accuracy of Testing data:  0.5074980449882699


### SVM after smote

In [41]:
print("Primal SVM after smote:")
rand_search = RandomizedSearchCV(primal_svm, 
                                 param_distributions = rand_list, 
                                 cv = 5, 
                                 random_state = 2022) 
rand_search.fit(x_smote, y_smote)
primal_svm_best = rand_search.best_estimator_
print("Primal SVM for original model:")
print(primal_svm_best)
SVM_result(primal_svm, x_smote, y_smote, x_test, y_test, cf = False)

Primal SVM after smote:
Primal SVM for original model:
LinearSVC(C=2.4952890542599397, dual=False)
Accuracy of Training data:  0.4181934214374983
Accuracy of Testing data:  0.49610530996519314
