In [5]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

tfidfconverter = TfidfVectorizer()


In [6]:
warnings.filterwarnings('ignore')

#import train and test CSV files
train = pd.read_csv("TestData_v8.csv")
predictors = train.drop(["classification"], axis=1)
target = train["classification"]
train.describe(include="all")


Unnamed: 0,classification,postal_code,street_number,street_name,street_type
count,10557,10557,10557.0,10557,10557
unique,2,5198,,2561,41
top,Residential,B0J 1N0,,SACKVILLE,RD
freq,5280,109,,189,2974
mean,,,1157.307663,,
std,,,2036.312631,,
min,,,1.0,,
25%,,,34.0,,
50%,,,137.0,,
75%,,,1410.0,,


In [7]:
# predictors['street_number'] = tfidfconverter.fit_transform(predictors['street_number']).toarray()
predictors['street_name'] = tfidfconverter.fit_transform(predictors['street_name']).toarray()
predictors['street_type'] = tfidfconverter.fit_transform(predictors['street_type']).toarray()
predictors['postal_code'] = tfidfconverter.fit_transform(predictors['postal_code']).toarray()

# testing and training split  
x_train, x_val, y_train, y_test = train_test_split(predictors, target, test_size = 0.2)  

In [None]:
# Use the random grid to search for best hyperparameters

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1800, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

In [None]:
# finding the best parameters
rf_random.best_params_



In [10]:
randomforest = RandomForestClassifier(n_estimators= 377,
min_samples_split= 2,
min_samples_leaf= 4,
max_features= 'sqrt',
max_depth= 10,
bootstrap= True)
randomforest._vectorizer = tfidfconverter
randomforest.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=377,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
y_pred = randomforest.predict(x_val)




In [12]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


[[618 424]
 [294 776]]
                 precision    recall  f1-score   support

Non Residential       0.68      0.59      0.63      1042
    Residential       0.65      0.73      0.68      1070

       accuracy                           0.66      2112
      macro avg       0.66      0.66      0.66      2112
   weighted avg       0.66      0.66      0.66      2112

0.6600378787878788


In [13]:
pickle.dump(randomforest, open('model.pkl', 'wb'))



In [44]:
loaded_model = pickle.load(open('model.pkl', 'rb'))
street_number = 55
street_name = "powers"
street_type = "dr"
postal_code = "b3v1g6"
new_df = pd.DataFrame({"postal_code":[postal_code],"street_number":[street_number], "street_name":[street_name], 'street_type' : [street_type]}) 
# new_df['street_number'] = tfidfconverter.transform(new_df['street_number']).toarray()
new_df['street_name'] = tfidfconverter.transform(new_df['street_name']).toarray()
new_df['street_type'] = tfidfconverter.transform(new_df['street_type']).toarray()
new_df['postal_code'] = tfidfconverter.transform(new_df['postal_code']).toarray()
result = str(loaded_model.predict(new_df)[0])
print(result)
prob = list(loaded_model.predict_proba(new_df)[0])
if result == "Residential":
    score = prob[1]
    # print(score)
else:
    score = prob[0]
    # print(score)
print(score)




Residential
0.6302924953642615
