In [1]:
# Initial imports.
import sqlalchemy
import numpy as np
from pathlib import Path
from collections import Counter
import pandas as pd
from path import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine

In [2]:
# Loading data
file_path = Path("../Data/final_table_ml_stage2.csv")
risk_df = pd.read_csv(file_path)
risk_df.head()

Unnamed: 0,business_postal_code_1,inspection_id,inspection_date,inspection_score,violation_description,risk_category,neighborhoods,Current Police Districts,population,avg_income
0,94133,835_20180917,September,88,Improper food storage,Low Risk,107,6,26827,"$40,990.00"
1,94108,905_20190415,April,87,High risk vermin infestation,High Risk,19,6,13716,"$31,542.00"
2,94118,1203_20170803,August,77,Moderate risk food holding temperature,Moderate Risk,5,8,38939,"$61,609.00"
3,94109,1345_20170928,September,81,Improper cooling methods,High Risk,105,4,56322,"$43,444.00"
4,94114,1352_20180620,June,74,Non service animal,Low Risk,38,3,30574,"$75,727.00"


In [3]:
# Create our features
X = risk_df.drop(columns='risk_category')

# Create our target
y = risk_df['risk_category']
y.value_counts()

Low Risk         9820
Moderate Risk    7599
High Risk        2825
Name: risk_category, dtype: int64

In [4]:
X.describe()

Unnamed: 0,business_postal_code_1,inspection_score,neighborhoods,Current Police Districts
count,20244.0,20244.0,20244.0,20244.0
mean,94114.061203,85.344892,57.263485,5.399081
std,9.350442,8.110656,35.014539,2.68182
min,94102.0,46.0,1.0,1.0
25%,94108.0,81.0,28.0,3.0
50%,94111.0,87.0,53.0,6.0
75%,94121.0,92.0,96.0,8.0
max,94134.0,100.0,117.0,10.0


In [5]:
months_num = {
   "January": 1,
   "February": 2,
   "March": 3,
   "April": 4,
   "May": 5,
   "June": 6,
   "July": 7,
   "August": 8,
   "September": 9,
   "October": 10,
   "November": 11,
   "December": 12,
}

In [6]:
X["months_num"] = X["inspection_date"].apply(lambda x: months_num[x])

In [7]:
months_num["June"]

6

In [8]:
X

Unnamed: 0,business_postal_code_1,inspection_id,inspection_date,inspection_score,violation_description,neighborhoods,Current Police Districts,population,avg_income,months_num
0,94133,835_20180917,September,88,Improper food storage,107,6,26827,"$40,990.00",9
1,94108,905_20190415,April,87,High risk vermin infestation,19,6,13716,"$31,542.00",4
2,94118,1203_20170803,August,77,Moderate risk food holding temperature,5,8,38939,"$61,609.00",8
3,94109,1345_20170928,September,81,Improper cooling methods,105,4,56322,"$43,444.00",9
4,94114,1352_20180620,June,74,Non service animal,38,3,30574,"$75,727.00",6
...,...,...,...,...,...,...,...,...,...,...
20239,94118,68998_20181026,October,94,Improper storage of equipment utensils or linens,5,8,38939,"$61,609.00",10
20240,94112,66584_20180430,April,90,Moderate risk food holding temperature,90,9,73104,"$57,629.00",4
20241,94103,67182_20170731,July,96,Inadequate warewashing facilities or equipment,53,3,23016,"$31,131.00",7
20242,94103,70090_20170105,January,93,Improper cooling methods,32,1,23016,"$31,131.00",1


In [9]:
X = X.drop(columns='inspection_date')
X.head()

Unnamed: 0,business_postal_code_1,inspection_id,inspection_score,violation_description,neighborhoods,Current Police Districts,population,avg_income,months_num
0,94133,835_20180917,88,Improper food storage,107,6,26827,"$40,990.00",9
1,94108,905_20190415,87,High risk vermin infestation,19,6,13716,"$31,542.00",4
2,94118,1203_20170803,77,Moderate risk food holding temperature,5,8,38939,"$61,609.00",8
3,94109,1345_20170928,81,Improper cooling methods,105,4,56322,"$43,444.00",9
4,94114,1352_20180620,74,Non service animal,38,3,30574,"$75,727.00",6


In [10]:
X_encoded = pd.get_dummies(X)
X_encoded

Unnamed: 0,business_postal_code_1,inspection_score,neighborhoods,Current Police Districts,months_num,inspection_id_1000_20171002,inspection_id_1000_20180523,inspection_id_1000_20190617,inspection_id_1002_20170928,inspection_id_1002_20180509,...,"avg_income_$61,609.00","avg_income_$61,776.00","avg_income_$63,983.00","avg_income_$66,627.00","avg_income_$75,727.00","avg_income_$76,044.00","avg_income_$80,959.00","avg_income_$84,710.00","avg_income_$88,976.00","avg_income_$95,313.00"
0,94133,88,107,6,9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,94108,87,19,6,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,94118,77,5,8,8,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,94109,81,105,4,9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,94114,74,38,3,6,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20239,94118,94,5,8,10,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
20240,94112,90,90,9,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20241,94103,96,53,3,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20242,94103,93,32,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
data_scaler = StandardScaler()

In [12]:
X_encoded_scaled = data_scaler.fit_transform(X_encoded)
X_encoded_scaled[:1]

array([[ 2.02549393,  0.3273685 ,  1.42048832, ..., -0.1686698 ,
        -0.18869199, -0.10262856]])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y, random_state = 1)

In [14]:
y_train.value_counts()

Low Risk         7323
Moderate Risk    5724
High Risk        2136
Name: risk_category, dtype: int64

In [22]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [24]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [25]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [26]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Actual 2"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1,Actual 2
Actual 0,687,1,1
Actual 1,0,2497,0
Actual 2,0,4,1871


In [27]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [28]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Actual 2
Actual 0,687,1,1
Actual 1,0,2497,0
Actual 2,0,4,1871


Accuracy Score : 0.998814463544754
Classification Report
               precision    recall  f1-score   support

    High Risk       1.00      1.00      1.00       689
     Low Risk       1.00      1.00      1.00      2497
Moderate Risk       1.00      1.00      1.00      1875

     accuracy                           1.00      5061
    macro avg       1.00      1.00      1.00      5061
 weighted avg       1.00      1.00      1.00      5061



In [None]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

In [None]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X_encoded.columns), reverse=True)