# Random Forest Model with Mock Data

In [2]:
# import dependencies
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Read the mock dataset and key every row by its County_FIPS_Code value.
data = 'mock_data.csv'
mock_df = pd.read_csv(data).set_index("County_FIPS_Code")

# Preview the first rows of the indexed frame.
mock_df.head()

Unnamed: 0_level_0,Hesitancy_Level,SVI_Level,CVAC_Concern_Level,Mask_Requirement_Status
County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,5,3,1
1,1,1,3,1
2,2,2,1,0
3,2,4,5,0
4,1,3,3,0


In [4]:
# Feature matrix: every column of mock_df except the Mask_Requirement_Status
# target. drop() returns a new frame, so mock_df itself is left untouched.
X = mock_df.drop(columns=["Mask_Requirement_Status"])
X.head()

Unnamed: 0_level_0,Hesitancy_Level,SVI_Level,CVAC_Concern_Level
County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,5,3
1,1,1,3
2,2,2,1
3,2,4,5
4,1,3,3


In [5]:
# Target vector: the Mask_Requirement_Status column as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases (it emits a
# FutureWarning); to_numpy() is the supported way to get the same array.
y = mock_df["Mask_Requirement_Status"].to_numpy()

# Peek at the first five labels rather than echoing the whole array.
y[:5]

array([1, 1, 0, 0, 0], dtype=int64)

In [6]:
# Partition the features and target into training and testing subsets.
# random_state is pinned so the partition can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Learn the scaling parameters from the training split only, then apply that
# same fitted scaler to both the training and the testing features.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [8]:
# Build the random forest classifier.
# n_estimators sets how many trees the forest grows (64 to 128 is the rule of
# thumb followed here). More trees generally give stronger, more stable
# predictions, at the cost of a longer training time.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [9]:
# Train the forest on the scaled training split. fit() trains the estimator
# in place and returns that same object, so rf_model needs no rebinding; the
# trailing semicolon stops the notebook from echoing the estimator.
rf_model.fit(X_train_scaled, y_train);

In [10]:
# predict a class label for every row of the scaled testing features
predictions = rf_model.predict(X_test_scaled)
# echo the predicted labels as this cell's output
predictions

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0], dtype=int64)

In [19]:
# Model Evaluation:

# The two target classes, in the order used for the confusion-matrix rows and
# columns. Passing them explicitly keeps the matrix 2x2 even if one class is
# absent from y_test or predictions, so the hard-coded labels always fit.
class_labels = [0, 1]

# score the model on accuracy (fraction of test rows predicted correctly)
acc_score = accuracy_score(y_test, predictions)

# calculate confusion matrix: rows are the actual classes, columns the
# predicted classes
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# display results
print("\n*****Mock Random Forest Model Evaluation*****")
print(f"Accuracy Score : {acc_score}")
print("---------------------------------------")
print("Confusion Matrix")
display(cm_df)
print("---------------------------------------")
print("Classification Report\n")
print(classification_report(y_test, predictions))


*****Mock Random Forest Model Evaluation*****
Accuracy Score : 0.476
---------------------------------------
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71,46
Actual 1,85,48


---------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.46      0.61      0.52       117
           1       0.51      0.36      0.42       133

    accuracy                           0.48       250
   macro avg       0.48      0.48      0.47       250
weighted avg       0.48      0.48      0.47       250



In [12]:
# for actual data: raise n_estimators to 500 to improve prediction
#rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [13]:
# fit/train the model with the new n_estimators value
#rf_model = rf_model.fit(X_train_scaled, y_train)

In [14]:
# predict using testing data
#predictions = rf_model.predict(X_test_scaled)
#predictions

In [15]:
# Model Evaluation:

# score the model on accuracy
#acc_score = accuracy_score(y_test, predictions)

# calculate confusion matrix
#cm = confusion_matrix(y_test, predictions)
#cm_df = pd.DataFrame(
#    cm, index=["Actual 0", "Actual 1"],
#    columns=["Predicted 0", "Predicted 1"]
#)

# display results
#print("\n*****Mock Random Forest Model Evaluation*****")
#print(f"Accuracy Score : {acc_score}")
#print("---------------------------------------")
#print("Confusion Matrix")
#display(cm_df)
#print("---------------------------------------")
#print("Classification Report\n")
#print(classification_report(y_test, predictions))

In [16]:
# calculate feature importance in the Random Forest model
# feature_importances_ is read from the fitted rf_model and holds one score
# per feature column of X (three values for the three feature columns)
importances = rf_model.feature_importances_
print(f"Importances : {importances}")  # array of per-feature scores; sum=1

Importances : [0.18566826 0.42666647 0.38766527]


In [17]:
# Pair each importance score with its feature name, then list the pairs from
# most to least important.
feature_scores = zip(rf_model.feature_importances_, X.columns)
sorted(feature_scores, reverse=True)

[(0.42666646987225343, 'SVI_Level'),
 (0.38766526748356916, 'CVAC_Concern_Level'),
 (0.18566826264417752, 'Hesitancy_Level')]