  ### Gradient Boosted Tree with Mock Data

In [1]:
# import dependencies
import pandas as pd
from path import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# read the mock dataset and index its rows by County_FIPS_Code
data = 'mock_data.csv'
mock_df = pd.read_csv(data).set_index("County_FIPS_Code")

# preview the first few rows
mock_df.head()

Unnamed: 0_level_0,Hesitancy_Level,SVI_Level,CVAC_Concern_Level,Mask_Requirement_Status
County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,5,3,1
1,1,1,3,1
2,2,2,1,0
3,2,4,5,0
4,1,3,3,0


In [3]:
# feature matrix: every column except the Mask_Requirement_Status target
# (drop() returns a new frame, so mock_df itself is left untouched)
X = mock_df.drop(columns=["Mask_Requirement_Status"])
X.head()

Unnamed: 0_level_0,Hesitancy_Level,SVI_Level,CVAC_Concern_Level
County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,5,3
1,1,1,3
2,2,2,1
3,2,4,5
4,1,3,3


In [4]:
# target vector: the Mask_Requirement_Status column as a NumPy array
y = mock_df.loc[:, "Mask_Requirement_Status"].to_numpy()
y[:5]

array([1, 1, 0, 0, 0], dtype=int64)

In [5]:
# hold out a test split using train_test_split's default proportions;
# random_state is fixed so the same rows land in each split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# standardize the features: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# transform train and test with the fitted scaler
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [7]:
# Sweep several learning rates and report accuracy for each one.
print("\n*****Learning Rates & Accuracies*****\n")

# candidate learning rates to compare
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # build and train a fresh classifier for this learning rate;
    # all other hyperparameters are held constant across the sweep
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=3,
        max_depth=3,
        random_state=0,
    ).fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # accuracy on the training split
    print(f"Accuracy score (training): {classifier.score(X_train_scaled, y_train):.3f}")
    # NOTE: the "validation" figure is computed on the held-out test split
    print(f"Accuracy score (validation): {classifier.score(X_test_scaled, y_test):.3f}")
    print()


*****Learning Rates & Accuracies*****

Learning rate:  0.05
Accuracy score (training): 0.561
Accuracy score (validation): 0.496

Learning rate:  0.1
Accuracy score (training): 0.568
Accuracy score (validation): 0.508

Learning rate:  0.25
Accuracy score (training): 0.587
Accuracy score (validation): 0.504

Learning rate:  0.5
Accuracy score (training): 0.600
Accuracy score (validation): 0.492

Learning rate:  0.75
Accuracy score (training): 0.608
Accuracy score (validation): 0.460

Learning rate:  1
Accuracy score (training): 0.605
Accuracy score (validation): 0.472



In [8]:
# build the final model with the chosen learning rate (0.1) and train it
# on the scaled training split; fit() returns the fitted estimator
classifier = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.1,
    max_features=3,
    max_depth=3,
    random_state=0,
).fit(X_train_scaled, y_train)

# predict on the scaled test split and show predictions next to the true labels
predictions = classifier.predict(X_test_scaled)
pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,
    }
).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,0,1
4,1,1
5,0,0
6,0,0
7,0,1
8,0,1
9,0,1


In [9]:
# Model Evaluation:
# Reports accuracy, a labelled confusion matrix, and the per-class
# classification report for the test-split predictions.

# score the model on accuracy
acc_score = accuracy_score(y_test, predictions)

# calculate confusion matrix
# The label order is pinned to [0, 1] so the matrix is always 2x2 and lines up
# with the hard-coded row/column captions below. Without it, a class missing
# from both y_test and predictions would shrink the matrix and the DataFrame
# construction would fail on the shape mismatch.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# display results
print("\n*****Gradient Boosting Model Evaluation*****\n")
print(f"Accuracy Score : {acc_score}")
print("---------------------------------------")
print("Confusion Matrix")
# display() is supplied by the IPython/Jupyter runtime (it is not imported here)
display(cm_df)
print("---------------------------------------")
print("Classification Report\n")
print(classification_report(y_test, predictions))


*****Gradient Boosting Model Evaluation*****

Accuracy Score : 0.508
---------------------------------------
Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68,58
Actual 1,65,59


---------------------------------------
Classification Report

              precision    recall  f1-score   support

           0       0.51      0.54      0.53       126
           1       0.50      0.48      0.49       124

    accuracy                           0.51       250
   macro avg       0.51      0.51      0.51       250
weighted avg       0.51      0.51      0.51       250

