# Resampling Techniques

In [1]:
import pandas as pd
pd.set_option("display.max_columns", 28)
import warnings
warnings.filterwarnings('ignore')

# Dependencies for interaction with database:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Machine Learning dependencies:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import numpy as np

In [2]:
# Create engine and link to AWS server database:
engine = create_engine('postgresql://postgres:Common123@database-1.cukfyvhxl6ur.us-west-1.rds.amazonaws.com:5432/postgres')
connect = engine.connect()

In [3]:
# Create session:
session = Session(engine)

In [4]:
# Import clean_dataset_2016 table:
fatal_ec_w_demog_df = pd.read_sql("SELECT * FROM fatal_ec_w_demog", connect)

In [5]:
fatal_ec_w_demog_df.head()

Unnamed: 0,unique_id,full_name,age,gender,race,race_w_imputation,imputation_probability,date_of_injury,location_of_injury,location_of_death_city,location_of_death_state,location_of_death_zip,location_of_death_county,latitude,longitude,agency_responsible_for_death,cause_of_death,disposition_exclusions,intentional_use_of_force,symptoms_of_mental_illness,year_of_injury,white,black,hispanic,asian,indian,hawaii,other
0,1,LaTanya Janelle McCoy,24.0,Female,African-American/Black,African-American/Black,,2000-01-02,5700 block Mack Road,Sacramento,CA,95823.0,Sacramento,38.473949,-121.433776,Sacramento Police Department,Vehicle,Unknown,Pursuit,No,2000,0.364,0.053,0.395,0.147,0.004,0.004,0.033
1,2,Lester Miller,53.0,Male,Race unspecified,African-American/Black,0.947676,2000-01-02,4850 Flakes Mill Road,Ellenwood,GA,30294.0,DeKalb,33.645164,-84.229413,DeKalb County Sheriff's Office,Gunshot,Criminal,Intentional use of deadly force,No,2000,0.52,0.313,0.098,0.041,0.002,0.001,0.025
2,3,Derrick E. Tate,23.0,Male,Race unspecified,European-American/White,0.941666,2000-01-05,1900 block W Reynolds St,Pontiac,IL,61764.0,Livingston,40.873687,-88.642806,Bloomington Police Department,Gunshot,Justified,Intentional use of deadly force,No,2000,0.609,0.136,0.176,0.056,0.001,0.0,0.022
3,4,John Edward Pittman,45.0,Male,African-American/Black,African-American/Black,,2000-01-05,,Dothan,AL,,Houston,31.223231,-85.390489,Dothan Police Department,Gunshot,Justified,Intentional use of deadly force,Unknown,2000,0.654,0.265,0.044,0.014,0.004,0.0,0.019
4,5,John Frank Brown,20.0,Male,African-American/Black,African-American/Black,,2000-01-05,56 Jesse Hill Jr Dr,Atlanta,GA,30303.0,Fulton,33.752703,-84.381198,Atlanta Police Department,Beaten/Bludgeoned with instrument,Accidental,Yes,Drug or alcohol use,2000,0.52,0.313,0.098,0.041,0.002,0.001,0.025


In [6]:
df = pd.read_sql("SELECT * FROM shootings_wp_w_demog", connect)

In [7]:
df.head()

Unnamed: 0,unique_id,full_name,date_of_death,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,white,black,hispanic,asian,indian,hawaii,other
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False,0.675,0.038,0.13,0.089,0.011,0.006,0.05
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False,0.75,0.018,0.134,0.047,0.009,0.003,0.04
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False,0.757,0.055,0.122,0.028,0.006,0.0,0.032
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False,0.364,0.053,0.395,0.147,0.004,0.004,0.033
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False,0.678,0.038,0.218,0.032,0.005,0.001,0.028


In [8]:
# Get dataset length
len(df)

5552

In [9]:
# Drop empty cells
df = df.dropna()

In [10]:
# Get unique values for 'armed'
df.armed.unique()

array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'vehicle',
       'shovel', 'hatchet', 'machete', 'box cutter', 'undetermined',
       'sword', 'hammer', 'metal object', 'screwdriver',
       'lawn mower blade', 'flagpole', 'guns and explosives',
       'cordless drill', 'metal pole', 'Taser', 'metal hand tool',
       'metal pipe', 'blunt object', 'metal stick', 'sharp object',
       'meat cleaver', 'carjack', 'chain', "contractor's level",
       'unknown weapon', 'stapler', 'crossbow', 'bean-bag gun',
       'baseball bat and fireplace poker', 'straight edge razor',
       'gun and knife', 'ax', 'brick', 'baseball bat', 'hand torch',
       'chain saw', 'garden tool', 'scissors', 'pole', 'pick-axe',
       'flashlight', 'spear', 'chair', 'pitchfork', 'hatchet and gun',
       'rock', 'piece of wood', 'bayonet', 'glass shard', 'pepper spray',
       'metal rake', 'baton', 'crowbar', 'oar', 'machete and gun',
       'air conditioner', 'pole and knife', 'beer bottle', 'pip

In [11]:
# Replace 'armed' values to be used in our model
df['armed'] = df['armed'].replace(['gun','nail gun', 'knife', 'vehicle', 'shovel', 'hatchet', 'machete', 'sword', 'hammer', 'metal object', 'screwdriver',
       'lawn mower blade', 'flagpole', 'guns and explosives',
       'cordless drill', 'crossbow', 'metal pole', 'Taser',
       'metal hand tool', 'metal pipe', 'blunt object', 'metal stick',
       'sharp object', 'meat cleaver', 'carjack', 'chain',
       "contractor's level", 'unknown weapon', 'beer bottle',
       'baseball bat and fireplace poker',
       'straight edge razor', 'gun and knife', 'ax', 'brick',
       'baseball bat', 'hand torch', 'chain saw', 'garden tool',
       'scissors', 'pole', 'pick-axe', 'baton', 'spear',
       'chair', 'pitchfork', 'hatchet and gun', 'rock', 'piece of wood',
       'bayonet', 'pipe', 'glass shard', 'motorcycle', 'pepper spray',
       'metal rake', 'crowbar', 'oar', 'machete and gun', 'tire iron',
       'pole and knife', 'baseball bat and bottle',
       'fireworks', 'chainsaw', 'gun and sword', 'gun and car',
       'pellet gun', 'claimed to be armed', 'BB gun', 'incendiary device',
       'bow and arrow', 'gun and vehicle', 'vehicle and gun',
       'samurai sword', 'walking stick', 'wrench', 'barstool', 'grenade',
       'BB gun and vehicle', 'Airsoft pistol', 'wasp spray', 'air pistol',
       'baseball bat and knife', 'vehicle and machete', 'ice pick',
       'car, knife and mace', 'bottle', 'box cutter'],'1')

In [12]:
# Get unique values for 'armed'
df.armed.unique()

array(['1', 'unarmed', 'toy weapon', 'undetermined', 'stapler',
       'bean-bag gun', 'flashlight', 'air conditioner', 'pen'],
      dtype=object)

In [13]:
df['armed'] = df['armed'].replace(['unarmed', 'toy weapon', 'undetermined', 'stapler',
       'bean-bag gun', 'flashlight', 'air conditioner', 'pen'], '0')

In [14]:
# Get unique values for 'armed'
df.armed.unique()

array(['1', '0'], dtype=object)

In [15]:
# Get unique values for 'gender'
df.gender.unique()

array(['M', 'F'], dtype=object)

In [16]:
# Replace 'gender' values to be used in our model
df['gender'] = df['gender'].replace(['M'], '1')

In [17]:
# Replace 'gender' values to be used in our model
df['gender'] = df['gender'].replace(['F'], '0')

In [18]:
# Get unique values for 'gender'
df.gender.unique()

array(['1', '0'], dtype=object)

In [19]:
# Get unique values for 'race'
df.race.unique()

array(['A', 'W', 'H', 'B', 'O', 'N'], dtype=object)

In [20]:
# Replace 'race' values to be used in our model
df['race'] = df['race'].replace(['A'], '1')
df['race'] = df['race'].replace(['W', 'H', 'B', 'O', 'N'], '0')

In [21]:
# Get unique values for 'race'
df.race.unique()

array(['1', '0'], dtype=object)

In [22]:
# Get unique values for 'signs of mental illness'
df.signs_of_mental_illness.unique()

array([ True, False])

In [23]:
# Replace 'signs of mental illness' values to be used in our model
df['signs_of_mental_illness'] = df['signs_of_mental_illness'].replace([ True], '1')
df['signs_of_mental_illness'] = df['signs_of_mental_illness'].replace([ False], '0')

In [24]:
# Get unique values for 'signs of mental illness'
df.signs_of_mental_illness.unique()

array(['1', '0'], dtype=object)

In [25]:
# Get unique values for 'threat level'
df.threat_level.unique()

array(['attack', 'other', 'undetermined'], dtype=object)

In [26]:
# Replace 'threat level' values to be used in our model

df['threat_level'] = df['threat_level'].replace(['attack', 'other'], '1')
df['threat_level'] = df['threat_level'].replace(['undetermined'], '0')

In [27]:
# Get unique values for 'threat level'
df.threat_level.unique()

array(['1', '0'], dtype=object)

In [28]:
# Get unique values for 'flee'
df.flee.unique()

array(['Not fleeing', 'Car', 'Foot', 'Other'], dtype=object)

In [29]:
# Replace 'flee' values to be used in our model

df['flee'] = df['flee'].replace(['Car', 'Foot', 'Other'], '1')
df['flee'] = df['flee'].replace(['Not fleeing'], '0')

In [30]:
# Get unique values for 'flee'
df.flee.unique()

array(['0', '1'], dtype=object)

In [31]:
# Get unique values for 'body camera'
df.body_camera.unique()

array([False,  True])

In [32]:
# Replace 'body camera' values to be used in our model

df['body_camera'] = df['body_camera'].replace([True], '0')
df['body_camera'] = df['body_camera'].replace([False], '1')

In [33]:
# Get unique values for 'body camera'
df.body_camera.unique()

array(['1', '0'], dtype=object)

# Build our Logistic Regression Model

# Split the Data into Training and Testing

In [34]:
df.head(10)

Unnamed: 0,unique_id,full_name,date_of_death,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,white,black,hispanic,asian,indian,hawaii,other
0,3,Tim Elliot,2015-01-02,shot,1,53.0,1,1,Shelton,WA,1,1,0,1,0.675,0.038,0.13,0.089,0.011,0.006,0.05
1,4,Lewis Lee Lembke,2015-01-02,shot,1,47.0,1,0,Aloha,OR,0,1,0,1,0.75,0.018,0.134,0.047,0.009,0.003,0.04
2,5,John Paul Quintero,2015-01-03,shot and Tasered,0,23.0,1,0,Wichita,KS,0,1,0,1,0.757,0.055,0.122,0.028,0.006,0.0,0.032
3,8,Matthew Hoffman,2015-01-04,shot,0,32.0,1,0,San Francisco,CA,1,1,0,1,0.364,0.053,0.395,0.147,0.004,0.004,0.033
4,9,Michael Rodriguez,2015-01-04,shot,1,39.0,1,0,Evans,CO,0,1,0,1,0.678,0.038,0.218,0.032,0.005,0.001,0.028
5,11,Kenneth Joe Brown,2015-01-04,shot,1,18.0,1,0,Guthrie,OK,0,1,0,1,0.65,0.069,0.111,0.021,0.077,0.001,0.071
6,13,Kenneth Arnold Buck,2015-01-05,shot,1,22.0,1,0,Chandler,AZ,0,1,1,1,0.542,0.043,0.318,0.033,0.039,0.002,0.024
7,15,Brock Nichols,2015-01-06,shot,1,35.0,1,0,Assaria,KS,0,1,0,1,0.757,0.055,0.122,0.028,0.006,0.0,0.032
8,16,Autumn Steele,2015-01-06,shot,0,34.0,0,0,Burlington,IA,0,1,0,0,0.853,0.039,0.063,0.023,0.003,0.0,0.019
9,17,Leslie Sapp III,2015-01-06,shot,0,47.0,1,0,Knoxville,PA,0,1,0,1,0.76,0.104,0.078,0.035,0.001,0.0,0.023


In [35]:
# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [36]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [37]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [38]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [39]:
y_pred = classifier.predict(X_test)

In [40]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [41]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [42]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

View the count of the target classes using Counter from the collections library.
Use the resampled data to train a logistic regression model.
Calculate the balanced accuracy score from sklearn.metrics.
Print the confusion matrix from sklearn.metrics.
Generate a classication report using the imbalanced_classification_report from imbalanced-learn.
Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

## Naive Random Oversampling

In [43]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [44]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [45]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6130463144161774

In [46]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[404, 691],
       [  3,  18]], dtype=int64)

In [47]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.37      0.86      0.54      0.56      0.30      1095
          1       0.03      0.86      0.37      0.05      0.56      0.33        21

avg / total       0.97      0.38      0.85      0.53      0.56      0.30      1116



### SMOTE Oversampling

In [48]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [49]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [50]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5969993476842792

In [51]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[421, 674],
       [  4,  17]], dtype=int64)

In [52]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.38      0.81      0.55      0.56      0.30      1095
          1       0.02      0.81      0.38      0.05      0.56      0.32        21

avg / total       0.97      0.39      0.80      0.54      0.56      0.30      1116



### Balanced Random Forest Classifier

In [53]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [54]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [55]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1091,4
Actual Other,21,0


In [56]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1091,4
Actual Other,21,0


Accuracy Score : 0.49817351598173515
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [57]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.8036702695280683, 'age'),
 (0.05353136661709612, 'signs_of_mental_illness'),
 (0.05326233515621224, 'body_camera'),
 (0.03797912245261145, 'armed'),
 (0.03377889213045265, 'flee'),
 (0.011646908242777679, 'gender'),
 (0.006131105872781384, 'threat_level')]

### Easy Ensemble AdaBoost Classifier

In [58]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [59]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [60]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,534,561
Actual Other,5,16


In [61]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,534,561
Actual Other,5,16


Accuracy Score : 0.6247879973907371
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.49      0.76      0.65      0.61      0.36      1095
          1       0.03      0.76      0.49      0.05      0.61      0.38        21

avg / total       0.97      0.49      0.76      0.64      0.61      0.36      1116



# Additional Tests on Individual Variables

In [62]:
df.head()

Unnamed: 0,unique_id,full_name,date_of_death,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,white,black,hispanic,asian,indian,hawaii,other
0,3,Tim Elliot,2015-01-02,shot,1,53.0,1,1,Shelton,WA,1,1,0,1,0.675,0.038,0.13,0.089,0.011,0.006,0.05
1,4,Lewis Lee Lembke,2015-01-02,shot,1,47.0,1,0,Aloha,OR,0,1,0,1,0.75,0.018,0.134,0.047,0.009,0.003,0.04
2,5,John Paul Quintero,2015-01-03,shot and Tasered,0,23.0,1,0,Wichita,KS,0,1,0,1,0.757,0.055,0.122,0.028,0.006,0.0,0.032
3,8,Matthew Hoffman,2015-01-04,shot,0,32.0,1,0,San Francisco,CA,1,1,0,1,0.364,0.053,0.395,0.147,0.004,0.004,0.033
4,9,Michael Rodriguez,2015-01-04,shot,1,39.0,1,0,Evans,CO,0,1,0,1,0.678,0.038,0.218,0.032,0.005,0.001,0.028


# Test for if individual was armed

In [63]:
# Test for if individual was armed

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "age", "flee", "gender", "signs_of_mental_illness", "threat_level", "body_camera", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [64]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [65]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [66]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [67]:
y_pred = classifier.predict(X_test)

In [68]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [69]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [70]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [71]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [72]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [73]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.4998043052837573

In [74]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[156, 939],
       [  3,  18]], dtype=int64)

In [75]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.14      0.86      0.25      0.35      0.11      1095
          1       0.02      0.86      0.14      0.04      0.35      0.13        21

avg / total       0.96      0.16      0.84      0.24      0.35      0.11      1116



### SMOTE Oversampling

In [76]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [77]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [78]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.4998043052837573

In [79]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[156, 939],
       [  3,  18]], dtype=int64)

In [80]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.14      0.86      0.25      0.35      0.11      1095
          1       0.02      0.86      0.14      0.04      0.35      0.13        21

avg / total       0.96      0.16      0.84      0.24      0.35      0.11      1116



### Balanced Random Forest Classifier

In [81]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [82]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [83]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [84]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [85]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'armed')]

### Easy Ensemble AdaBoost Classifier

In [86]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [87]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [88]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,156,939
Actual Other,3,18


In [89]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,156,939
Actual Other,3,18


Accuracy Score : 0.4998043052837573
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.14      0.86      0.25      0.35      0.11      1095
          1       0.02      0.86      0.14      0.04      0.35      0.13        21

avg / total       0.96      0.16      0.84      0.24      0.35      0.11      1116



# Test for if Individual Was Fleeing

In [90]:
# Test for if individual was fleeing

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "age", "armed", "gender", "signs_of_mental_illness", "threat_level", "body_camera", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [91]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [92]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [93]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [94]:
y_pred = classifier.predict(X_test)

In [95]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [96]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [97]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [98]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [99]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [100]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5993476842791912

In [101]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[374, 721],
       [  3,  18]], dtype=int64)

In [102]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.34      0.86      0.51      0.54      0.28      1095
          1       0.02      0.86      0.34      0.05      0.54      0.31        21

avg / total       0.97      0.35      0.85      0.50      0.54      0.28      1116



### SMOTE Oversampling

In [103]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [104]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [105]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5993476842791912

In [106]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[374, 721],
       [  3,  18]], dtype=int64)

In [107]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.34      0.86      0.51      0.54      0.28      1095
          1       0.02      0.86      0.34      0.05      0.54      0.31        21

avg / total       0.97      0.35      0.85      0.50      0.54      0.28      1116



### Balanced Random Forest Classifier

In [108]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [109]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [110]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [111]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [112]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'flee')]

### Easy Ensemble AdaBoost Classifier¶

In [113]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [114]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [115]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,374,721
Actual Other,3,18


In [116]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,374,721
Actual Other,3,18


Accuracy Score : 0.5993476842791912
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.34      0.86      0.51      0.54      0.28      1095
          1       0.02      0.86      0.34      0.05      0.54      0.31        21

avg / total       0.97      0.35      0.85      0.50      0.54      0.28      1116



# Test for Individual's Gender

In [117]:
# Test for if individual's Gender

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "age", "flee", "armed", "signs_of_mental_illness", "threat_level", "body_camera", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [118]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [119]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [120]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [121]:
y_pred = classifier.predict(X_test)

In [122]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [123]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [124]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [125]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [126]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [127]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.4991519895629485

In [128]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1041,   54],
       [  20,    1]], dtype=int64)

In [129]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.95      0.05      0.97      0.21      0.05      1095
          1       0.02      0.05      0.95      0.03      0.21      0.04        21

avg / total       0.96      0.93      0.06      0.95      0.21      0.05      1116



### SMOTE Oversampling

In [130]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [131]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [132]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5008480104370515

In [133]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  54, 1041],
       [   1,   20]], dtype=int64)

In [134]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.05      0.95      0.09      0.22      0.04      1095
          1       0.02      0.95      0.05      0.04      0.22      0.05        21

avg / total       0.96      0.07      0.94      0.09      0.22      0.04      1116



### Balanced Random Forest Classifier

In [135]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [136]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [137]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [138]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [139]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'gender')]

### Easy Ensemble AdaBoost Classifier¶

In [140]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [141]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [142]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,0,1095
Actual Other,0,21


In [143]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,0,1095
Actual Other,0,21


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00      1095
          1       0.02      1.00      0.00      0.04      0.00      0.00        21

avg / total       0.00      0.02      0.98      0.00      0.00      0.00      1116



# Test for if Police Were Wearing Body Cameras

In [144]:
# Test for if Police were wearing body cameras

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "age", "flee", "armed", "signs_of_mental_illness", "threat_level", "gender", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [145]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [146]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [147]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [148]:
y_pred = classifier.predict(X_test)

In [149]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [150]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [151]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [152]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [153]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [154]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5102413568166992

In [155]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[961, 134],
       [ 18,   3]], dtype=int64)

In [156]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.88      0.14      0.93      0.35      0.13      1095
          1       0.02      0.14      0.88      0.04      0.35      0.12        21

avg / total       0.96      0.86      0.16      0.91      0.35      0.13      1116



### SMOTE Oversampling

In [157]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [158]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [159]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5102413568166992

In [160]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[961, 134],
       [ 18,   3]], dtype=int64)

In [161]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.88      0.14      0.93      0.35      0.13      1095
          1       0.02      0.14      0.88      0.04      0.35      0.12        21

avg / total       0.96      0.86      0.16      0.91      0.35      0.13      1116



### Balanced Random Forest Classifier

In [162]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [163]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [164]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [165]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [166]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'body_camera')]

### Easy Ensemble AdaBoost Classifier¶

In [167]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [168]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [169]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,961,134
Actual Other,18,3


In [170]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,961,134
Actual Other,18,3


Accuracy Score : 0.5102413568166992
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.88      0.14      0.93      0.35      0.13      1095
          1       0.02      0.14      0.88      0.04      0.35      0.12        21

avg / total       0.96      0.86      0.16      0.91      0.35      0.13      1116



# Test for Signs of Mental Illness

In [171]:
# Test for signs of mental illness

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "age", "flee", "armed", "gender", "threat_level", "body_camera", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [172]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [173]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [174]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [175]:
y_pred = classifier.predict(X_test)

In [176]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [177]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [178]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [179]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [180]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [181]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5525114155251142

In [182]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[845, 250],
       [ 14,   7]], dtype=int64)

In [183]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.77      0.33      0.86      0.51      0.27      1095
          1       0.03      0.33      0.77      0.05      0.51      0.25        21

avg / total       0.97      0.76      0.34      0.85      0.51      0.27      1116



### SMOTE Oversampling

In [184]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [185]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [186]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5525114155251142

In [187]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[845, 250],
       [ 14,   7]], dtype=int64)

In [188]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.77      0.33      0.86      0.51      0.27      1095
          1       0.03      0.33      0.77      0.05      0.51      0.25        21

avg / total       0.97      0.76      0.34      0.85      0.51      0.27      1116



### Balanced Random Forest Classifier

In [189]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [190]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [191]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [192]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [193]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'signs_of_mental_illness')]

### Easy Ensemble AdaBoost Classifier¶

In [194]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [195]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [196]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,845,250
Actual Other,14,7


In [197]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,845,250
Actual Other,14,7


Accuracy Score : 0.5525114155251142
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.77      0.33      0.86      0.51      0.27      1095
          1       0.03      0.33      0.77      0.05      0.51      0.25        21

avg / total       0.97      0.76      0.34      0.85      0.51      0.27      1116



# Test for Threat Level

In [198]:
# Test for threat level

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "age", "flee", "armed", "signs_of_mental_illness", "gender", "body_camera", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [199]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [200]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [201]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [202]:
y_pred = classifier.predict(X_test)

In [203]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [204]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [205]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [206]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [207]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [208]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5196347031963471

In [209]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  43, 1052],
       [   0,   21]], dtype=int64)

In [210]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.04      1.00      0.08      0.20      0.04      1095
          1       0.02      1.00      0.04      0.04      0.20      0.04        21

avg / total       0.98      0.06      0.98      0.07      0.20      0.04      1116



### SMOTE Oversampling

In [211]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [212]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [213]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5196347031963471

In [214]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  43, 1052],
       [   0,   21]], dtype=int64)

In [215]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.04      1.00      0.08      0.20      0.04      1095
          1       0.02      1.00      0.04      0.04      0.20      0.04        21

avg / total       0.98      0.06      0.98      0.07      0.20      0.04      1116



### Balanced Random Forest Classifier

In [216]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [217]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [218]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [219]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [220]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'threat_level')]

### Easy Ensemble AdaBoost Classifier¶

In [221]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [222]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [223]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,43,1052
Actual Other,0,21


In [224]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,43,1052
Actual Other,0,21


Accuracy Score : 0.5196347031963471
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.04      1.00      0.08      0.20      0.04      1095
          1       0.02      1.00      0.04      0.04      0.20      0.04        21

avg / total       0.98      0.06      0.98      0.07      0.20      0.04      1116



# Test for Age

In [225]:
# Test for Age

# Create our features
X = df.drop(columns=["race", "unique_id", "full_name", "gender", "flee", "armed", "signs_of_mental_illness", "threat_level", "body_camera", "date_of_death", "manner_of_death", "city", "state", "white", "black", "hispanic", "asian", "indian", "hawaii", "other"])

# Create our target
y = df["race"]

In [226]:
# Split into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y, random_state=1, stratify=y)

In [227]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=300,
   random_state=1)

In [228]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=300, random_state=1)

In [229]:
y_pred = classifier.predict(X_test)

In [230]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9811827956989247


In [231]:
# Check accuracy

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9811827956989247

In [232]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1095
           1       0.00      0.00      0.00        21

    accuracy                           0.98      1116
   macro avg       0.49      0.50      0.50      1116
weighted avg       0.96      0.98      0.97      1116



In [233]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [234]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [235]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.49804305283757333

In [236]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[465, 630],
       [  9,  12]], dtype=int64)

In [237]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.42      0.57      0.59      0.49      0.24      1095
          1       0.02      0.57      0.42      0.04      0.49      0.25        21

avg / total       0.96      0.43      0.57      0.58      0.49      0.24      1116



### SMOTE Oversampling

In [238]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter(y_resampled)

Counter({'0': 3283, '1': 3283})

In [239]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [240]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.5019569471624266

In [241]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[630, 465],
       [ 12,   9]], dtype=int64)

In [242]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.58      0.43      0.73      0.50      0.25      1095
          1       0.02      0.43      0.58      0.04      0.50      0.24        21

avg / total       0.96      0.57      0.43      0.71      0.50      0.25      1116



### Balanced Random Forest Classifier

In [243]:
# Resample the training data with the BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=28)
rf_model = rf_model.fit(X_train, y_train)

In [244]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [245]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


In [246]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Random Forest Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Random Forest Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,1095,0
Actual Other,21,0


Accuracy Score : 0.5
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      1.00      0.00      0.99      0.00      0.00      1095
          1       0.00      0.00      1.00      0.00      0.00      0.00        21

avg / total       0.96      0.98      0.02      0.97      0.00      0.00      1116



In [247]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(1.0, 'age')]

### Easy Ensemble AdaBoost Classifier¶

In [248]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=28)
ee_model = ee_model.fit(X_train, y_train)

In [249]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)

In [250]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual African American", "Actual Other"], columns=["Predicted African American", "Predicted Other"])

cm_df

Unnamed: 0,Predicted African American,Predicted Other
Actual African American,460,635
Actual Other,7,14


In [251]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

# Displaying results
print("Easy Ensemble Analysis")
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble Analysis
Confusion Matrix


Unnamed: 0,Predicted African American,Predicted Other
Actual African American,460,635
Actual Other,7,14


Accuracy Score : 0.54337899543379
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.42      0.67      0.59      0.53      0.27      1095
          1       0.02      0.67      0.42      0.04      0.53      0.29        21

avg / total       0.97      0.42      0.66      0.58      0.53      0.27      1116

