## Import Libraries

In [2]:
import boto3
import sagemaker
import time
import io
from sklearn.neighbors import KNeighborsClassifier
from time import gmtime, strftime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
!pip install xgboost
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, plot_confusion_matrix, accuracy_score
import pandas as pd
import numpy as np
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect
from tqdm import tqdm  

[0m

## Setup SageMaker Boto3 Connection

In [3]:
# Region configured for the default boto3 session
session = boto3.session.Session()
region = boto3.Session().region_name

# Low-level service clients for EC2 and SageMaker, both bound to that region
ec2 = boto3.Session().client("ec2", region_name=region)
sm = boto3.Session().client("sagemaker", region_name=region)

In [4]:
from botocore.config import Config

# SageMaker session plus the execution role, default bucket and region it runs with
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

# IAM client configured for up to 10 attempts using botocore's "adaptive" retry mode
config = Config(retries=dict(max_attempts=10, mode="adaptive"))
iam = boto3.client("iam", config=config)

In [5]:
# The role name is whatever follows the final "/" in the execution role string
role_name = role.rsplit("/", 1)[-1]

print("Role name: {}".format(role_name))

Role name: LabRole


In [6]:
# Re-create the SageMaker session, execution role and region, then display the bucket in use
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket = '{}'.format(bucket)
bucket

'sagemaker-us-east-1-458903497716'

In [7]:
# Name of the Athena database used throughout this notebook
database_name = "ads508"

In [8]:
# Temporary S3 location where Athena writes its query results
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [9]:
# PyAthena connection that stages query results under s3_staging_dir
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

In [10]:
# Create the project database in Athena unless it is already there
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
create_db = pd.read_sql(sql=statement, con=conn)
create_db

In [11]:
# List the Athena databases to confirm that the new one was created
q = "SHOW DATABASES"
db_show = pd.read_sql(sql=q, con=conn)
db_show

Unnamed: 0,database_name
0,ads508
1,default
2,dsoaws
3,sagemaker_featurestore


In [12]:
# Set Athena parameters
database_name = "ads508"
model_table_name_csv = "modeling"

# Location of the modeling dataset.
# NOTE: despite the variable name this is a public GitHub raw URL, not an S3 path.
# Earlier S3 assignments were immediately overwritten (and one hard-coded an
# account-specific bucket), so only the value that is actually used is kept.
model_s3_path = "https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/data_for_modeling.csv"
print(model_s3_path)

https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/data_for_modeling.csv


In [13]:
# Load the modeling dataset and discard the leftover 'Unnamed: 0' column from the CSV
df = pd.read_csv(model_s3_path).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,latitude,longitude,n_guns_involved,target_class,group_Democrat,suspect_age,ohe_drug,ohe_officer,ohe_gang,ohe_accident,...,suspect_age_group_Adult,suspect_age_group_Senior,region_East South Central,region_Middle Atlantic,region_Mountain,region_New England,region_Pacific,region_South Atlantic,region_West North Central,region_West South Central
0,40.3467,-79.8559,1.0,1,1,Adult 18+,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,40.4555,-79.897,1.0,1,1,,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,33.909,-118.333,1.0,1,1,,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
3,33.8447,-118.307,1.0,1,1,Adult 18+,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,33.9454,-118.399,1.0,1,1,,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Remove the suspect_age column, then discard every row that still contains a NaN
df = df.drop(columns=['suspect_age']).dropna()

In [15]:
# Frame used by the neural-network / KNN sections: rows with any NaN are removed
df_knn = df.dropna(how='any')

### Splitting the data into train, validation, and test sets first, so that the class balancing applied next affects only the training set and leaves the validation and test sets untouched.

In [16]:
from sklearn.model_selection import train_test_split

# Splitting all data into 90% train and 10% holdout, stratified on the target.
# random_state pins the shuffle so the split (and the CSV files written from it)
# is identical on every Restart & Run All.
df_train, df_holdout = train_test_split(
    df,
    test_size=0.10,
    stratify=df['target_class'],
    random_state=42,
)

# Splitting holdout data into 50% validation and 50% test (same fixed seed)
df_validation, df_test = train_test_split(
    df_holdout,
    test_size=0.50,
    stratify=df_holdout['target_class'],
    random_state=42,
)

### Creating files for each subsection of the data: train, test, and validation

In [17]:
import os

# specifying the output file path
df_train_output = "../generated_data/df_train.csv"
df_test_output = "../generated_data/df_test.csv"
df_validation_output = "../generated_data/df_validation.csv"

# DataFrame.to_csv does not create missing directories, so make sure the
# destination folder exists before writing (no-op when it is already there)
os.makedirs(os.path.dirname(df_train_output), exist_ok=True)

# saving each split to its CSV file without the index column
df_train.to_csv(df_train_output, index=False)
df_test.to_csv(df_test_output, index=False)
df_validation.to_csv(df_validation_output, index=False)


### Displaying the initial count of the target class variable in the training dataset.
We find that the class of 1 - which indicates someone was either injured or killed - is the majority class.

In [18]:
# Tally how many training rows fall into each value of the 'target_class' column
value_counts = df_train['target_class'].value_counts()

# Show the class distribution
print(value_counts)

1    118400
0     79909
Name: target_class, dtype: int64


### Undersampling the majority target_class of '1' 

In [19]:
# Undersample: draw the same number of rows from every target_class group,
# namely the size of the smallest group.
df_grouped_by = df_train.groupby(["target_class"])

# Size of the smallest class, computed once instead of once per group
minority_class_size = df_grouped_by.size().min()

df_balanced = df_grouped_by.apply(
    # Fixed random_state keeps the sampled rows identical across re-runs
    lambda x: x.sample(minority_class_size, random_state=42)
    .reset_index(drop=True)
)

In [20]:
# Class distribution of the undersampled training frame
value_counts_balanced = df_balanced['target_class'].value_counts()

# Show the balanced counts
print(value_counts_balanced)

0    79909
1    79909
Name: target_class, dtype: int64


# Baseline Model 

In [38]:
# Class distribution of the test split, used as the reference for the baselines below
df_test['target_class'].value_counts()

1    6578
0    4440
Name: target_class, dtype: int64

In [46]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Two constant-prediction baselines evaluated against the test labels
n_test_rows = len(df_test)
actual_labels = df_test['target_class']

## All Negative Model -- predict 0 for every instance
df_test_negative = pd.DataFrame({'target_class': [0] * n_test_rows})
accuracy_neg = accuracy_score(actual_labels, df_test_negative['target_class'])

## All Positive Model -- predict 1 for every instance
df_test_positive = pd.DataFrame({'target_class': [1] * n_test_rows})
accuracy_pos = accuracy_score(actual_labels, df_test_positive['target_class'])

# Report both accuracies as percentages
print("Accuracy of All Negative Model: {:.2f}%".format(accuracy_neg*100))
print("Accuracy of All Positive Model: {:.2f}%".format(accuracy_pos*100))

Accuracy of All Negative Model: 40.30%
Accuracy of All Positive Model: 59.70%


# Neural Network

In [21]:
# Splitting all data into 90% train and 10% holdout
df_knn_train, df_knn_holdout = train_test_split(
        df_knn,
        test_size=0.10,
    stratify=df_knn['target_class'])

# Splitting holdout data into 50% validation and 50% test
df_knn_validation, df_knn_test = train_test_split(
        df_knn_holdout,
        test_size=0.50,
        stratify=df_knn_holdout['target_class'])

df_knn_grouped_by = df_knn_train.groupby(["target_class"])
df_knn_balanced = df_knn_grouped_by.apply(
    lambda x: x.sample(df_knn_grouped_by.size().min())\
    .reset_index(drop=True)
)

In [None]:
# Transfer Training Data to S3 bucket - exclude header and index
s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
BUCKET='sagemaker-us-east-1-346023323361'
KEY='knn/train/train.csv'
#response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

df_knn_y_train = df_knn_train['target_class']
df_knn_features = df_knn_train.drop(columns=['target_class'])
# df_knn_features = df_knn_features.drop(columns=['suspect_age'])

# rearrange before uploading for AWS training job format
df_knn_train = df_knn_train[['target_class', 'latitude', 'longitude', 'n_guns_involved', 'group_Democrat', 'ohe_drug', 'ohe_officer', 'ohe_gang',
       'ohe_accident', 'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs', 'ohe_car_jacking',
       'ohe_defensive', 'ohe_robbery', 'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic', 'suspect_age_group_Teen', 'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior', 'region_East South Central', 'region_Middle Atlantic', 'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central', 'region_West South Central']]

#df_y_test = df_test['target_class']
#df_test = df_test.drop(columns=['target_class'])
df_knn_test = df_knn_test.drop(columns=['suspect_age'])
df_knn_y_test = df_knn_test['target_class']
df_knn_test = df_knn_test.drop(columns=['target_class'])

with io.StringIO() as csv_buffer:
    df_knn_train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

In [None]:
# Dimensions of the training feature matrix followed by those of the label vector
print(df_knn_features.shape, df_knn_y_train.shape, sep="\n")

In [None]:
# Hyper-parameter grid explored by the search
parameters = {'solver': ('adam', 'sgd'), 'activation': ('relu', 'tanh'),
              'hidden_layer_sizes': [2, 4], 'max_iter': [200, 400]}

# Fine-tune to find the best parameters. random_state fixes the MLP's random
# weight initialisation and batch sampling so the search result is reproducible.
nn = GridSearchCV(MLPClassifier(random_state=42), parameters)
nn.fit(df_knn_features, df_knn_y_train)

In [None]:
# Parameter combination selected by the grid search
nn.best_params_

In [None]:
# Score achieved by that best parameter combination during the search
nn.best_score_

In [None]:
# Validation labels and features for the neural network.
# 'suspect_age' is already dropped from the source frame when the data is cleaned,
# so errors='ignore' is needed to avoid a KeyError on the missing column.
y = df_knn_validation['target_class']
X = df_knn_validation.drop(columns=['target_class', 'suspect_age'], errors='ignore')

# Predict with the best estimator found by the grid search and report its accuracy
y_pred = nn.best_estimator_.predict(X)
acc_nn = accuracy_score(y, y_pred)
print(acc_nn)

# KNN

In [None]:
# k-nearest-neighbours classifier that votes over the 7 closest training points
knn = KNeighborsClassifier(7)
knn.fit(X=df_knn_features, y=df_knn_y_train)

In [None]:
# Predicted classes for the test features, followed by the mean accuracy on the test split
knn_test_predictions = knn.predict(df_knn_test)
print(knn_test_predictions)
print(knn.score(df_knn_test, df_knn_y_test))

In [None]:
# Validation labels and features for the KNN model
# (errors='ignore' because 'suspect_age' is already removed during data cleaning)
y = df_knn_validation['target_class']
X = df_knn_validation.drop(columns=['target_class', 'suspect_age'], errors='ignore')

# Predict with the KNN model itself. Previously y_pred was left over from the
# neural-network cell, so acc_knn silently reported the neural network's accuracy.
y_pred = knn.predict(X)
acc_knn = accuracy_score(y, y_pred)
print(acc_knn)

# XGBoost

In [None]:
# Split each dataset into its feature matrix (everything but the target) and label vector
X_train = df_train.drop(columns='target_class')
y_train = df_train['target_class']

X_validation = df_validation.drop(columns='target_class')
y_validation = df_validation['target_class']

X_test = df_test.drop(columns='target_class')
y_test = df_test['target_class']

In [None]:
# The model does not accept the suspect_age column's data type, so it must not be
# among the features. The column is already removed when the data is cleaned, which
# made a plain drop raise KeyError; errors='ignore' drops it only if it is present.
X_train = X_train.drop(columns='suspect_age', errors='ignore')
X_validation = X_validation.drop(columns='suspect_age', errors='ignore')
X_test = X_test.drop(columns='suspect_age', errors='ignore')

In [None]:
# Wrap each feature matrix and its labels in XGBoost's DMatrix container
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalidation = xgb.DMatrix(data=X_validation, label=y_validation)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Set up the XGBoost parameters
params = {
    # multi:softmax makes predict() return class labels directly
    'objective': 'multi:softmax',
    # target_class only takes the values 0 and 1, so there are two classes (was 3)
    'num_class': 2,
    'max_depth': 3,
    'learning_rate': 0.1,
    # 'n_estimators' was removed: it belongs to the scikit-learn wrapper and is not
    # used by xgb.train, where the number of rounds is set via num_boost_round.
}

In [None]:
# Datasets (with display names) whose metric is reported after every boosting round
watchlist = list(zip((dtrain, dvalidation), ('train', 'validation')))

In [None]:
# Boost for at most 100 rounds, with early stopping set to 10 rounds
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)

In [None]:
# Predict on the held-out test DMatrix with the trained booster
y_pred = bst.predict(data=dtest)

In [None]:
# Score the XGBoost test predictions: overall accuracy plus the confusion matrix
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)

# Random Forest

In [21]:
# Import libraries
import statsmodels.tools.tools as stattools
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Predictor columns, listed once and shared by the training and test feature matrices
rf_predictor_columns = [
    'latitude', 'longitude', 'n_guns_involved', 'group_Democrat', 'ohe_drug', 'ohe_officer', 'ohe_gang',
    'ohe_accident', 'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs', 'ohe_car_jacking',
    'ohe_defensive', 'ohe_robbery', 'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic', 'suspect_age_group_Teen', 'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
    'suspect_age_group_Adult', 'suspect_age_group_Senior', 'region_East South Central', 'region_Middle Atlantic', 'region_Mountain', 'region_New England', 'region_Pacific',
    'region_South Atlantic', 'region_West North Central', 'region_West South Central',
]

# Training data: predictors and target taken from the undersampled frame
x_rf = df_balanced[rf_predictor_columns]
y_rf = df_balanced['target_class']

# Test data: same predictors and target taken from the test split
x_test_rf = df_test[rf_predictor_columns]
y_test_rf = df_test['target_class']

In [23]:
# Flatten the response variable into a 1-D NumPy array
rfy = np.asanyarray(y_rf).ravel()

In [30]:
# Fit a 100-tree random forest using the Gini index criterion on the training data.
# random_state fixes the forest's random sampling so the fitted model and the
# accuracy reported below are reproducible across re-runs.
rf01 = RandomForestClassifier(n_estimators=100, criterion="gini", random_state=42).fit(x_rf, y_rf)

# Predicted classes for the test split
rf_predict = rf01.predict(x_test_rf)

In [33]:
# Accuracy of the random forest predictions on the test split
accuracy = accuracy_score(y_true=y_test_rf, y_pred=rf_predict)
print("Accuracy:", accuracy)

Accuracy: 0.7880740606280632


In [32]:
# Confusion matrix (actual rows vs. predicted columns) with row/column totals
conf_mat_rf = pd.crosstab(
    index=df_test['target_class'],
    columns=rf_predict,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
conf_mat_rf

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3509,931,4440
1,1404,5174,6578
All,4913,6105,11018
