In [59]:
import boto3
import sagemaker
import time
import io
from time import gmtime, strftime
!pip install xgboost
import xgboost as xgb
import pandas as pd
import numpy as np
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect
from tqdm import tqdm  

[0m

In [60]:
region = boto3.Session().region_name
session = boto3.session.Session()

ec2 = boto3.Session().client(service_name="ec2", region_name=region)
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

xgboost_container = sagemaker.image_uris.retrieve("xgboost", 
                                                  my_region, 
                                                  "latest")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [61]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

In [62]:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

Role name: LabRole


In [63]:
sess = sagemaker.Session()
bucket = '{}'.format(bucket)
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket

'sagemaker-us-east-1-705927414280'

In [64]:
# input training parameters
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=\
        's3://{}/train/'.format(bucket), content_type='csv')

In [65]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, 
                                    instance_count=1, 
                                    instance_type='ml.m5.large',
                                    output_path='s3://{}/xgboost_output'.format(bucket),
                                    sagemaker_session=sess)
# parse in the hyperparameters
xgb.set_hyperparameters(max_depth=5,eta=0.2,gamma=4,min_child_weight=6,
                        subsample=0.8,silent=0,
                        objective='binary:logistic',num_round=100)

In [66]:
xgb.fit({'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2023-04-16-20-04-48-987


2023-04-16 20:04:56 Starting - Starting the training job...
2023-04-16 20:05:11 Starting - Preparing the instances for training...
2023-04-16 20:05:58 Downloading - Downloading input data......
2023-04-16 20:06:39 Training - Downloading the training image...
2023-04-16 20:07:14 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2023-04-16:20:07:19:INFO] Running standalone xgboost training.[0m
[34m[2023-04-16:20:07:19:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-04-16:20:07:19:INFO] File size need to be processed in the node: 17.74mb. Available memory size in the node: 376.91mb[0m
[34m[2023-04-16:20:07:19:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:07:19] S3DistributionType set as FullyReplicated[0m
[34m[20:07:19] 198309x39 matrix with 7734051 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[20:07:20] src/tree/updater_prune.cc:74: tree pruning end,

In [67]:
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: xgboost-2023-04-16-20-08-45-333
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-04-16-20-08-45-333
INFO:sagemaker:Creating endpoint with name xgboost-2023-04-16-20-08-45-333


----!

In [68]:
test_path ="https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/df_test.csv"
train_path ="https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/df_train.csv"
train_df = pd.read_csv(train_path)
train_df
# test_df = pd.read_csv(test_path)
# cols = test_df.columns.tolist()
# cols.insert(0, cols.pop(cols.index('target_class')))
# test_df = test_df[cols]
# test_df.head()

Unnamed: 0,latitude,longitude,n_guns_involved,target_class,group_Democrat,ohe_drug,ohe_officer,ohe_gang,ohe_accident,ohe_murder,...,suspect_age_group_Adult,suspect_age_group_Senior,region_East South Central,region_Middle Atlantic,region_Mountain,region_New England,region_Pacific,region_South Atlantic,region_West North Central,region_West South Central
0,38.9943,-76.9921,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,40.8370,-96.7192,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,35.5976,-77.3774,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,40.6872,-89.6096,2.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36.5394,-119.2920,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198304,40.8680,-73.8959,1.0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
198305,41.2159,-75.9208,1.0,1,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,0,0
198306,38.5545,-90.4080,1.0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
198307,39.9323,-75.1179,2.0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [69]:
from sagemaker.serializers import CSVSerializer

# loading data into the array
test_array = test_df.drop(['target_class'], axis=1).values

# setting serializer type
xgb_predictor.serializer = CSVSerializer() 

# predicting
predictions = xgb_predictor.predict(test_array).decode('utf-8') 

# make predictions into an array
predictions_array = np.fromstring(predictions[1:], sep=',') 
print(predictions_array.shape)

(11018,)


In [70]:
cm = pd.crosstab(index=test_df['target_class'], 
                 columns=np.round(predictions_array), 
                 rownames=['Observed'], 
                 colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; 
fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Injury", 
                                    " Injured/Killed"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Injury", \
                                        tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Injured/Killed", \
                                        fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 80.7%

Predicted      No Injury       Injured/Killed
Observed
No Injury      81% (2994)    20% (1446)
Injured/Killed  19% (683)     80% (5895) 



In [71]:
# clean-up by deleteting endpoint
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2023-04-16-20-08-45-333
INFO:sagemaker:Deleting endpoint with name: xgboost-2023-04-16-20-08-45-333
