In [3]:
import boto3

region = boto3.Session().region_name
session = boto3.session.Session()

ec2 = boto3.Session().client(service_name="ec2", region_name=region)
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [22]:
import sagemaker
import time
from time import gmtime, strftime

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

In [23]:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

Role name: LabRole


In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install --disable-pip-version-check -q PyAthena==2.1.0
! pip install descartes
! pip install geopandas
!pip install wordcloud
from pyathena import connect
import geopandas as gpd
from tqdm import tqdm  
from geopandas import GeoDataFrame, points_from_xy
from wordcloud import WordCloud

[0m

In [25]:
sess = sagemaker.Session()
bucket = '{}'.format(bucket)
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket

'sagemaker-us-east-1-898900188658'

In [26]:
# Assign database name
database_name = "ads508"

In [27]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [28]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [29]:
# Create new database 'ads508'
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
create_db = pd.read_sql(statement, conn)
create_db

In [30]:
# Verify database creation
q = "SHOW DATABASES"
db_show = pd.read_sql(q, conn)
db_show

Unnamed: 0,database_name
0,ads508
1,default
2,dsoaws
3,sagemaker_featurestore


In [31]:
# Set Athena parameters
database_name = "ads508"
model_table_name_csv = "modeling"
model_s3_path = "s3://{}/modeling_data".format(bucket)
print(model_s3_path)

s3://sagemaker-us-east-1-898900188658/modeling_data


In [32]:
df = pd.read_csv("{}/data_for_modeling.csv".format(model_s3_path))
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,latitude,longitude,n_guns_involved,target_class,group_Democrat,suspect_age,ohe_drug,ohe_officer,ohe_gang,ohe_accident,...,suspect_age_group_Adult,suspect_age_group_Senior,region_East South Central,region_Middle Atlantic,region_Mountain,region_New England,region_Pacific,region_South Atlantic,region_West North Central,region_West South Central
0,40.3467,-79.8559,1.0,1,1,Adult 18+,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,40.4555,-79.897,1.0,1,1,,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,33.909,-118.333,1.0,1,1,,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
3,33.8447,-118.307,1.0,1,1,Adult 18+,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,33.9454,-118.399,1.0,1,1,,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [33]:
print(len(df))
df = df.dropna()
print(len(df))

220346
111278


In [34]:
# statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
#  latitude double,
#  longitude double,
#  n_guns_involved double,
#  target_class int,
#  group_Democrat int,
#  suspect_age string,
#  ohe_drug int,
#  ohe_officer int,
#  ohe_gang int,
#  ohe_accident int,
#  ohe_murder int,
#  ohe_suicide int,
#  ohe_arrest int,
#  ohe_brandishing int,
#  ohe_felon int,
#  ohe_drive int,
#  ohe_home_invasion int,
#  ohe_stolen int,
#  ohe_misc int,
#  ohe_drugs int,
#  ohe_car_jacking int,
#  ohe_defensive int,
#  ohe_robbery int,
#  ohe_family int,
#  ohe_institution int,
#  ohe_child int,
#  ohe_mass int,
#  ohe_domestic int,
#  suspect_age_group_Teen int,
#  suspect_age_group_Young_Adult int,
#  suspect_age_group_Mid-Adult int,
#  suspect_age_group_Adult int,
#  suspect_age_group_Senior int,
#  region_East_South_Central int,
#  region_Middle_Atlantic int,
#  region_Mountain int,
#  region_New_England int,
#  region_Pacific int,
#  region_South_Atlantic int,
#  region_West_North_Central int,
#  region_West_South_Central int
 
# ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
# TBLPROPERTIES ('skip.header.line.count'='1')""".format(
#     database_name, model_table_name_csv, model_s3_path
# )

# create_table = pd.read_sql(statement, conn)
# create_table

Displaying the previous count of the target class variable. 
We find that the class of 1 - which indicates someone was either injured
or killed - is the majority class.

In [35]:
# count the number of 0s and 1s in the 'outcome' column
value_counts = df['target_class'].value_counts()

# print the results
print(value_counts)

0    55953
1    55325
Name: target_class, dtype: int64


# Undersampling the majority target_class of '1' 

In [36]:
df_grouped_by = df.groupby(["target_class"])
df_balanced = df_grouped_by.apply(
    lambda x: x.sample(df_grouped_by.size().min())\
    .reset_index(drop=True)
)

In [37]:
# showing the balanced value counts of the newly created dataframe
value_counts_balanced = df_balanced['target_class'].value_counts()

# print the results
print(value_counts_balanced)

0    55325
1    55325
Name: target_class, dtype: int64


In [38]:
from sklearn.model_selection import train_test_split

# Split all data into 90% train and 10% holdout
df_train, df_holdout = train_test_split(
        df_balanced,
        test_size=0.10,
    stratify=df_balanced['target_class'])

# Split holdout data into 50% validation and 50% test
df_validation, df_test = train_test_split(
        df_holdout,
        test_size=0.50,
        stratify=df_holdout['target_class'])

In [41]:
print(df_test.columns)

Index(['latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'suspect_age', 'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident',
       'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing',
       'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc',
       'ohe_drugs', 'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery',
       'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass',
       'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [42]:
print(df_train.columns)

Index(['latitude', 'longitude', 'n_guns_involved', 'target_class',
       'group_Democrat', 'suspect_age', 'ohe_drug', 'ohe_officer', 'ohe_gang',
       'ohe_accident', 'ohe_murder', 'ohe_suicide', 'ohe_arrest',
       'ohe_brandishing', 'ohe_felon', 'ohe_drive', 'ohe_home_invasion',
       'ohe_stolen', 'ohe_misc', 'ohe_drugs', 'ohe_car_jacking',
       'ohe_defensive', 'ohe_robbery', 'ohe_family', 'ohe_institution',
       'ohe_child', 'ohe_mass', 'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [43]:
# Transfer Training Data to S3 bucket - exclude header and index
import io

s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
KEY='knn/train/train.csv'
#response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

df_y_train = df_train['target_class']
df_features = df_train.drop(columns=['target_class'])

#df_y_test = df_test['target_class']
#df_test = df_test.drop(columns=['target_class'])

with io.StringIO() as csv_buffer:
    df_train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

## Neural Network

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, plot_confusion_matrix

In [45]:
print(df_features.shape)
print(df_y_train.shape)
#print(df_features.columns)
#print(df_features.head(5))
df_features = df_features.drop(columns=['suspect_age'])

(99585, 40)
(99585,)


In [46]:
parameters = {'solver':('adam', 'sgd'), 'activation':('relu', 'tanh'), 
              'hidden_layer_sizes':[2, 4], 'max_iter': [200, 400]} 
nn = GridSearchCV(MLPClassifier(), parameters) #finetune t find best parameters
nn.fit(df_features, df_y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_gr

In [47]:
nn.best_params_ #best parameters

{'activation': 'tanh',
 'hidden_layer_sizes': 4,
 'max_iter': 200,
 'solver': 'adam'}

In [48]:
nn.best_score_ #best score

0.7966059145453633

## KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [50]:
print(df_test.columns)

Index(['latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'suspect_age', 'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident',
       'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing',
       'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc',
       'ohe_drugs', 'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery',
       'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass',
       'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [51]:
df_test = df_test.drop(columns=['suspect_age'])

In [52]:
print(df_test.columns)

Index(['latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident', 'ohe_murder',
       'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon',
       'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs',
       'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery', 'ohe_family',
       'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic',
       'suspect_age_group_Teen', 'suspect_age_group_Young Adult',
       'suspect_age_group_Mid-Adult', 'suspect_age_group_Adult',
       'suspect_age_group_Senior', 'region_East South Central',
       'region_Middle Atlantic', 'region_Mountain', 'region_New England',
       'region_Pacific', 'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [53]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(df_features, df_y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [54]:
print(df_test.columns)

Index(['latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident', 'ohe_murder',
       'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon',
       'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs',
       'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery', 'ohe_family',
       'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic',
       'suspect_age_group_Teen', 'suspect_age_group_Young Adult',
       'suspect_age_group_Mid-Adult', 'suspect_age_group_Adult',
       'suspect_age_group_Senior', 'region_East South Central',
       'region_Middle Atlantic', 'region_Mountain', 'region_New England',
       'region_Pacific', 'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [55]:
# Predict on dataset which model has not seen before
#print(df_test.columns)
#df_y_test = df_test['target_class']
#df_labels = df_test.drop(columns=['suspect_age'])
print(knn.predict(df_test))

print(knn.score(df_test, df_y_test))

[1 0 0 ... 0 1 0]
0.7771552503162841


In [None]:
"""
# load training data from S3 bucket

import io
import sagemaker.amazon.common as smac

print("train_features shape = ", train_features.shape)
print("train_labels shape = ", train_labels.shape)

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels)
buf.seek(0)
"""

In [None]:
"""
import boto3
import os
import sagemaker

bucket = sagemaker.Session().default_bucket()  # modify to your bucket name
prefix = "knn"
key = "knn-training-dataset"

boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = f"s3://{bucket}/{prefix}/train/{key}"
print(f"uploaded training data location: {s3_train_data}")
"""

In [56]:
#print(df_knn_train.columns)

NameError: name 'df_knn_train' is not defined

In [57]:
df_knn_train = df_train[['target_class', 'latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'suspect_age', 'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident',
       'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing',
       'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc',
       'ohe_drugs', 'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery',
       'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass',
       'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central']]

In [58]:
df_knn_train = df_train.drop(columns=['suspect_age'])

In [59]:
df_knn_train.columns

Index(['latitude', 'longitude', 'n_guns_involved', 'target_class',
       'group_Democrat', 'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident',
       'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing',
       'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc',
       'ohe_drugs', 'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery',
       'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass',
       'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')

In [60]:
df_knn_train = df_knn_train.reset_index(drop=True)
df_knn_train = df_knn_train[['target_class', 'latitude', 'longitude', 'n_guns_involved',
       'group_Democrat', 'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident',
       'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing',
       'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc',
       'ohe_drugs', 'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery',
       'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass',
       'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central']]
print(df_knn_train.head(5))

   target_class  latitude  longitude  n_guns_involved  group_Democrat  \
0             0   32.3190   -95.2115              2.0               0   
1             0   31.0952   -97.7210              1.0               0   
2             0   38.7931   -90.4085              1.0               1   
3             1   30.0066   -91.8308              1.0               0   
4             0   36.0412   -79.7727              1.0               1   

   ohe_drug  ohe_officer  ohe_gang  ohe_accident  ohe_murder  ...  \
0         0            1         0             0           0  ...   
1         0            0         0             0           0  ...   
2         0            0         1             0           0  ...   
3         0            0         0             0           0  ...   
4         0            0         0             0           0  ...   

   suspect_age_group_Adult  suspect_age_group_Senior  \
0                        0                         0   
1                        0        

In [61]:
# Transfer Training Data to S3 bucket
s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
KEY='knn/train/train-April10.csv'

with io.StringIO() as csv_buffer:
    df_knn_train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

In [62]:
"""
df_knn_test = df_test.reset_index(drop=True)
df_knn_test = df_knn_test[['target_class', 'latitude', 'longitude', 'n_guns_involved',
       'group_Democrat', 'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident',
       'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing',
       'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc',
       'ohe_drugs', 'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery',
       'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass',
       'ohe_domestic', 'suspect_age_group_Teen',
       'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior',
       'region_East South Central', 'region_Middle Atlantic',
       'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central',
       'region_West South Central']]
print(df_knn_test.head(5))
"""

KeyError: "['target_class'] not in index"

In [63]:
print(df_test.columns)

Index(['latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident', 'ohe_murder',
       'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon',
       'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs',
       'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery', 'ohe_family',
       'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic',
       'suspect_age_group_Teen', 'suspect_age_group_Young Adult',
       'suspect_age_group_Mid-Adult', 'suspect_age_group_Adult',
       'suspect_age_group_Senior', 'region_East South Central',
       'region_Middle Atlantic', 'region_Mountain', 'region_New England',
       'region_Pacific', 'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [64]:
# Transfer Training Data to S3 bucket
s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
KEY='knn/test/test-April10.csv'

with io.StringIO() as csv_buffer:
    df_knn_test.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

In [65]:
"""
s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
KEY='raw_files/knn/training_dataset.csv'
#response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

#df_target = df_train['target_class']
#df_knn_train = df_train.drop('target_class', axis=1)



with io.StringIO() as csv_buffer:
    df_knn_train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )
    """

'\ns3_client = boto3.client("s3")\nBUCKET=\'sagemaker-us-east-1-898900188658\'\nKEY=\'raw_files/knn/training_dataset.csv\'\n#response = s3_client.get_object(Bucket=BUCKET, Key=KEY)\n\n#df_target = df_train[\'target_class\']\n#df_knn_train = df_train.drop(\'target_class\', axis=1)\n\n\n\nwith io.StringIO() as csv_buffer:\n    df_knn_train.to_csv(csv_buffer, index=False, header=False)\n\n    response = s3_client.put_object(\n        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()\n    )\n    '

In [66]:
import matplotlib.pyplot as plt
import sagemaker
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.amazon.amazon_estimator import get_image_uri


def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None):
    """
    Create an Estimator from the given hyperparams, fit to training data,
    and return a deployed predictor

    """
    # set up the estimator
    knn = sagemaker.estimator.Estimator(
        get_image_uri(boto3.Session().region_name, "knn"),
        get_execution_role(),
        instance_count=1,
        instance_type="ml.m5.2xlarge",
        output_path=output_path,
        sagemaker_session=sagemaker.Session(),
    )
    knn.set_hyperparameters(**hyperparams)

    # train a model. fit_input contains the locations of the train and test data
    fit_input = {"train": s3_train_data}
    if s3_test_data is not None:
        fit_input["test"] = s3_test_data
    knn.fit(fit_input)
    return knn

In [67]:
# input training parameters
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=\
        's3://{}/knn/train/'.format(bucket), content_type='csv')

In [68]:
# input training parameters
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=\
        's3://{}/knn/test/'.format(bucket), content_type='csv')

In [69]:
print(df_test.columns)


Index(['latitude', 'longitude', 'n_guns_involved', 'group_Democrat',
       'ohe_drug', 'ohe_officer', 'ohe_gang', 'ohe_accident', 'ohe_murder',
       'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon',
       'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs',
       'ohe_car_jacking', 'ohe_defensive', 'ohe_robbery', 'ohe_family',
       'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic',
       'suspect_age_group_Teen', 'suspect_age_group_Young Adult',
       'suspect_age_group_Mid-Adult', 'suspect_age_group_Adult',
       'suspect_age_group_Senior', 'region_East South Central',
       'region_Middle Atlantic', 'region_Mountain', 'region_New England',
       'region_Pacific', 'region_South Atlantic', 'region_West North Central',
       'region_West South Central'],
      dtype='object')


In [70]:
# run actual training job
prefix = 'raw_files/train'
hyperparams = {"feature_dim": 54, "k": 10, "sample_size": 200000, "predictor_type": "classifier"}
output_path = f"s3://{bucket}/{prefix}"
"""
knn_estimator = trained_estimator_from_hyperparams(
    s3_train_data, hyperparams, output_path, s3_test_data=s3_test_data
)
"""
knn_estimator = trained_estimator_from_hyperparams(
    s3_input_train, hyperparams, output_path, s3_test_data=s3_input_test
)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: knn-2023-04-10-22-49-06-277


ClientError: An error occurred (AccessDeniedException) when calling the CreateTrainingJob operation: User: arn:aws:sts::898900188658:assumed-role/LabRole/SageMaker is not authorized to perform: sagemaker:CreateTrainingJob on resource: arn:aws:sagemaker:us-east-1:898900188658:training-job/knn-2023-04-10-22-49-06-277 with an explicit deny in an identity-based policy

In [None]:
def predictor_from_estimator(knn_estimator, estimator_name, instance_type, endpoint_name=None):
    knn_predictor = knn_estimator.deploy(
        initial_instance_count=1, instance_type=instance_type, endpoint_name=endpoint_name
    )
    knn_predictor.serializer = CSVSerializer()
    knn_predictor.deserializer = JSONDeserializer()
    return knn_predictor

In [None]:
import time

instance_type = "ml.m4.xlarge"
model_name = "knn_%s" % instance_type
endpoint_name = "knn-ml-m4-xlarge-%s" % (str(time.time()).replace(".", "-"))
print("setting up the endpoint..")
predictor = predictor_from_estimator(
    knn_estimator, model_name, instance_type, endpoint_name=endpoint_name
)

## Neural Network