In [2]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

# Session object reused later in this notebook for the S3 uploads and the estimator.
sess = sagemaker.Session()

# IAM role the notebook is running under; it is handed to the estimator below.
role = get_execution_role()
print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf

# Bucket and key prefix under which the training data and model output are stored.
bucket = 'your-bucket' # Replace with your own bucket name if needed
print(bucket)
prefix = 'guidelines/supervised' #Replace with the prefix under which you want to store the data if needed

In [11]:
!head data/train.csv -n 3

4,ohio," the oiu team will also work with municipal prosecutors to take potential criminal actions against business owners who do not follow the order, which includes the requirement that patrons remain seated while eating/drinking and that parties stay six feet apart"
4,missouri," 
limit any gatherings in any one location to 10 people or less


In [12]:
# Print the class names file; its line order defines the label indices built in the next cell.
!cat data/classes.txt

mask
social_distance
permission_reopen
workforce_allowed_to_work
gloves_requirement
work_from_home


In [13]:
# Map each 1-based line position in data/classes.txt (as a string) to the
# whitespace-stripped class name found on that line.
with open("data/classes.txt") as classes_file:
    index_to_label = {
        str(position): class_name.strip()
        for position, class_name in enumerate(classes_file.readlines(), start=1)
    }
print(index_to_label)

{'1': 'mask', '2': 'social_distance', '3': 'permission_reopen', '4': 'workforce_allowed_to_work', '5': 'gloves_requirement', '6': 'work_from_home'}


In [14]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
# Download the 'punkt' tokenizer data; the captured output shows this is a no-op when it is already present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
def transform_instance(row):
    """Turn one CSV row into a list of tokens that starts with its label.

    ``row[0]`` is looked up in ``index_to_label`` and prefixed with
    ``__label__``. ``row[1]`` and ``row[2]`` are lower-cased and tokenized with
    ``nltk.word_tokenize``; their tokens follow the label in that order.
    (In the sample data shown above, these look like the state name and the
    guideline text — confirm against the real file.)

    Returns:
        list[str]: ``[label, *tokens_of_row1, *tokens_of_row2]``.
    """
    tagged_label = "__label__" + index_to_label[row[0]]
    second_field_tokens = nltk.word_tokenize(row[1].lower())
    third_field_tokens = nltk.word_tokenize(row[2].lower())
    return [tagged_label, *second_field_tokens, *third_field_tokens]

In [16]:
def preprocess(input_file, output_file, keep=1):
    """Convert a labelled CSV file into a space-separated token file.

    Reads every row of ``input_file``, shuffles the rows, keeps the leading
    ``keep`` fraction of them, runs ``transform_instance`` over the kept rows in
    a process pool, and writes one space-separated line per row to
    ``output_file``.

    Args:
        input_file: Path of the comma-delimited CSV file to read.
        output_file: Path of the text file to write (opened in ``'w'`` mode).
        keep: Fraction of the shuffled rows to retain; ``1`` keeps all of them.
    """
    # newline='' lets the csv module handle line endings itself, which matters
    # for quoted fields that contain embedded newlines.
    with open(input_file, 'r', newline='') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        # Blank lines are parsed as empty lists; skip them because
        # transform_instance indexes row[0] and would fail on them.
        all_rows = [row for row in csv_reader if row]
    shuffle(all_rows)
    all_rows = all_rows[:int(keep*len(all_rows))]

    pool = Pool(processes=multiprocessing.cpu_count())
    try:
        transformed_rows = pool.map(transform_instance, all_rows)
    finally:
        # Shut the workers down even when a row fails to transform, so a bad
        # input file does not leave worker processes behind in the kernel.
        pool.close()
        pool.join()

    with open(output_file, 'w', newline='') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [17]:
%%time

# Preparing the training dataset (keep=1 retains every row after shuffling)
preprocess('data/train.csv', 'guidelines.train', keep=1)
        
# Preparing the validation dataset (keep is left at its default of 1)
preprocess('data/test.csv', 'guidelines.validation')

CPU times: user 72.4 ms, sys: 30.3 ms, total: 103 ms
Wall time: 979 ms


In [24]:
%%time

# S3 key prefixes for the two data channels, both nested under `prefix`.
train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

# Upload the preprocessed files under their respective channel prefixes.
sess.upload_data(path='guidelines.train', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='guidelines.validation', bucket=bucket, key_prefix=validation_channel)

# Prefix-level S3 URIs (not individual object URIs) that are passed as training inputs later.
s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)

CPU times: user 84.2 ms, sys: 4.85 ms, total: 89 ms
Wall time: 431 ms


In [18]:
# S3 URI handed to the estimator as `output_path`.
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

## Training
Now that we are done with all the setup that is needed, we are ready to train our text classifier. To begin, let us create a ``sagemaker.estimator.Estimator`` object. This estimator will launch the training job.

In [3]:
# Region of the default boto3 session; used to look up the algorithm container image below.
region_name = boto3.Session().region_name
print(region_name)

In [4]:
# Resolve the BlazingText container image for this region ("latest" tag).
# NOTE(review): this is the SageMaker Python SDK v1 helper; the deprecation
# messages captured elsewhere in this notebook indicate v2 renames parts of this API.
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

## Training the BlazingText model for supervised text classification

In [21]:
# Generic estimator wrapping the BlazingText container: one ml.c4.4xlarge
# training instance, File input mode, artifacts written to `s3_output_location`.
# NOTE(review): train_volume_size is presumably in GB and train_max_run in
# seconds (360000 s = 100 h) — confirm against the SDK v1 documentation.
guidelines_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


Please refer to [algorithm documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext_hyperparameters.html) for the complete list of hyperparameters.

In [22]:
# Configure BlazingText for supervised mode (text classification) with early
# stopping enabled. See the algorithm documentation linked above for the exact
# meaning and valid range of each hyperparameter.
guidelines_model.set_hyperparameters(mode="supervised",
                            epochs=100,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=20,
                            early_stopping=True,
                            patience=6,
                            min_epochs=5,
                            word_ngrams=1)

In [25]:
# Describe the two S3 prefixes as plain-text training inputs, each fully
# replicated, and expose them under the channel names 'train' and 'validation'.
train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [5]:
# Launch the training job on the two channels; logs=True asks for the job logs to be shown in the cell output.
guidelines_model.fit(inputs=data_channels, logs=True)

## Hosting / Inference
Once the training is done, we can deploy the trained model as an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model. Note that we don't have to host on the same type of instance that we used to train. Because endpoint instances stay up and running for a long time, it's advisable to choose a cheaper instance type for inference.

In [22]:
# Deploy the trained model to a single ml.m4.xlarge endpoint.
# NOTE(review): 'name' is a placeholder endpoint name; the inference section
# below repeats the same literal in `endpoint_name`, so keep the two in sync.
text_classifier = guidelines_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge', endpoint_name = 'name')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------!

## Inference

#### Use JSON format for inference
BlazingText supports `application/json` as the content-type for inference. The payload should contain a list of sentences with the key as "**instances**" while being passed to the endpoint.

In [2]:
import sagemaker
from sagemaker import get_execution_role
# Re-create the session so the inference section can be run without the training cells above.
sess = sagemaker.Session()

In [7]:
# Attach a predictor to the already-deployed endpoint by name.
# NOTE(review): 'name' must match the endpoint_name used in the deploy cell.
endpoint_name = 'name'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name, sagemaker_session=sess)

In [1]:
import json
import nltk
# Download the 'punkt' tokenizer data needed before tokenizing the inference inputs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import pandas as pd
# Load the guidelines file and normalise the text column: every value is
# coerced to str and lower-cased in a single pass.
guidelines = pd.read_csv('inputfile.csv', encoding='latin')
guidelines['Guidelines'] = guidelines['Guidelines'].apply(lambda text: str(text).lower())
guidelines.head(1)

Unnamed: 0,State,Guidelines_Date,Guidelines,Links,LastDateRefreshed
0,alabama,09/02/2020,governor ivey issued an amended safer at home ...,https://governor.alabama.gov/assets/2020/08/Sa...,2020-09-08 07:20:28


In [4]:
# Select the guideline text by column name instead of by position, so the
# selection does not silently pick another column if the input file's column
# order changes. The double brackets keep the result a one-column DataFrame.
guide_lines = guidelines[['Guidelines']]

In [5]:
# Display the first guideline text as a sanity check on the selection above.
guide_lines.Guidelines[0]

'governor ivey issued an amended safer at home order to extend the statewide mask mandate and existing covid-19 health order until october 2. the order requires every person wear a mask or face covering over their mouth and nose when within six feet of a person from another household in public, or in an outdoor space where more than ten people are gathered together.. '

In [38]:
# Column order of the per-label probability table built below.
label_columns = ['__label__mask', '__label__social_distance', '__label__work_from_home',
       '__label__workforce_allowed_to_work', '__label__gloves_requirement',
       '__label__permission_reopen']

# One {label: probability} record per guideline, in the same row order as `guide_lines`.
prediction_rows = []
for guideline_text in guide_lines['Guidelines']:
    # Each guideline is sent as ONE instance. Iterating over the string itself
    # would tokenize it character by character, and predictions[0] would then
    # be the prediction for the first character only.
    tokenized_guideline = ' '.join(nltk.word_tokenize(guideline_text))
    # k=6 requests the probability of all six classes.
    payload = {"instances" : [tokenized_guideline], "configuration": {"k": 6}}
    response = predictor.predict(json.dumps(payload))
    predictions = json.loads(response)
    prediction_rows.append(dict(zip(predictions[0]['label'], predictions[0]['prob'])))

# Build the frame once instead of concatenating inside the loop (which is
# quadratic); the default RangeIndex lines up with `guidelines` for the
# column-wise concat that follows.
final_df = pd.DataFrame(prediction_rows, columns=label_columns)

In [53]:
# Append the per-label probability columns to the original guidelines, aligned on the row index.
final_result_df = pd.concat([guidelines, final_df], axis=1)

In [55]:
# Write the predictions to S3.
# NOTE(review): 'bucket' in this URI is a literal placeholder, not the `bucket`
# variable defined at the top of the notebook — replace it before running.
# Writing to an s3:// path presumably needs an S3 filesystem package such as s3fs; confirm.
final_result_df.to_csv('s3://bucket/guidelines/predicted_file/predicted_guidelines_latest.csv', index=False)

In [56]:
# Local copy of the same predictions, written without the index column.
final_result_df.to_csv('predicted_guidelines_latest.csv', index=False)

In [112]:
# Second local copy under a different file name.
# NOTE(review): same data as 'predicted_guidelines_latest.csv' — confirm both files are needed.
final_result_df.to_csv('predicted_guidelines.csv', index=False)

In [8]:
# Two sample sentences sent in a single request; each list element is one instance.
sentences = ["the state is experiencing a sharp increase in covid-19 cases, and currently masks are not mandated but are only recommended",
            "the order will be replaced with an order allowing those businesses to voluntarily reopen under phase 1 rules in the north dakota smart restart plan"]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]
#print(tokenized_sentences)
# "k": 6 asks for the top six labels per instance instead of only the best one.
payload = {"instances" : tokenized_sentences,
            "configuration": {"k": 6}}

response = predictor.predict(json.dumps(payload))

# The endpoint answers with JSON; pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

In [58]:
# Single sample sentence; without a "configuration" entry only the top label is returned (see output below).
sentences = ["the state is experiencing a sharp increase in covid-19 cases, and currently masks are not mandated but are only recommended"]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]
#print(tokenized_sentences)
payload = {"instances" : tokenized_sentences}

response = predictor.predict(json.dumps(payload))

# The endpoint answers with JSON; pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.685702383518219
    ],
    "label": [
      "__label__mask"
    ]
  }
]


In [59]:
# Single sample sentence; without a "configuration" entry only the top label is returned.
sentences = ["the order will be replaced with an order allowing those businesses to voluntarily reopen under phase 1 rules in the north dakota smart restart plan"]
# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]
payload = {"instances" : tokenized_sentences}

# Use `predictor` (created in this inference section) rather than
# `text_classifier`, which only exists in a kernel that ran the deploy cell.
# Both point at the same endpoint.
response = predictor.predict(json.dumps(payload))

# The endpoint answers with JSON; pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))


[
  {
    "prob": [
      0.9651257991790771
    ],
    "label": [
      "__label__permission_reopen"
    ]
  }
]


In [52]:
# Single sample sentence; without a "configuration" entry only the top label is returned.
sentences = ["the funds may also be used to offset certain covid-19-related expenses, including: costs associated with deep cleaning and sanitization; technology to implement remote working; equipment which promotes social distancing; and costs related to modifying operations in order to continue to operate."]

# using the same nltk tokenizer that we used during data preparation for training
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]
payload = {"instances" : tokenized_sentences}

# Use `predictor` (created in this inference section) rather than
# `text_classifier`, which only exists in a kernel that ran the deploy cell.
# Both point at the same endpoint.
response = predictor.predict(json.dumps(payload))

# The endpoint answers with JSON; pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.5743077397346497
    ],
    "label": [
      "__label__workforce_allowed_to_work"
    ]
  }
]


By default, the model will return only one prediction, the one with the highest probability. For retrieving the top k predictions, you can set `k` in the configuration as shown below:

In [28]:
# Single sample sentence, tokenized the same way as the training data.
sentences = [" any individual who is over age two and able to medically tolerate a face-covering (a mask or cloth face-covering) shall be required to cover their nose and mouth with a face-covering when in a public place and unable to maintain a six-foot social distance"]
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

# "k": 6 asks for the top six labels, i.e. every class (see output below).
payload = {"instances" : tokenized_sentences,
          "configuration": {"k": 6}}

response = predictor.predict(json.dumps(payload))

# The endpoint answers with JSON; pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.6654945611953735,
      0.254910409450531,
      0.03512028604745865,
      0.033700861036777496,
      0.005965623073279858,
      0.004868263844400644
    ],
    "label": [
      "__label__mask",
      "__label__social_distance",
      "__label__work_from_home",
      "__label__workforce_allowed_to_work",
      "__label__gloves_requirement",
      "__label__permission_reopen"
    ]
  }
]


In [63]:
# Single sample sentence, tokenized the same way as the training data.
sentences = ["governor ivey issued an amended safer at home order to extend the statewide mask mandate and existing covid-19 health order until october 2. the order requires every person wear a mask or face covering over their mouth and nose when within six feet of a person from another household in public, or in an outdoor space where more than ten people are gathered together."]
tokenized_sentences = [' '.join(nltk.word_tokenize(sent)) for sent in sentences]

# "k": 2 asks for the two most probable labels.
payload = {"instances" : tokenized_sentences,
          "configuration": {"k": 2}}

# Use `predictor` (created in this inference section) rather than
# `text_classifier`, which only exists in a kernel that ran the deploy cell.
# Both point at the same endpoint.
response = predictor.predict(json.dumps(payload))

# The endpoint answers with JSON; pretty-print it.
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "prob": [
      0.8257442712783813,
      0.166083425283432
    ],
    "label": [
      "__label__mask",
      "__label__social_distance"
    ]
  }
]


### Stop / Close the Endpoint (Optional)
Finally, we should delete the endpoint before we close the notebook if we don't need to keep the endpoint running for serving realtime predictions.

In [64]:
# Delete the endpoint by the name set in the inference section, so cleanup
# also works in a kernel where the deploy cell (and `text_classifier`) was
# never run.
sess.delete_endpoint(endpoint_name)