In [1]:
%mkdir ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

mkdir: cannot create directory ‘../data’: File exists
--2021-12-26 18:06:10--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘../data/aclImdb_v1.tar.gz’


2021-12-26 18:06:11 (59.5 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



## Load and Prepare Data

In [2]:
import os
import glob

def read_imdb_data(data_dir='../data/aclImdb'):
    """Read the IMDb reviews stored under ``data_dir`` into nested dictionaries.

    Files are looked up as ``<data_dir>/<split>/<sentiment>/*.txt`` for the
    splits 'train' and 'test' and the sentiments 'pos' and 'neg'.

    Args:
        data_dir: Root directory of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple. ``data[split][sentiment]`` is the list of
        review texts and ``labels[split][sentiment]`` the parallel list of
        labels (1 for 'pos', 0 for 'neg').
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            # Here we represent a positive review by '1' and a negative review by '0'
            label = 1 if sentiment == 'pos' else 0

            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            # glob returns matches in arbitrary order; sort so that the result
            # is the same on every run and every filesystem.
            files = sorted(glob.glob(path))

            reviews = []
            for f in files:
                # Assumes the review files are UTF-8 (TODO confirm); without an
                # explicit encoding the result depends on the platform locale.
                with open(f, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            labels[data_type][sentiment] = [label] * len(reviews)

            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [3]:
# Load every review from disk and report how many each split/sentiment holds.
data, labels = read_imdb_data()
review_counts = [len(data[split][sentiment])
                 for split in ('train', 'test')
                 for sentiment in ('pos', 'neg')]
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(*review_counts))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [4]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=42):
    """Prepare training and test sets from IMDb movie reviews.

    Args:
        data: Nested dict ``data[split][sentiment]`` of review texts.
        labels: Nested dict ``labels[split][sentiment]`` of matching labels.
        random_state: Seed passed to ``sklearn.utils.shuffle``. A fixed seed
            keeps the row order identical between runs, so labels stay aligned
            with features cached by an earlier run. Pass ``None`` for a
            different order on every call.

    Returns:
        ``(data_train, data_test, labels_train, labels_test)`` where each pair
        of data/labels has been shuffled together.
    """

    # Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle reviews and corresponding labels within training and test sets
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    # Return unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test

In [5]:
# Flatten the per-sentiment lists into shuffled train/test sets and report their sizes.
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

IMDb reviews (combined): train = 25000, test = 25000


In [6]:
# Show one raw review from the shuffled training set.
train_X[100]

"I saw Brigadoon on TV last night (12 Sept 2009). I am 61 years old and have been watching films as long as I can remember. I can truthfully say that Brigadoon stands alone as by far and away the worst film I have ever seen. The accents were shameful. The local children's club would have produced better sets. The characters were so wooden that they probably contracted dry rot from the tears of the patrons who had the misfortune to watch them. It is to be hoped that the stars of this film had hides thick enough to protect them from the embarrassment which they must have suffered on seeing this film. The owners of this tripe should perform a great service to mankind and destroy all copies of this film."

## Process the Data

In [7]:
import re

# Raw strings are used so that regex escapes such as \s and \[ are not treated as
# (invalid) Python string escapes, which newer interpreters warn about.

# Punctuation that is deleted outright (e.g. "children's" -> "childrens").
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# A doubled HTML line break, a hyphen or a slash is replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")

def review_to_words(review):
    """Normalise a review: lower-case it and strip punctuation.

    Args:
        review: Raw review text.

    Returns:
        The lower-cased text with the REPLACE_NO_SPACE characters removed and
        every REPLACE_WITH_SPACE match turned into a single space. The result
        is still one string; it is not split into tokens.
    """
    # NOTE(review): only the doubled "<br /><br />" form is recognised; a single
    # "<br />" loses just its slash. Confirm whether that is intended.
    words = REPLACE_NO_SPACE.sub("", review.lower())
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words

In [8]:
# Apply the text normalisation to a single review to check the result.
review_to_words(train_X[100])

'i saw brigadoon on tv last night 12 sept 2009 i am 61 years old and have been watching films as long as i can remember i can truthfully say that brigadoon stands alone as by far and away the worst film i have ever seen the accents were shameful the local childrens club would have produced better sets the characters were so wooden that they probably contracted dry rot from the tears of the patrons who had the misfortune to watch them it is to be hoped that the stars of this film had hides thick enough to protect them from the embarrassment which they must have suffered on seeing this film the owners of this tripe should perform a great service to mankind and destroy all copies of this film'

In [9]:
import pickle

# Directory shared by the caching helpers in this notebook.
cache_dir = os.path.join("../cache", "sentiment_xgboost")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="prepocessed_data_xgboost.pkl"):
    """Convert each review to words; read from cache if available.

    Args:
        data_train: Iterable of raw training reviews.
        data_test: Iterable of raw test reviews.
        labels_train: Labels matching ``data_train``.
        labels_test: Labels matching ``data_test``.
        cache_dir: Directory that holds the cache file.
        cache_file: Cache file name, or ``None`` to disable caching.

    Returns:
        ``(words_train, words_test, labels_train, labels_test)``. When the
        cache is read successfully, all four values come from the cache and
        the arguments are ignored.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_path is not None:
        try:
            # NOTE: unpickling can execute arbitrary code; only load cache files
            # that were written by this notebook.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, but that's okay
        except Exception as err:
            # Unreadable or corrupt cache: say so and recompute. Unlike a bare
            # `except`, this does not swallow KeyboardInterrupt/SystemExit.
            print("Could not read cache file {} ({}); recomputing".format(cache_file, err))
            cache_data = None

    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]

        # Write to cache file for future runs
        if cache_path is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(cache_path, "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])

    return words_train, words_test, labels_train, labels_test

In [10]:
# Preprocess data. This rebinds train_X/test_X from raw reviews to cleaned text,
# so the cell depends on the values left by the cells above.
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Wrote preprocessed data to cache file: prepocessed_data_xgboost.pkl


## Extract Bag-of-Words Features

In [13]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import joblib

def extract_BoW_features(words_train, words_test, vocabulary_size=5000,
                         cache_dir=cache_dir, cache_file="bow_features.pkl"):
    """Extract Bag-of-Words for a given set of documents, already preprocessed into words.

    Args:
        words_train: Training documents used to fit the vectorizer.
        words_test: Test documents transformed with the fitted vectorizer.
        vocabulary_size: Maximum number of vocabulary entries to keep.
        cache_dir: Directory that holds the cache file.
        cache_file: Cache file name, or ``None`` to disable caching.

    Returns:
        ``(features_train, features_test, vocabulary)``: two dense count
        arrays and the vectorizer's word -> column-index mapping. When the
        cache is read successfully, all three come from the cache and the
        arguments are ignored.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, but that's okay
        except Exception as err:
            # Unreadable or corrupt cache: say so and recompute instead of
            # silently hiding the problem (and without catching KeyboardInterrupt).
            print("Could not read cache file {} ({}); recomputing".format(cache_file, err))
            cache_data = None

    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Fit a vectorizer to training documents and use it to transform them
        vectorizer = CountVectorizer(max_features=vocabulary_size)
        # .toarray() turns the vectorizer output into a dense array
        features_train = vectorizer.fit_transform(words_train).toarray()

        # Apply the same vectorizer to transform the test documents (ignore unknown words)
        features_test = vectorizer.transform(words_test).toarray()

        # Bind the vocabulary unconditionally: it is returned even when caching
        # is disabled (previously that path hit an unbound local variable).
        vocabulary = vectorizer.vocabulary_

        # Write to cache file for future runs (store vocabulary as well)
        if cache_path is not None:
            cache_data = dict(features_train=features_train, features_test=features_test,
                             vocabulary=vocabulary)
            with open(cache_path, "wb") as f:
                joblib.dump(cache_data, f)
            print("Wrote features to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        features_train, features_test, vocabulary = (cache_data['features_train'],
                cache_data['features_test'], cache_data['vocabulary'])

    # Return both the extracted features as well as the vocabulary
    return features_train, features_test, vocabulary

In [14]:
# Extract Bag of Words features for both training and test datasets.
# This rebinds train_X/test_X from cleaned text to count arrays.
train_X, test_X, vocabulary = extract_BoW_features(train_X, test_X)

Read features from cache file: bow_features.pkl


In [15]:
# Width of one feature row; extract_BoW_features caps the vocabulary at 5000 by default.
len(train_X[100])

5000

## Upload Data to S3

In [16]:
import pandas as pd
import sagemaker
import boto3
import numpy as np
import os
from sagemaker.amazon.amazon_estimator import get_image_uri

In [17]:
# Hold out the first 10,000 training rows as a validation set; the rest stays for training.
n_validation = 10000

val_X, train_X = pd.DataFrame(train_X[:n_validation]), pd.DataFrame(train_X[n_validation:])
val_y, train_y = pd.DataFrame(train_y[:n_validation]), pd.DataFrame(train_y[n_validation:])

In [18]:
# Local staging directory for the CSV files written below.
data_dir = '../data/sentiment_web_app'
# exist_ok=True matches the cache_dir setup and avoids the check-then-create race.
os.makedirs(data_dir, exist_ok=True)

In [19]:
def _write_csv(frame, filename):
    """Write ``frame`` to ``data_dir/filename`` without header or index."""
    frame.to_csv(os.path.join(data_dir, filename), header=False, index=False)

# The test file holds features only; validation and training files carry the
# label in the first column, followed by the features.
_write_csv(pd.DataFrame(test_X), 'test.csv')
_write_csv(pd.concat([val_y, val_X], axis=1), 'validation.csv')
_write_csv(pd.concat([train_y, train_X], axis=1), 'train.csv')

In [37]:
# SageMaker and boto3 sessions, the execution role, and the S3 location used below.
sagemaker_session = sagemaker.Session()
boto_session = boto3.session.Session()
role = sagemaker.get_execution_role()
# The name `session` is never defined in this notebook; the bucket must come
# from the SageMaker session created above.
bucket = sagemaker_session.default_bucket()
region = boto_session.region_name
prefix = 'sagemaker/sentiment_xgboost'

In [40]:
# Print the session objects and S3 settings, one per line.
session_summary = [
    ("Boto Session: ", boto_session),
    ("Sagemaker Session: ", sagemaker_session),
    ("Role :", role),
    ("S3 Bucket :", bucket),
    ("S3 Prefix :", prefix),
]
print("\n".join("{} {}".format(caption, item) for caption, item in session_summary))

Boto Session:  Session(region_name='us-west-2')
Sagemaker Session:  <sagemaker.session.Session object at 0x7f0931ffa978>
Role : arn:aws:iam::904606187431:role/service-role/AmazonSageMaker-ExecutionRole-20211223T092713
S3 Bucket : sagemaker-us-west-2-904606187431
S3 Prefix : sagemaker/sentiment_xgboost


In [41]:
# Upload the three CSV files under the shared key prefix, keeping each returned S3 location.
test_location, val_location, train_location = (
    sagemaker_session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

## Create the XGBoost Model

In [43]:
training_instance_type = 'ml.m4.xlarge'

# Look up the XGBoost container image for this region and instance type.
xgboost_image_spec = dict(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type=training_instance_type,
)
image_uri = sagemaker.image_uris.retrieve(**xgboost_image_spec)

In [45]:
# First we create a SageMaker estimator object for our model.
# `session` is never defined in this notebook, so the output bucket and the
# session argument both use `sagemaker_session`.
xgb = sagemaker.estimator.Estimator(image_uri=image_uri, # The location of the container we wish to use
                                    role=role,                                    # What is our current IAM Role
                                    instance_count=1,                  # How many compute instances
                                    instance_type=training_instance_type,      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(sagemaker_session.default_bucket(), prefix),
                                    sagemaker_session=sagemaker_session)

# And then set the algorithm specific parameters.
# NOTE(review): `silent` may be superseded by `verbosity` in this XGBoost version — confirm.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

## Train the Model

In [48]:
# Show where each uploaded CSV file ended up, one location per line.
print(test_location, val_location, train_location, sep="\n")

s3://sagemaker-us-west-2-904606187431/sagemaker/sentiment_xgboost/test.csv
s3://sagemaker-us-west-2-904606187431/sagemaker/sentiment_xgboost/validation.csv
s3://sagemaker-us-west-2-904606187431/sagemaker/sentiment_xgboost/train.csv


In [49]:
from sagemaker.inputs import TrainingInput

# Wrap the training and validation S3 locations as CSV training inputs.
train_input, val_input = (
    TrainingInput(s3_uri, content_type="text/csv")
    for s3_uri in (train_location, val_location)
)

In [50]:
# Start training with one channel for training data and one for validation data.
training_channels = {'train': train_input, 'validation': val_input}
xgb.fit(training_channels)

2021-12-26 18:53:22 Starting - Starting the training job...
2021-12-26 18:53:24 Starting - Launching requested ML instancesProfilerReport-1640544802: InProgress
......
2021-12-26 18:54:46 Starting - Preparing the instances for training.........
2021-12-26 18:56:16 Downloading - Downloading input data...
2021-12-26 18:56:46 Training - Downloading the training image...
2021-12-26 18:57:11 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:r

## Perform a Batch Transform Job to Test

In [52]:
# Build a batch transformer from the trained estimator, using a single instance.
xgb_transformer = xgb.transformer(instance_count=1, 
                                  instance_type='ml.m4.xlarge')

In [53]:
# Run the batch transform over the uploaded test CSV and wait for it.
# NOTE(review): split_type='Line' presumably treats each CSV line as one record — confirm.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

..................................[34m[2021-12-26:19:09:07:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-12-26:19:09:07:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-12-26:19:09:07:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }
    loc

In [60]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-west-2-904606187431/sagemaker-xgboost-2021-12-26-19-03-37-766/test.csv.out to ../data/sentiment_web_app/test.csv.out


In [61]:
# Read the raw model scores (one per line) and round each to the nearest class label.
raw_scores = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(score) for score in raw_scores.squeeze().values]

In [62]:
from sklearn.metrics import accuracy_score
# Compare the rounded predictions with the true test labels.
accuracy_score(test_y, predictions)

0.86036

## Create Serverless Endpoint

In [57]:
from time import gmtime, strftime


# Low-level clients: one for managing SageMaker resources, one for invoking endpoints.
client = boto3.client(service_name="sagemaker")
runtime = boto3.client(service_name="sagemaker-runtime")

# Give the model a timestamped name.
creation_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "xgboost-serverless" + creation_stamp
print("Model name: " + model_name)

# S3 location of the artifacts produced by the training job.
model_artifacts = xgb.model_data
print(model_artifacts)

# environment variables
container_env_vars = {"SAGEMAKER_CONTAINER_LOG_LEVEL": "20"}

# Single-model container definition reusing the training image.
primary_container = {
    "Image": image_uri,
    "Mode": "SingleModel",
    "ModelDataUrl": model_artifacts,
    "Environment": container_env_vars,
}

create_model_response = client.create_model(
    ModelName=model_name,
    Containers=[primary_container],
    ExecutionRoleArn=role,
)

print("Model Arn: " + create_model_response["ModelArn"])

Model name: xgboost-serverless2021-12-26-19-15-00
s3://sagemaker-us-west-2-904606187431/sagemaker/sentiment_xgboost/output/sagemaker-xgboost-2021-12-26-18-53-22-665/output/model.tar.gz
Model Arn: arn:aws:sagemaker:us-west-2:904606187431:model/xgboost-serverless2021-12-26-19-15-00


### Configure the Endpoint

In [59]:
# Timestamped name for the endpoint configuration.
xgboost_epc_name = "xgboost-serverless-epc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Serverless settings: memory size and maximum concurrent invocations.
serverless_config = {
    "MemorySizeInMB": 4096,
    "MaxConcurrency": 1,
}

# A single production variant serving the model created above.
serverless_variant = {
    "VariantName": "byoVariant",
    "ModelName": model_name,
    "ServerlessConfig": serverless_config,
}

endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=xgboost_epc_name,
    ProductionVariants=[serverless_variant],
)

print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

Endpoint Configuration Arn: arn:aws:sagemaker:us-west-2:904606187431:endpoint-config/xgboost-serverless-epc2021-12-26-19-16-08


In [63]:
# Timestamped endpoint name; the endpoint is created from the configuration defined above.
endpoint_name = "xgboost-serverless-ep" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=xgboost_epc_name,
)

# Print the ARN returned by the create call.
print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

Endpoint Arn: arn:aws:sagemaker:us-west-2:904606187431:endpoint/xgboost-serverless-ep2021-12-26-19-18-11


In [64]:
# wait for endpoint to reach a terminal state (InService) using describe endpoint
import time

describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)

# Poll only while the endpoint is still being created. Sleeping *before* each
# re-poll means no time is wasted once a terminal status has been seen.
while describe_endpoint_response["EndpointStatus"] == "Creating":
    time.sleep(15)
    describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
    print(describe_endpoint_response["EndpointStatus"])

# Any final status other than InService (e.g. Failed) means the invocation
# cells below cannot work, so stop here with the reported reason.
if describe_endpoint_response["EndpointStatus"] != "InService":
    raise RuntimeError(
        "Endpoint {} ended in status {}: {}".format(
            endpoint_name,
            describe_endpoint_response["EndpointStatus"],
            describe_endpoint_response.get("FailureReason", "no failure reason reported"),
        )
    )

describe_endpoint_response

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService


{'EndpointName': 'xgboost-serverless-ep2021-12-26-19-18-11',
 'EndpointArn': 'arn:aws:sagemaker:us-west-2:904606187431:endpoint/xgboost-serverless-ep2021-12-26-19-18-11',
 'EndpointConfigName': 'xgboost-serverless-epc2021-12-26-19-16-08',
 'ProductionVariants': [{'VariantName': 'byoVariant',
   'DeployedImages': [{'SpecifiedImage': '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',
     'ResolvedImage': '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost@sha256:04889b02181f14632e19ef6c2a7d74bfe699ff4c7f44669a78834bc90b77fe5a',
     'ResolutionTime': datetime.datetime(2021, 12, 26, 19, 18, 13, 756000, tzinfo=tzlocal())}],
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 0,
   'CurrentServerlessConfig': {'MemorySizeInMB': 4096, 'MaxConcurrency': 1}}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2021, 12, 26, 19, 18, 11, 785000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 12, 26

## Test Endpoint

In [83]:
# A longer alternative review that can be swapped in for the one-word example below:
# test_review = "Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs!"

test_review="Crappy"


In [84]:
# Normalise the test review the same way as the training data.
test_words = review_to_words(test_review)
print(test_words)

crappy


In [85]:
def bow_encoding(words, vocabulary):
    """Encode a whitespace-separated string as a Bag-of-Words count vector.

    Args:
        words: Text whose whitespace-separated tokens are counted.
        vocabulary: Mapping from word to its column index in the vector.

    Returns:
        A list with ``len(vocabulary)`` entries holding the count of each
        vocabulary word; tokens that are not in the vocabulary are skipped.
    """
    counts = [0] * len(vocabulary)  # one slot per vocabulary entry, all zero to start
    for token in words.split():
        if token not in vocabulary:
            continue  # unknown words do not contribute
        column = vocabulary[token]
        counts[column] += 1
    return counts

In [86]:
# Encode the test review with the training vocabulary and check the vector length.
test_bow = bow_encoding(test_words, vocabulary)
print(len(test_bow))

5000


In [87]:
# Display the name of the endpoint that is invoked next.
endpoint_name

'xgboost-serverless-ep2021-12-26-19-18-11'

In [88]:
# Serialise the count vector as one CSV line and send it to the endpoint.
csv_payload = ','.join(str(count) for count in test_bow)
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,          # The name of the endpoint we created
    ContentType='text/csv',              # The data format that is expected
    Body=csv_payload.encode('utf-8'),
)

In [89]:
# Read the response body, decode it to text and print it.
# NOTE(review): this rebinds `response` from the invoke_endpoint result to a str,
# so this cell cannot be re-run without re-running the invocation cell first.
response = response['Body'].read().decode('utf-8')
print(response)

0.6226286292076111
