# Creating a Sentiment Analysis Web App Using PyTorch and SageMaker

## Read train and test data from cache file

In [13]:
import pickle

# Directory holding the pickled, preprocessed review data; it is also the
# default `cache_dir` argument of preprocess_data below.
cache_dir = os.path.join("../cache", "sentiment_analysis")  
# Create the directory if needed; exist_ok=True makes re-running this cell safe.
os.makedirs(cache_dir, exist_ok=True)  

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Load the preprocessed train/test reviews and labels from the pickle cache.

    This version of the function only reads from the cache; it never
    recomputes anything from the raw inputs.

    Args:
        data_train, data_test, labels_train, labels_test: Accepted to keep the
            call signature stable, but not read by this function.
        cache_dir: Directory that contains the cache file.
        cache_file: Name of the pickle file inside ``cache_dir``.

    Returns:
        Tuple ``(words_train, words_test, labels_train, labels_test)`` taken
        from the cached dictionary under the keys of the same names.

    Raises:
        RuntimeError: If ``cache_file`` is None or the cache file cannot be
            read/unpickled. (Previously this surfaced as an opaque
            ``TypeError`` when subscripting ``None``.)
        KeyError: If the cached dictionary lacks one of the expected keys.
    """
    if cache_file is None:
        raise RuntimeError(
            "preprocess_data only loads from the cache; cache_file must not be None")

    cache_path = os.path.join(cache_dir, cache_file)
    try:
        # NOTE: pickle.load can execute arbitrary code while unpickling; only
        # load cache files that this project produced itself.
        with open(cache_path, "rb") as f:
            cache_data = pickle.load(f)
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        print("Unable to read from cache")
        raise RuntimeError(
            "Could not load preprocessed data from {!r}".format(cache_path)) from err
    print("Read preprocessed data from cache file:", cache_file)

    return (cache_data['words_train'], cache_data['words_test'],
            cache_data['labels_train'], cache_data['labels_test'])

In [14]:
# Rebind the four split variables to whatever preprocess_data returns.
# NOTE(review): train_X/test_X/train_y/test_y must already be defined by a cell
# that is not shown here; this cell overwrites them in place.
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file: preprocessed_data.pkl


## Create a word dictionary of the most frequent words (vocabulary size 5000)

In [15]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Map the most frequent words in ``data`` to integer ids.

    Args:
        data: Iterable of sentences, each an iterable of hashable words.
        vocab_size: Total vocabulary size. Ids start at 2, so at most
            ``vocab_size - 2`` words are kept; ids 0 and 1 are left free
            (convert_and_pad uses them for "no word" and "infrequent word").

    Returns:
        dict mapping word -> id, with id 2 for the most frequent word, 3 for
        the next, and so on. Words with equal counts keep the order in which
        they first appear in ``data``.
    """
    # Kept as a function-scope import, as in the original cell.
    from collections import Counter

    word_count = Counter(word for sentence in data for word in sentence)

    # Clamp at zero: a negative slice bound would otherwise keep almost every
    # word when vocab_size < 2 instead of keeping none.
    n_words = max(vocab_size - 2, 0)

    # most_common() sorts by count, descending, and is stable for ties.
    ranked = word_count.most_common()[:n_words]

    return {word: idx + 2 for idx, (word, _) in enumerate(ranked)}

In [16]:
# Build the vocabulary from the training reviews only, using build_dict's
# default vocab_size.
word_dict = build_dict(train_X)

In [17]:
# Local folder for the files that are later uploaded to S3.
data_dir = '../data/pytorch'
# exist_ok=True replaces the separate exists() check: it is free of the
# check-then-create race and matches how cache_dir is created above.
os.makedirs(data_dir, exist_ok=True)

In [18]:
# Persist the vocabulary as a pickle inside data_dir.
# NOTE(review): presumably the training/serving code reloads it to encode
# reviews the same way — confirm against train/ and serve/.
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

## Transform the reviews into padded sequences of word indices

In [19]:
def convert_and_pad(word_dict, sentence, pad=500):
    """Encode one sentence as a fixed-length list of word ids.

    Args:
        word_dict: Mapping from word to integer id.
        sentence: Sequence of words.
        pad: Length of the returned list; longer sentences are truncated.

    Returns:
        Tuple ``(ids, length)`` where ``ids`` has exactly ``pad`` entries
        (0 = no word at that position, 1 = word not in ``word_dict``) and
        ``length`` is ``min(len(sentence), pad)``.
    """
    PAD_ID = 0      # position beyond the end of the sentence
    UNKNOWN_ID = 1  # word missing from word_dict

    encoded = [PAD_ID] * pad

    # Only the first `pad` words fit; overwrite the padding slot by slot.
    for position, token in enumerate(sentence[:pad]):
        encoded[position] = word_dict[token] if token in word_dict else UNKNOWN_ID

    used_length = min(len(sentence), pad)
    return encoded, used_length

def convert_and_pad_data(word_dict, data, pad=500):
    """Encode every sentence in ``data`` with convert_and_pad.

    Args:
        word_dict: Mapping from word to integer id.
        data: Iterable of sentences (sequences of words).
        pad: Fixed length passed through to convert_and_pad.

    Returns:
        Tuple of two numpy arrays: the encoded sentences and, in the same
        order, the length convert_and_pad reported for each one.
    """
    # One (ids, length) pair per sentence, in input order.
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]

    rows = [ids for ids, _ in encoded_pairs]
    sizes = [length for _, length in encoded_pairs]

    return np.array(rows), np.array(sizes)

In [20]:
# Encode both splits with the vocabulary built from the training data, using
# the default pad length of convert_and_pad_data.
# NOTE(review): train_X and test_X are rebound from word lists to id arrays, so
# re-running this cell without re-running the cells above would encode
# already-encoded data.
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)

## Upload the training data to S3

In [21]:
import pandas as pd
    
# Write the training set to data_dir/train.csv with no header and no index.
# Column order per row: label, review length, then the padded word ids.
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [22]:
import sagemaker

# Session object used below for uploading data and creating the predictor.
sagemaker_session = sagemaker.Session()

# Upload destination: the session's default bucket, under a fixed key prefix.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/sentiment_rnn'

# Execution role that is passed to the estimator and the deployed model.
role = sagemaker.get_execution_role()

In [23]:
# Upload the contents of data_dir to the bucket under `prefix`; the returned
# value is later handed to estimator.fit as the 'training' channel.
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

## Build and Train the PyTorch Model

In [28]:
from sagemaker.pytorch import PyTorch

# Configure the training job: train/train.py runs on a single ml.p2.xlarge
# instance with the PyTorch 0.4.0 framework image.
# NOTE(review): train_instance_count / train_instance_type are the SageMaker
# Python SDK v1 argument names; SDK v2 renamed them — confirm the installed
# SDK version before upgrading.
estimator = PyTorch(entry_point="train.py",
                    source_dir="train",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    hyperparameters={
                        # Passed through to the training script.
                        'epochs': 10,
                        'hidden_dim': 200,
                    })

In [29]:
# Launch the training job, exposing the uploaded data as the 'training' channel.
estimator.fit({'training': input_data})

2020-04-19 18:53:36 Starting - Starting the training job...
2020-04-19 18:53:38 Starting - Launching requested ML instances......
2020-04-19 18:54:40 Starting - Preparing the instances for training......
2020-04-19 18:55:56 Downloading - Downloading input data...
2020-04-19 18:56:32 Training - Downloading the training image...
2020-04-19 18:57:03 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-04-19 18:57:03,679 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-04-19 18:57:03,703 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-04-19 18:57:03,706 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-04-19 18:57:03,955 sagemaker-containers INFO     Module train does not provide a setup.py. [

[34mModel loaded with embedding_dim 32, hidden_dim 200, vocab_size 5000.[0m
[34mEpoch: 1, BCELoss: 0.6690507324374452[0m
[34mEpoch: 2, BCELoss: 0.6041979826226527[0m
[34mEpoch: 3, BCELoss: 0.5103505801181404[0m
[34mEpoch: 4, BCELoss: 0.42610825263724034[0m
[34mEpoch: 5, BCELoss: 0.3892929365440291[0m
[34mEpoch: 6, BCELoss: 0.3622063635563364[0m
[34mEpoch: 7, BCELoss: 0.34563010626909685[0m
[34mEpoch: 8, BCELoss: 0.304587474282907[0m
[34mEpoch: 9, BCELoss: 0.2882863696740598[0m
[34mEpoch: 10, BCELoss: 0.27674087638757666[0m
[34m2020-04-19 19:00:22,202 sagemaker-containers INFO     Reporting training SUCCESS[0m

2020-04-19 19:00:32 Uploading - Uploading generated training model
2020-04-19 19:00:32 Completed - Training job completed
Training seconds: 276
Billable seconds: 276


## Deploy the model for the web app

In [30]:
from sagemaker.predictor import RealTimePredictor
from sagemaker.pytorch import PyTorchModel

class StringPredictor(RealTimePredictor):
    """RealTimePredictor that sends its requests with content type 'text/plain'."""

    def __init__(self, endpoint_name, sagemaker_session):
        # Fix the content type here so callers only supply the endpoint name
        # and the session.
        super().__init__(endpoint_name, sagemaker_session, content_type='text/plain')

# Wrap the trained artifacts (estimator.model_data) in a model whose inference
# code is serve/predict.py, and have deploy() return a StringPredictor.
model = PyTorchModel(model_data=estimator.model_data,
                     role = role,
                     framework_version='0.4.0',
                     entry_point='predict.py',
                     source_dir='serve',
                     predictor_cls=StringPredictor)
# Deploy the model to an endpoint backed by one ml.m4.xlarge instance.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

--------------------------------------------------------------------------------------------------------!

In [32]:
# Display the name of the deployed endpoint.
predictor.endpoint

'sagemaker-pytorch-2020-04-19-19-10-21-103'

### Delete the endpoint

In [33]:
# Tear down the endpoint created by model.deploy above.
# NOTE(review): presumably done so the instance stops accruing charges — the
# predictor is unusable after this call.
predictor.delete_endpoint()