In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker
from sagemaker import get_execution_role, image_uris
import numpy as np                                
import pandas as pd                                                          
from time import gmtime, strftime                  


In [2]:
# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/blazingtext-movie'
my_region = boto3.session.Session().region_name # set the region of the instance


In [3]:
bucket_name = 'sagemaker-movie-0' #Need unique bucket name
s3 = boto3.resource('s3')
try:
    if  my_region == 'us-east-1':
      s3.create_bucket(Bucket=bucket_name)
    else: 
      s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [4]:
# Load the data
!wget https://github.com/hoarika727/movie-review-sentiment-classification/raw/main/train.csv
!wget https://github.com/hoarika727/movie-review-sentiment-classification/raw/main/test.csv

--2020-12-07 22:48:35--  https://github.com/hoarika727/movie-review-sentiment-classification/raw/main/train.csv
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/hoarika727/movie-review-sentiment-classification/main/train.csv [following]
--2020-12-07 22:48:35--  https://raw.githubusercontent.com/hoarika727/movie-review-sentiment-classification/main/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.200.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.200.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32251469 (31M) [text/plain]
Saving to: ‘train.csv’


2020-12-07 22:48:35 (225 MB/s) - ‘train.csv’ saved [32251469/32251469]

--2020-12-07 22:48:35--  https://github.com/hoarika727/movie-review-sentiment-classification/raw/main/test.cs

In [5]:
sess = sagemaker.Session()

sess.upload_data(path='train.csv', bucket=bucket_name)
sess.upload_data(path='test.csv', bucket=bucket_name)

's3://sagemaker-movie-0/data/test.csv'

In [6]:
try:
    train_data_key = 'train.csv'
    train_data_location = 's3://{}/data/{}'.format(bucket_name, train_data_key)

    train = pd.read_csv(train_data_location, index_col=0)

    test_data_key = 'test.csv'
    test_data_location = 's3://{}/data/{}'.format(bucket_name, test_data_key)

    test = pd.read_csv(test_data_location, index_col=0)
    print('Success: Train & test data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: Train & test data loaded into dataframe.


In [8]:
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def transform_instance(row):
    cur_row = []
    label = "__label__" + row[1]  #Prefix the index-ed label with __label__
    cur_row.append(label)
    stop_words = stopwords.words('english')
    stop_words.append('br')
    review_tokens = nltk.word_tokenize(row[0].lower())
    review_tokens = [x for x in review_tokens if x not in stop_words]
    cur_row.extend(review_tokens)
    return cur_row

In [10]:
def preprocess(dataframe, output_file):
    all_rows = []
    for row in range(len(dataframe)):
        all_rows.append(list(dataframe.iloc[row].astype(str)))
    pool = Pool(processes=multiprocessing.cpu_count())
    transformed_rows = pool.map(transform_instance, all_rows)
    pool.close() 
    pool.join()
    
    with open(output_file, 'w') as csvoutfile:
        csv_writer = csv.writer(csvoutfile, delimiter=' ', lineterminator='\n')
        csv_writer.writerows(transformed_rows)

In [11]:
%%time

preprocess(train, 'movie.train')
preprocess(test, 'movie.test')

CPU times: user 13.5 s, sys: 412 ms, total: 13.9 s
Wall time: 1min 22s


In [12]:
%%time
sess = sagemaker.Session()
train_channel = prefix + '/train'
test_channel = prefix + '/test'

sess.upload_data(path='movie.train', bucket=bucket_name, key_prefix=train_channel)
sess.upload_data(path='movie.test', bucket=bucket_name, key_prefix=test_channel)

s3_train_data = 's3://{}/{}'.format(bucket_name, train_channel)
s3_test_data = 's3://{}/{}'.format(bucket_name, test_channel)

s3_output_location = 's3://{}/{}/output'.format(bucket_name, prefix)

CPU times: user 415 ms, sys: 195 ms, total: 609 ms
Wall time: 969 ms


In [13]:
container = sagemaker.image_uris.retrieve("blazingtext",my_region)
print('Using SageMaker BlazingText container: {} ({})'.format(container, my_region))

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:1 (us-east-1)


In [14]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         instance_count=1, 
                                         instance_type='ml.c4.4xlarge',
                                         volume_size = 30,
                                         max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

In [15]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

In [16]:
train_data = sagemaker.inputs.TrainingInput(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
test_data = sagemaker.inputs.TrainingInput(s3_test_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': test_data}

In [17]:
%%time
bt_model.fit(inputs=data_channels, logs=True)

2020-12-07 22:52:33 Starting - Starting the training job...
2020-12-07 22:52:36 Starting - Launching requested ML instances......
2020-12-07 22:53:48 Starting - Preparing the instances for training......
2020-12-07 22:54:55 Downloading - Downloading input data
2020-12-07 22:54:55 Training - Downloading the training image..[34mArguments: train[0m
[34m[12/07/2020 22:55:12 INFO 140139262043520] nvidia-smi took: 0.025349855422973633 secs to identify 0 gpus[0m
[34m[12/07/2020 22:55:12 INFO 140139262043520] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[12/07/2020 22:55:12 INFO 140139262043520] Processing /opt/ml/input/data/train/movie.train . File size: 20.485380172729492 MB[0m
[34m[12/07/2020 22:55:12 INFO 140139262043520] Processing /opt/ml/input/data/validation/movie.test . File size: 19.468878746032715 MB[0m
[34mRead 3M words[0m
[34mNumber of words:  47937[0m
[34mLoading validation data

In [18]:
%%time
text_classifier = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')

---------------!CPU times: user 248 ms, sys: 22.6 ms, total: 271 ms
Wall time: 7min 32s


In [19]:
#text_classifier = sagemaker.predictor.Predictor(endpoint_name='blazingtext-2020-12-07-01-04-09-590')

In [21]:
s3cli = boto3.client('sagemaker-runtime')

sentences = [test['reviews'][0],
            test['reviews'][1]]

# tokenize the data same as training 
stop_words = stopwords.words('english')
tokenized_sentences = [' '.join([x for x in nltk.word_tokenize(sent) if x not in stop_words]) for sent in sentences]

payload = {"instances" : tokenized_sentences}

response = s3cli.invoke_endpoint(EndpointName=text_classifier.endpoint_name,
                               ContentType='application/json',
                               Body=json.dumps(payload))

predictions = json.loads(response["Body"].read().decode())
print(predictions)

[{'label': ['__label__0'], 'prob': [0.9262986779212952]}, {'label': ['__label__1'], 'prob': [0.9644984006881714]}]


In [22]:
# Check the true label of the predicted reviews - matched result
print(test.iloc[0])
print(test.iloc[1])

reviews    please give this one a miss br br kristy swans...
labels                                                     0
Name: 0, dtype: object
reviews    this film requires a lot of patience because i...
labels                                                     1
Name: 1, dtype: object


In [23]:
text_classifier.delete_endpoint()

#  AWS Sagemaker ML Model 2 - Supervised Model

In [26]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=500) #Limited to 500 features due to memory limit
train_reviews = vectorizer.fit_transform(train['reviews'])
train_reviews = np.array([t.tolist() for t in train_reviews.toarray()]).astype("float32")
x_train = pd.concat([train['labels'],pd.DataFrame(train_reviews),],axis=1)
x_train.to_csv('train_tfidf.csv', index=False, header=False)
print('train data in form of TF-IDF is generated')

test_reviews = vectorizer.transform(test['reviews'])
test_reviews = np.array([t.tolist() for t in test_reviews.toarray()]).astype("float32")
x_test = pd.concat([test['labels'],pd.DataFrame(test_reviews),],axis=1)
x_test.to_csv('test_tfidf.csv', index=False, header=False)
print('test data in form of TF-IDF is generated')


train data in form of TF-IDF is generated
test data in form of TF-IDF is generated
CPU times: user 26.5 s, sys: 614 ms, total: 27.1 s
Wall time: 27.3 s


In [27]:
prefix_1 = 'sagemaker/supervised-movie'
train_channel_1 = prefix_1 + '/tfidf_train'
test_channel_1 = prefix_1 + '/tfidf_test'

sess.upload_data(path='train_tfidf.csv', bucket=bucket_name, key_prefix=train_channel_1)
sess.upload_data(path='train_tfidf.csv', bucket=bucket_name, key_prefix=test_channel_1)

s3_train_data_1 = 's3://{}/{}'.format(bucket_name, train_channel_1)
s3_test_data_1 = 's3://{}/{}'.format(bucket_name, test_channel_1)

s3_output_location_1 = 's3://{}/{}/output'.format(bucket_name, prefix_1)


In [28]:
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/tfidf_train'.format(bucket_name, prefix_1), content_type='text/csv')
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/tfidf_test'.format(bucket_name, prefix_1), content_type='text/csv')

sess = sagemaker.Session()
s3_output_location_1 = 's3://{}/{}/output'.format(bucket_name, prefix_1)
container_1 = sagemaker.image_uris.retrieve("linear-learner",my_region)
linearlr = sagemaker.estimator.Estimator(container_1,role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path=s3_output_location_1,
                                    sagemaker_session=sess)
linearlr.set_hyperparameters(feature_dim=500, 
                             predictor_type="binary_classifier",
                             mini_batch_size=200)


In [29]:
%%time
linearlr.fit({'train': s3_input_train, 'validation':s3_input_test})

2020-12-07 23:33:48 Starting - Starting the training job...
2020-12-07 23:33:50 Starting - Launching requested ML instances.........
2020-12-07 23:35:22 Starting - Preparing the instances for training......
2020-12-07 23:36:20 Downloading - Downloading input data...
2020-12-07 23:37:11 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m

2020-12-07 23:37:32 Training - Training image download completed. Training in progress.[34m[12/07/2020 23:37:37 INFO 140242406188864] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'inf

In [30]:
%%time
linearlr_predictor = linearlr.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')


-------------------!CPU times: user 300 ms, sys: 14.1 ms, total: 315 ms
Wall time: 9min 32s


In [65]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

s3cli = boto3.client('sagemaker-runtime')

test_samples = [x_test.iloc[0][1:],
                x_test.iloc[1][1:],
                x_test.iloc[2][1:]]

linearlr_predictor.serializer = CSVSerializer() # set the serializer type
linearlr_predictor.deserializer = JSONDeserializer()
predictions = linearlr_predictor.predict(test_samples) # predict!
print(predictions['predictions'])


[{'score': 0.5129913091659546, 'predicted_label': 1}, {'score': 0.9174268245697021, 'predicted_label': 1}, {'score': 0.5274257659912109, 'predicted_label': 1}]


In [66]:
print(test.iloc[0])
print(test.iloc[1])
print(test.iloc[2])

reviews    please give this one a miss br br kristy swans...
labels                                                     0
Name: 0, dtype: object
reviews    this film requires a lot of patience because i...
labels                                                     1
Name: 1, dtype: object
reviews    many animation buffs consider wladyslaw starew...
labels                                                     1
Name: 2, dtype: object


In [67]:
linearlr_predictor.delete_endpoint()

bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '6BE55E814812E4B1',
   'HostId': 'v65qySEffA0u+OM3UNbAG5U+4MhAezM+sOctD9BYtj7LyYR8/2KPImNISEE3weYHf6j/Q/HnUwg=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'v65qySEffA0u+OM3UNbAG5U+4MhAezM+sOctD9BYtj7LyYR8/2KPImNISEE3weYHf6j/Q/HnUwg=',
    'x-amz-request-id': '6BE55E814812E4B1',
    'date': 'Tue, 08 Dec 2020 00:51:14 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/blazingtext-movie/test/movie.test'},
   {'Key': 'sagemaker/supervised-movie/tfidf_test/train_tfidf.csv'},
   {'Key': 'sagemaker/blazingtext-movie/output/blazingtext-2020-12-07-22-52-33-644/output/model.tar.gz'},
   {'Key': 'data/train.csv'},
   {'Key': 'data/test.csv'},
   {'Key': 'sagemaker/blazingtext-movie/train/movie.train'},
   {'Key': 'sagemaker/supervised-movie/output/linear-learner-2020-12-07-23-33-48-514/output/model.tar.gz'