In [1]:
import os
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (skipped by NLTK when it is already up to date).
nltk.download('stopwords')
# English stop words as a set; used below to strip them from every article.
STOPWORDS = set(stopwords.words('english'))

# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2.1.0


In [2]:
# Shared preprocessing / model configuration used by the cells below.
# vocab_size    -> Tokenizer num_words and Embedding input size
# embedding_dim -> Embedding output size (also reused as LSTM / Dense width)
# max_length    -> every article is padded or truncated to this many tokens
vocab_size, embedding_dim, max_length = 5000, 64, 200

# Both truncation and padding are applied at the end of a sequence.
trunc_type = padding_type = "post"

# Placeholder token for words that fall outside the vocabulary.
oov_tok = "<OOV>"

# Fraction of the corpus used for training; the remainder is validation.
training_portion = 0.8

In [3]:
# Read the BBC news CSV: column 0 of each row is the category label and
# column 1 is the article text. Stop words are stripped from the text here,
# before any tokenisation happens.
articles = []
labels = []

# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are handed to the reader untouched.
with open("bbc-text.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        labels.append(row[0])
        article = row[1]
        for word in STOPWORDS:
            # Only stand-alone, space-delimited occurrences are removed.
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            # Collapse double spaces. The previous version replaced a single
            # space with a single space, which did nothing.
            article = article.replace('  ', ' ')
        articles.append(article)
print("Number of Labels:{}".format(len(labels)))
print("Number of Articles:{}".format(len(articles)))

Number of Labels:2225
Number of Articles:2225


In [4]:
# Sequential hold-out split: the first `training_portion` of the rows is used
# for training and the tail for validation (no shuffling is done here).
train_size = int(training_portion * len(articles))

train_articles, validation_articles = articles[:train_size], articles[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

# Sanity check: split point followed by the size of each of the four lists.
print(train_size)
for split in (train_articles, train_labels, validation_articles, validation_labels):
    print(len(split))

1780
1780
1780
445
445


In [5]:
# Build the word index from the training articles only, so the validation
# text does not influence the vocabulary.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

# Show the first ten (word -> id) entries of the index.
first_entries = list(word_index.items())[:10]
dict(first_entries)

{'<OOV>': 1,
 'said': 2,
 'mr': 3,
 'would': 4,
 'year': 5,
 'also': 6,
 'people': 7,
 'new': 8,
 'us': 9,
 'one': 10}

In [6]:
# Encode every training article as a list of integer word ids
# (id 1 is the '<OOV>' placeholder, as shown by the word index above).
train_sequences = tokenizer.texts_to_sequences(train_articles)
# Spot-check one encoded article.
print(train_sequences[10])

[2431, 1, 225, 4995, 22, 641, 587, 225, 4995, 1, 1, 1663, 1, 1, 2431, 22, 565, 1, 1, 140, 278, 1, 140, 278, 796, 823, 662, 2307, 1, 1144, 1694, 1, 1722, 4996, 1, 1, 1, 1, 1, 4738, 1, 1, 122, 4514, 1, 2, 2873, 1507, 352, 4739, 1, 52, 341, 1, 352, 2172, 3962, 41, 22, 3795, 1, 1, 1, 1, 543, 1, 1, 1, 835, 631, 2366, 347, 4740, 1, 365, 22, 1, 787, 2367, 1, 4302, 138, 10, 1, 3665, 682, 3532, 1, 22, 1, 414, 823, 662, 1, 90, 13, 633, 1, 225, 4995, 1, 600, 1, 1694, 1021, 1, 4997, 808, 1865, 117, 1, 1, 1, 2974, 22, 1, 99, 278, 1, 1609, 4998, 543, 493, 1, 1444, 4741, 778, 1320, 1, 1862, 10, 33, 641, 319, 1, 62, 479, 565, 301, 1508, 22, 480, 1, 1, 1666, 1, 797, 1, 3066, 1, 1365, 6, 1, 2431, 565, 22, 2971, 4735, 1, 1, 1, 1, 1, 850, 39, 1826, 675, 297, 26, 979, 1, 882, 22, 361, 22, 13, 301, 1508, 1343, 374, 20, 63, 883, 1096, 4303, 247]


In [7]:
# Bring every training sequence to exactly `max_length` ids.
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Compare raw vs. padded length for a few sample articles.
for sample_idx in (0, 1, 10):
    print(len(train_sequences[sample_idx]))
    print(len(train_padded[sample_idx]))

426
200
192
200
186
200


In [8]:
# Same article as printed above, after padding: trailing zeros fill it up to max_length.
print(train_padded[10])

[2431    1  225 4995   22  641  587  225 4995    1    1 1663    1    1
 2431   22  565    1    1  140  278    1  140  278  796  823  662 2307
    1 1144 1694    1 1722 4996    1    1    1    1    1 4738    1    1
  122 4514    1    2 2873 1507  352 4739    1   52  341    1  352 2172
 3962   41   22 3795    1    1    1    1  543    1    1    1  835  631
 2366  347 4740    1  365   22    1  787 2367    1 4302  138   10    1
 3665  682 3532    1   22    1  414  823  662    1   90   13  633    1
  225 4995    1  600    1 1694 1021    1 4997  808 1865  117    1    1
    1 2974   22    1   99  278    1 1609 4998  543  493    1 1444 4741
  778 1320    1 1862   10   33  641  319    1   62  479  565  301 1508
   22  480    1    1 1666    1  797    1 3066    1 1365    6    1 2431
  565   22 2971 4735    1    1    1    1    1  850   39 1826  675  297
   26  979    1  882   22  361   22   13  301 1508 1343  374   20   63
  883 1096 4303  247    0    0    0    0    0    0    0    0    0    0
    0 

In [9]:
# Encode and pad the validation articles with the tokenizer that was fitted
# on the training split, using the same padding settings.
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(
    validation_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Number of validation articles and the shape of the padded matrix.
print(len(validation_sequences))
print(validation_padded.shape)

445
(445, 200)


In [10]:
# Turn the category names into integer ids with a second Tokenizer fitted on
# all labels. Each label becomes a one-element sequence, so the resulting
# arrays have one column.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

# Show the first three encoded labels and the array shape for each split.
for label_seq in (training_label_seq, validation_label_seq):
    for encoded_label in label_seq[:3]:
        print(encoded_label)
    print(label_seq.shape)

[4]
[2]
[1]
(1780, 1)
[5]
[4]
[3]
(445, 1)


In [11]:
# Inverse of `word_index`: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_article(text):
    """Translate a sequence of word ids back into a space-separated string.

    Ids that have no entry in `reverse_word_index` (for example the padding
    value 0) are rendered as '?'.
    """
    words = []
    for word_id in text:
        words.append(reverse_word_index.get(word_id, '?'))
    return ' '.join(words)

# Decoded padded sequence vs. the stop-word-stripped article it came from.
print(decode_article(train_padded[10]))
print('---')
print(train_articles[10])

berlin <OOV> anti nazi film german movie anti nazi <OOV> <OOV> drawn <OOV> <OOV> berlin film festival <OOV> <OOV> final days <OOV> final days member white rose movement <OOV> 21 arrested <OOV> brother hans <OOV> <OOV> <OOV> <OOV> <OOV> tyranny <OOV> <OOV> director marc <OOV> said feeling responsibility keep legacy <OOV> going must <OOV> keep ideas alive added film drew <OOV> <OOV> <OOV> <OOV> trial <OOV> <OOV> <OOV> east germany secret police discovery <OOV> behind film <OOV> worked closely <OOV> relatives including one <OOV> sisters ensure historical <OOV> film <OOV> members white rose <OOV> group first started <OOV> anti nazi <OOV> summer <OOV> arrested dropped <OOV> munich university calling day <OOV> <OOV> <OOV> regime film <OOV> six days <OOV> arrest intense trial saw <OOV> initially deny charges ended <OOV> appearance one three german films <OOV> top prize festival south african film version <OOV> <OOV> opera <OOV> shot <OOV> town <OOV> language also <OOV> berlin festival film en

## SageMaker

In [12]:
import sagemaker

session = sagemaker.Session()

# Local staging folders under the current working directory.
cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
train_dir = os.path.join(cwd, 'data/train')
test_dir = os.path.join(cwd, 'data/test')
raw_dir = os.path.join(cwd, 'data/raw')

# Create each folder; existing ones are left as they are.
for folder in (data_dir, train_dir, test_dir, raw_dir):
    os.makedirs(folder, exist_ok=True)

In [13]:
from sagemaker import get_execution_role

# Execution role (an IAM role ARN, see the output below) that SageMaker uses for this notebook.
role = get_execution_role()
# NOTE(review): the printed ARN contains the AWS account id; consider clearing
# this output before sharing the notebook.
print(role)

arn:aws:iam::224566132838:role/service-role/AmazonSageMaker-ExecutionRole-20200310T225046


In [14]:
import numpy as np

# Write features and labels as .npy files into the local train/test folders.
for folder, stem, array in (
    (train_dir, 'x_train', train_padded),
    (test_dir, 'x_test', validation_padded),
    (train_dir, 'y_train', training_label_seq),
    (test_dir, 'y_test', validation_label_seq),
):
    np.save(os.path.join(folder, stem), array)

# Upload both folders to S3 under a common key prefix.
s3_prefix = 'tf-2-workflow'
train_s3_prefix = '{}/data/train'.format(s3_prefix)
test_s3_prefix = '{}/data/test'.format(s3_prefix)

train_s3 = session.upload_data(key_prefix=train_s3_prefix, path='./data/train/')
test_s3 = session.upload_data(key_prefix=test_s3_prefix, path='./data/test/')

# S3 URIs of the uploaded data.
print(train_s3)
print(test_s3)

s3://sagemaker-eu-west-1-224566132838/tf-2-workflow/data/train
s3://sagemaker-eu-west-1-224566132838/tf-2-workflow/data/test


In [15]:
# Channel name -> S3 URI mapping passed to estimator.fit() for the hosted training job.
inputs = {'train':train_s3, 'test': test_s3}

In [16]:
# Upload the local ./train_model directory to S3 under the shared prefix.
# NOTE(review): train_model_s3 is not referenced anywhere else in this notebook;
# the estimators below point at the local 'train_model/' source_dir instead.
train_model_s3_prefix = '{}/train_model'.format(s3_prefix)
train_model_s3 = session.upload_data(path='./train_model', key_prefix=train_model_s3_prefix)

In [17]:
from sagemaker.tensorflow import TensorFlow

# Instance type for the hosted (remote) training job.
train_instance_type = 'ml.p2.xlarge'

# Hyperparameters forwarded to train.py. vocab_size and embedding_dim reuse
# the preprocessing constants so they cannot drift from the tokenizer
# settings used to build the training data.
# NOTE(review): how train.py consumes these values is not visible here; verify.
hyperparameters = {'vocab_size': vocab_size,
                   'embedding_dim': embedding_dim,
                   'epochs': 15,
                   'batch_size': 128,
                   'learning_rate': 0.01
                   }

# Script-mode TensorFlow estimator that runs train_model/train.py.
estimator = TensorFlow(
    source_dir='train_model/',
    entry_point='train.py',
    train_instance_type=train_instance_type,
    train_instance_count=1,
    hyperparameters=hyperparameters,
    role=role,  # execution role looked up earlier in the notebook
    base_job_name='tf-2-workflow',
    framework_version='2.1',
    py_version='py3',
    script_mode=True,
)

In [18]:
# Launch the hosted training job with the S3 'train' and 'test' channels; the job log streams below.
estimator.fit(inputs)

2020-04-19 23:39:25 Starting - Starting the training job...
2020-04-19 23:39:26 Starting - Launching requested ML instances...
2020-04-19 23:40:25 Starting - Preparing the instances for training.........
2020-04-19 23:41:39 Downloading - Downloading input data...
2020-04-19 23:42:04 Training - Downloading the training image......
2020-04-19 23:43:13 Training - Training image download completed. Training in progress..[34m2020-04-19 23:43:17,684 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-04-19 23:43:18,110 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "ba

In [19]:
# Deploy the trained model to an endpoint backed by one ml.c5.xlarge instance.
# NOTE(review): the recorded run of this cell failed with "Could not find model
# data at .../output/model.tar.gz". Presumably train.py did not save a model
# artifact for the job to package; verify against train.py.
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c5.xlarge')

ClientError: An error occurred (ValidationException) when calling the CreateModel operation: Could not find model data at s3://sagemaker-eu-west-1-224566132838/tf-2-workflow-2020-04-19-23-39-24-623/output/model.tar.gz.

## Local

In [20]:
# Download the local-mode setup script and Docker daemon config from the
# aws-samples repository, then run the setup script.
# NOTE(review): both files are fetched from the moving `master` branch and the
# script is executed directly; consider pinning to a specific commit.
!wget -q https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-script-mode/master/local_mode_setup.sh
!wget -q https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-script-mode/master/daemon.json    
!/bin/bash ./local_mode_setup.sh

SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!


In [29]:
from sagemaker.tensorflow import TensorFlow

# Local-mode settings use their own names so the globals configured for the
# hosted estimator (train_instance_type, hyperparameters) are not overwritten
# when this cell runs.
local_instance_type = 'local'

# Short run (2 epochs) for a quick local smoke test. vocab_size and
# embedding_dim reuse the preprocessing constants so they stay in sync with
# the tokenizer settings.
local_hyperparameters = {'vocab_size': vocab_size,
                         'embedding_dim': embedding_dim,
                         'epochs': 2,
                         'batch_size': 128,
                         'learning_rate': 0.01
                         }

# Same training script and container settings as the hosted estimator, but
# executed on this machine.
local_estimator = TensorFlow(
    source_dir='train_model/',
    entry_point='train.py',
    train_instance_type=local_instance_type,
    train_instance_count=1,
    hyperparameters=local_hyperparameters,
    role=sagemaker.get_execution_role(),
    base_job_name='tf-2-workflow',
    framework_version='2.1',
    py_version='py3',
    script_mode=True,
)

In [30]:
# Local mode reads the channels straight from disk. A separate name is used
# so the S3 `inputs` mapping for the hosted job is not overwritten; otherwise
# re-running the hosted fit cell would pass file:// paths to a remote job.
local_inputs = {'train': f'file://{train_dir}',
                'test': f'file://{test_dir}'}

# NOTE(review): the recorded run of this cell ended with a docker-compose
# RuntimeError (exit code 1); the cause is not visible in the truncated log.
local_estimator.fit(local_inputs)

Creating tmp___aaaor_algo-1-val01_1 ... 
[1BAttaching to tmp___aaaor_algo-1-val01_12mdone[0m
[36malgo-1-val01_1  |[0m 2020-04-20 00:05:33,054 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training
[36malgo-1-val01_1  |[0m 2020-04-20 00:05:33,062 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-val01_1  |[0m 2020-04-20 00:05:33,262 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-val01_1  |[0m 2020-04-20 00:05:33,282 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-val01_1  |[0m 2020-04-20 00:05:33,302 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-val01_1  |[0m 2020-04-20 00:05:33,314 sagemaker-containers INFO     Invoking user script
[36malgo-1-val01_1  |[0m 
[36malgo-1-val01_1  |[0m Training Env:
[36malgo-1-val01_1  |[0m 
[36malgo-1-val01_1  |[0m {
[36malgo-1-val01_1  |[0

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmp___aaaor/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1

In [None]:
# Serve the locally trained model from a local-mode endpoint (instance_type='local').
local_predictor = local_estimator.deploy(initial_instance_count=1,instance_type='local')

In [None]:
# Request predictions for the first ten validation articles. `x_test` was
# never defined in this notebook (it only exists as the saved x_test.npy
# file); the matching in-memory array is `validation_padded`.
local_results = local_predictor.predict(validation_padded[:10])['predictions']

## Model

In [None]:
# Bidirectional-LSTM text classifier, defined in the notebook.
keras_layers = tf.keras.layers

model = tf.keras.Sequential([
    # Map each word id (vocabulary of `vocab_size`) to an `embedding_dim`-sized vector.
    keras_layers.Embedding(vocab_size, embedding_dim),
    # Read the embedded sequence in both directions with an LSTM of width `embedding_dim`.
    keras_layers.Bidirectional(keras_layers.LSTM(embedding_dim)),
    # Fully connected hidden layer with ReLU activation.
    keras_layers.Dense(embedding_dim, activation='relu'),
    # Six softmax outputs forming a probability distribution over label ids.
    # NOTE(review): the label ids printed earlier are 1-based, so output 0 is
    # presumably never a target; confirm against label_tokenizer.word_index.
    keras_layers.Dense(6, activation='softmax'),
])
model.summary()

In [None]:
# List the distinct category names present in the dataset.
print(set(labels))

In [None]:
# Train the in-notebook model on the padded training data and evaluate it on
# the validation split after every epoch.
num_epochs = 10

# Sparse categorical cross-entropy is used because the labels are integer ids.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    train_padded,
    training_label_seq,
    validation_data=(validation_padded, validation_label_seq),
    epochs=num_epochs,
    verbose=2,
)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric per epoch, with its validation counterpart if present.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (here, the return value of ``model.fit``).
        string: metric name to plot, e.g. "accuracy" or "loss". The matching
            validation series is looked up as ``'val_' + string``.
    """
    # pyplot is imported here because no other cell in the notebook imports it;
    # without this the function raised NameError on `plt`.
    import matplotlib.pyplot as plt

    val_key = 'val_' + string
    legend_entries = [string]

    plt.plot(history.history[string])
    # The validation curve only exists when fit() was given validation data.
    if val_key in history.history:
        plt.plot(history.history[val_key])
        legend_entries.append(val_key)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(legend_entries)
    plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")