# Training a Word2Vec Model in Parallel Using BlazingText on SageMaker

In [1]:
import sagemaker
from sagemaker import get_execution_role
import boto3
import json
import time

#### Determining the training data location and model output location on S3

In [2]:
# S3 layout: one bucket holding a prefix for training data and a prefix for model output.
bucket = 'medium-text'
train_folder = 'train'
model_folder = 'model'

# Execution role and SageMaker session used by the cells below.
role = get_execution_role()
sess = sagemaker.Session()

# Both S3 URIs share the same "s3://<bucket>/<prefix>" shape.
s3_uri_template = 's3://{}/{}'
s3_train_data = s3_uri_template.format(bucket, train_folder)
s3_output_location = s3_uri_template.format(bucket, model_folder)

#### Get the BlazingText container image for the region where the training data is stored

In [3]:
# Read the region configured on the default boto3 session and show it.
boto_session = boto3.Session()
region_name = boto_session.region_name
print(region_name)

us-east-1


In [4]:
# Resolve the "latest" BlazingText image URI for the region found above.
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    region_name, "blazingtext", "latest"
)
container_message = 'Using SageMaker BlazingText container: {} ({})'.format(container, region_name)
print(container_message)

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


#### Define the training resources to allocate to this training session. Here I chose 4 ml.c4.2xlarge instances for the job.

In [5]:
# Resource and I/O settings for the training job, gathered in one place so
# they are easy to scan and tweak.
training_job_settings = {
    'train_instance_count': 4,               # number of training instances
    'train_instance_type': 'ml.c4.2xlarge',  # instance type for every node
    'train_volume_size': 5,                  # storage volume size (GB per the SageMaker SDK docs)
    'train_max_run': 360000,                 # run-time limit (seconds per the SageMaker SDK docs)
    'input_mode': 'File',
    'output_path': s3_output_location,       # where the model artifacts are written
    'sagemaker_session': sess,
}

# Generic Estimator wrapping the BlazingText container image.
bt_model = sagemaker.estimator.Estimator(container, role, **training_job_settings)

#### Set the hyper-parameters for the model and start running

In [6]:
# Word2Vec hyperparameters, passed to the estimator in one call.
word2vec_hyperparameters = {
    'mode': "batch_skipgram",
    'epochs': 5,
    'min_count': 5,
    'sampling_threshold': 0.0001,
    'learning_rate': 0.05,
    'window_size': 8,
    'vector_dim': 300,
    'negative_samples': 5,
    # = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)
    'batch_size': 17,
    'evaluation': False,
    'subwords': False,
}
bt_model.set_hyperparameters(**word2vec_hyperparameters)

In [7]:
# Describe the training input: every object under the S3 prefix, plain text,
# with the 'FullyReplicated' distribution setting.
train_data = sagemaker.session.s3_input(
    s3_train_data,
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)

# The job reads its data from a single channel named 'train'.
data_channels = dict(train=train_data)

In [8]:
# Take a timestamp, launch the training job (with log streaming enabled),
# then report how long the call took once it returns.
starttime = time.time()
bt_model.fit(inputs=data_channels, logs=True)
elapsed_seconds = time.time() - starttime

print('Processing time {} seconds'.format(elapsed_seconds))

2019-11-28 20:59:00 Starting - Starting the training job...
2019-11-28 20:59:01 Starting - Launching requested ML instances......
2019-11-28 21:00:07 Starting - Preparing the instances for training...
2019-11-28 21:00:59 Downloading - Downloading input data...
2019-11-28 21:01:15 Training - Downloading the training image.[33mArguments: train[0m
[32mArguments: train[0m
[34mArguments: train[0m
[32mFound 10.0.77.119 for host algo-1[0m
[32mFound 10.0.85.218 for host algo-2[0m
[32mFound 10.0.117.112 for host algo-3[0m
[32mFound 10.0.76.183 for host algo-4[0m
[33mFound 10.0.77.119 for host algo-1[0m
[33mFound 10.0.85.218 for host algo-2[0m
[33mFound 10.0.117.112 for host algo-3[0m
[33mFound 10.0.76.183 for host algo-4[0m
[34mFound 10.0.77.119 for host algo-1[0m
[34mFound 10.0.85.218 for host algo-2[0m
[34mFound 10.0.117.112 for host algo-3[0m
[34mFound 10.0.76.183 for host algo-4[0m
[31mArguments: train[0m
[31mFound 10.0.77.119 for host algo-1[0m
[31mFound 