In [None]:
# Sentiment analyzer (5-class). 0 = strongly negative, 1 = negative, 2 = neutral, 3 = positive, 4 = strongly positive

%cd /home/infili/translation/DimPapSandbox/realworldnlp

In [9]:
# Loading imports

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.training.trainer import Trainer

from examples.sentiment.sst_classifier import LstmClassifier
from realworldnlp.predictors import SentenceClassifierPredictor


In [4]:
# Model definition and training

# Hidden-state size of the LSTM encoder built further down in this cell.
HIDDEN_DIM = 512
# GPU index handed to the AllenNLP Trainer. Fall back to -1 (CPU) when no GPU
# is visible, so the cell also runs on CPU-only machines instead of failing
# when the Trainer is asked for a device that does not exist.
CUDA_DEVICE = 0 if torch.cuda.is_available() else -1


# In order to use ELMo, each word in a sentence needs to be indexed with
# an array of character IDs.
elmo_token_indexer = ELMoTokenCharactersIndexer()
reader = StanfordSentimentTreeBankDatasetReader(
    token_indexers={'tokens': elmo_token_indexer})

# Train / dev splits of the Stanford Sentiment Treebank (fetched over HTTP).
train_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt')
dev_dataset = reader.read('https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt')

# Build the ELMo token embedder from pre-trained option/weight files.
# The first run is slow because the files have to be fetched.
#
# Exactly one (options_file, weight_file) pair must be active; the other
# pairs are kept commented out as ready-to-use alternatives.
# NOTE(review): the alternatives presumably produce a different embedding
# size than the "Original" model — confirm before switching.

# --- Original (active) ---
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# --- Medium ---
# options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
# weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"

# --- Small ---
# options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
#                '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
# weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
#               '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

elmo_embedder = ElmoTokenEmbedder(options_file=options_file,
                                  weight_file=weight_file)

# Vocabulary built from the train and dev instances together; min_count is
# applied to the 'tokens' namespace.
vocab = Vocabulary.from_instances(train_dataset + dev_dataset,
                                  min_count={'tokens': 3})

# Pass in the ElmoTokenEmbedder instance instead
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

# Ask the embedder for its output size instead of hard-coding 1024, so the
# LSTM input size stays correct if a different ELMo model is selected.
elmo_embedding_dim = word_embeddings.get_output_dim()
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, lstm, vocab)

# Put the model on the same device the Trainer is told to use, and do it
# before the optimizer is created (the order recommended by torch.optim).
use_gpu = CUDA_DEVICE >= 0 and torch.cuda.is_available()
device = torch.device(f"cuda:{CUDA_DEVICE}" if use_gpu else "cpu")
model.to(device)

optimizer = optim.AdamW(model.parameters())

# Batches of 32 instances, bucketed by the number of tokens.
iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

iterator.index_with(vocab)

# patience=1: training stops early once validation stops improving
# (see the Trainer documentation for the exact criterion).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  patience=1,
                  num_epochs=20,
                  cuda_device=CUDA_DEVICE)

# Left as the last expression so the returned metrics dict is displayed.
trainer.train()


8544it [00:01, 5528.64it/s]
1101it [00:00, 1673.59it/s]
100%|██████████| 9645/9645 [00:00<00:00, 442541.67it/s]
accuracy: 0.3976, precision: 0.4757, recall: 0.2888, f1_measure: 0.3594, loss: 1.3558 ||: 100%|██████████| 267/267 [00:15<00:00, 16.98it/s]
accuracy: 0.4432, precision: 0.5134, recall: 0.5818, f1_measure: 0.5455, loss: 1.2469 ||: 100%|██████████| 35/35 [00:01<00:00, 18.94it/s]
accuracy: 0.4815, precision: 0.5830, recall: 0.4309, f1_measure: 0.4955, loss: 1.1936 ||: 100%|██████████| 267/267 [00:14<00:00, 17.94it/s]
accuracy: 0.4714, precision: 0.5297, recall: 0.5939, f1_measure: 0.5600, loss: 1.2219 ||: 100%|██████████| 35/35 [00:01<00:00, 20.04it/s]
accuracy: 0.5195, precision: 0.6356, recall: 0.4821, f1_measure: 0.5483, loss: 1.0983 ||: 100%|██████████| 267/267 [00:14<00:00, 18.39it/s]
accuracy: 0.4886, precision: 0.6667, recall: 0.3273, f1_measure: 0.4390, loss: 1.2160 ||: 100%|██████████| 35/35 [00:01<00:00, 19.97it/s]
accuracy: 0.5671, precision: 0.6593, recall: 0.5543, f

{'best_epoch': 3,
 'peak_cpu_memory_MB': 3687.732,
 'peak_gpu_0_memory_MB': 3327,
 'training_duration': '0:01:07.114216',
 'training_start_epoch': 0,
 'training_epochs': 3,
 'epoch': 3,
 'training_accuracy': 0.567064606741573,
 'training_precision': 0.6592797636985779,
 'training_recall': 0.554347813129425,
 'training_f1_measure': 0.6022775173187256,
 'training_loss': 1.0019111329696597,
 'training_cpu_memory_MB': 3687.732,
 'training_gpu_0_memory_MB': 3327,
 'validation_accuracy': 0.47411444141689374,
 'validation_precision': 0.5769230723381042,
 'validation_recall': 0.5454545617103577,
 'validation_f1_measure': 0.5607476830482483,
 'validation_loss': 1.2065806065286908,
 'best_validation_accuracy': 0.47411444141689374,
 'best_validation_precision': 0.5769230723381042,
 'best_validation_recall': 0.5454545617103577,
 'best_validation_f1_measure': 0.5607476830482483,
 'best_validation_loss': 1.2065806065286908}

In [None]:
# Persist the trained weights and the vocabulary (edit the two paths below to
# change the output filenames).
# NOTE(review): the reload cell reads "sentiment_model1" / "sentiment_vocab1",
# not the "...xx" names written here — rename one side before reloading.

model_path = "/home/infili/translation/DimPapSandbox/sentiment_modelxx"
vocab_dir = "/home/infili/translation/DimPapSandbox/sentiment_vocabxx"

# Only the state dict (parameter tensors) is written, not the model object.
with open(model_path, 'wb') as model_file:
    torch.save(model.state_dict(), model_file)

vocab.save_to_files(vocab_dir)

print("Model saved. DONE")

In [10]:
# Reload pretrained model

%cd /home/infili/translation/DimPapSandbox/realworldnlp

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.training.trainer import Trainer

from examples.sentiment.sst_classifier import LstmClassifier
from realworldnlp.predictors import SentenceClassifierPredictor

# Must match the values used when the checkpoint was trained, otherwise
# load_state_dict() will reject the weights.
HIDDEN_DIM = 512
CUDA_DEVICE = 0  # not used in this cell; kept for cells that expect it

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
vocab = Vocabulary.from_files("/home/infili/translation/DimPapSandbox/sentiment_vocab1")

# Rebuild the same architecture as the training cell. The embedder is created
# first so the LSTM input size can be read from it instead of hard-coding 1024.
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
elmo_embedding_dim = word_embeddings.get_output_dim()
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))
model = LstmClassifier(word_embeddings, lstm, vocab)

# Reload the trained model.
# map_location="cpu" lets a checkpoint that was saved from a GPU be restored
# on a machine without CUDA; the freshly built model lives on the CPU anyway.
# SECURITY: torch.load unpickles the file — only load checkpoints you trust.
with open("/home/infili/translation/DimPapSandbox/sentiment_model1", 'rb') as f:
    state_dict = torch.load(f, map_location="cpu")
model.load_state_dict(state_dict)

# Switch to evaluation mode before running predictions.
model.eval()

print(model)

/home/infili/translation/DimPapSandbox/realworldnlp
LstmClassifier(
  (embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): ElmoTokenEmbedder(
      (_elmo): Elmo(
        (_elmo_lstm): _ElmoBiLm(
          (_token_embedder): _ElmoCharacterEncoder(
            (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
            (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
            (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
            (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
            (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
            (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
            (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
            (_highways): Highway(
              (_layers): ModuleList(
                (0): Linear(in_features=2048, out_features=4096, bias=True)
                (1): Linear(in_features=2048, out_features=4096, bias=True)
 

In [14]:
# Run the trained classifier on a single sentence and print its label.
# Label meaning: 0 = strongly negative ... 4 = strongly positive.

# The predictor gets a reader that uses the ELMo character indexer.
elmo_token_indexer = ELMoTokenCharactersIndexer()
token_indexers = {'tokens': elmo_token_indexer}
reader = StanfordSentimentTreeBankDatasetReader(token_indexers=token_indexers)
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

# Despite the name, `tokens` holds the raw sentence string.
tokens = 'Great... another day of my life wasted to eat here.'
prediction = predictor.predict(tokens)
logits = prediction['logits']

# The highest-scoring index is mapped back to its label string via the vocabulary.
label_id = np.argmax(logits)
predicted_label = model.vocab.get_token_from_index(label_id, 'labels')

print(predicted_label)

0
