In [1]:
!pip install -q allennlp==2.5.0
!pip install -q allennlp-models==2.5.0

[K     |████████████████████████████████| 103 kB 13.3 MB/s 
[K     |████████████████████████████████| 804.1 MB 2.7 kB/s 
[K     |████████████████████████████████| 124 kB 43.1 MB/s 
[K     |████████████████████████████████| 1.8 MB 34.6 MB/s 
[K     |████████████████████████████████| 2.2 MB 32.1 MB/s 
[K     |████████████████████████████████| 22.3 MB 8.5 MB/s 
[K     |████████████████████████████████| 167 kB 47.5 MB/s 
[K     |████████████████████████████████| 8.5 MB 37.0 MB/s 
[K     |████████████████████████████████| 79 kB 6.6 MB/s 
[K     |████████████████████████████████| 138 kB 51.5 MB/s 
[K     |████████████████████████████████| 75 kB 4.3 MB/s 
[K     |████████████████████████████████| 127 kB 54.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 42.6 MB/s 
[K     |████████████████████████████████| 895 kB 45.6 MB/s 
[K     |████████████████████████████████| 97 kB 7.1 MB/s 
[K     |████████████████████████████████| 180 kB 47.6 MB/s 
[K     |████████████████████

In [24]:
!git clone https://github.com/mhagiwara/realworldnlp.git

Cloning into 'realworldnlp'...
remote: Enumerating objects: 668, done.[K
remote: Counting objects: 100% (189/189), done.[K
remote: Compressing objects: 100% (135/135), done.[K
remote: Total 668 (delta 119), reused 116 (delta 52), pack-reused 479[K
Receiving objects: 100% (668/668), 4.95 MiB | 10.81 MiB/s, done.
Resolving deltas: 100% (370/370), done.


In [29]:
# Print the working directory (the clone above lands here).
# NOTE(review): presumably the `realworldnlp` import relies on this directory being on sys.path — confirm.
!pwd

/content


In [30]:
from itertools import chain
from typing import Dict
import os
import urllib.request
import zipfile
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader
from realworldnlp.realworldnlp.predictors import SentenceClassifierPredictor

In [3]:
# Model sizes: token-embedding width (also the LSTM input size) and LSTM hidden size.
EMBEDDING_DIM, HIDDEN_DIM = 128, 128

In [4]:
class LstmClassifier(Model):
  """Sentence classifier: embed tokens, encode them into one vector, project to label logits.

  Alongside the cross-entropy loss, the model tracks categorical accuracy and
  precision/recall/F1 for a single "positive" label.
  """

  def __init__(self,
               embedder: TextFieldEmbedder,
               encoder: Seq2VecEncoder,
               vocab: Vocabulary,
               positive_label: str = '4') -> None:
    """Store the sub-modules and create the output layer, metrics and loss.

    Args:
      embedder: Applied to the ``tokens`` input to produce embeddings.
      encoder: Called with the embeddings and their mask; its output
        dimension sizes the input of the linear layer.
      vocab: Vocabulary whose ``'labels'`` namespace sizes the output layer.
      positive_label: Label token whose index is handed to ``F1Measure``.
    """
    super().__init__(vocab)
    self.embedder = embedder
    self.encoder = encoder
    # One logit per entry of the 'labels' namespace.
    self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                  out_features=vocab.get_vocab_size('labels'))

    positive_index = vocab.get_token_index(positive_label, namespace='labels')
    self.accuracy = CategoricalAccuracy()
    self.f1_measure = F1Measure(positive_index)
    self.loss_function = torch.nn.CrossEntropyLoss()

  def forward(self,
              tokens: Dict[str, torch.Tensor],
              label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
    """Compute label logits and, when labels are given, the loss.

    Args:
      tokens: Indexed text field, as accepted by the embedder and by
        ``get_text_field_mask``.
      label: Optional gold label indices; when provided, the metrics are
        updated and a ``'loss'`` entry is added to the output.

    Returns:
      A dict with ``'logits'`` and, if ``label`` was given, ``'loss'``.
    """
    mask = get_text_field_mask(tokens)
    embeddings = self.embedder(tokens)
    encoder_out = self.encoder(embeddings, mask)
    logits = self.linear(encoder_out)

    output = {'logits': logits}
    if label is not None:
      # Metrics accumulate across calls until get_metrics(reset=True).
      self.accuracy(logits, label)
      self.f1_measure(logits, label)
      output['loss'] = self.loss_function(logits, label)

    return output

  def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    """Return the accumulated accuracy merged with the F1Measure metrics.

    Args:
      reset: Forwarded to each metric's ``get_metric``.
    """
    return {'accuracy': self.accuracy.get_metric(reset), **self.f1_measure.get_metric(reset)}

In [8]:
url = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'

# A later cell reads the data from ./trees, so an existing `trees` directory is
# taken to mean the archive was already unpacked and the download is skipped.
# NOTE(review): assumes the zip's top-level folder is `trees` — confirm.
if not os.path.isdir('trees'):
    # With no target filename, urlretrieve stores the download in a temporary file.
    zip_path, _ = urllib.request.urlretrieve(url)
    try:
        with zipfile.ZipFile(zip_path, "r") as f:
            f.extractall()
    finally:
        # Remove the temporary file left behind by urlretrieve.
        urllib.request.urlcleanup()

In [79]:
# Dataset reader plus the locations of the locally extracted SST splits.
reader = StanfordSentimentTreeBankDatasetReader()

data_path = r'./trees'
train_file, dev_file, test_file = 'train.txt', 'dev.txt', 'test.txt'

# Only the train and dev splits are turned into full paths here.
train_path, dev_path = (
    os.path.join(data_path, split_name) for split_name in (train_file, dev_file)
)

In [60]:
# Alternative data source: point the train/dev paths at files hosted on S3.
# NOTE(review): this rebinds `reader`, `train_path` and `dev_path` set by the
# previous cell, so on a top-to-bottom run the local ./trees files are not the
# ones loaded — confirm which source is intended and drop the other cell.
reader = StanfordSentimentTreeBankDatasetReader()
train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

In [80]:
# A single batch sampler (batches of 32, 'tokens' as sorting key) is shared by both loaders.
sampler = BucketBatchSampler(sorting_keys=['tokens'], batch_size=32)
train_data_loader = MultiProcessDataLoader(
    reader=reader, data_path=train_path, batch_sampler=sampler
)
dev_data_loader = MultiProcessDataLoader(
    reader=reader, data_path=dev_path, batch_sampler=sampler
)

loading instances: 8544it [00:01, 4320.58it/s]
loading instances: 1101it [00:00, 1888.96it/s]


In [81]:
# Build the vocabulary over the train and dev instances together,
# passing a minimum count of 3 for the 'tokens' namespace.
vocab = Vocabulary.from_instances(
    instances=chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()),
    min_count={'tokens': 3},
)

building vocab: 9645it [00:00, 47965.11it/s]


In [82]:
# Attach the vocabulary to both loaders, train first and then dev.
for loader in (train_data_loader, dev_data_loader):
    loader.index_with(vocab)

In [83]:
# Embedding table with one row per entry of the 'tokens' namespace,
# wrapped in a text-field embedder under the 'tokens' key.
token_embedding = Embedding(
    embedding_dim=EMBEDDING_DIM,
    num_embeddings=vocab.get_vocab_size('tokens'),
)
word_embeddings = BasicTextFieldEmbedder(token_embedders={'tokens': token_embedding})

In [84]:
# LSTM wrapped as a Seq2Vec encoder, then plugged into the classifier.
lstm = torch.nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, batch_first=True)
encoder = PytorchSeq2VecWrapper(lstm)
model = LstmClassifier(
    embedder=word_embeddings,
    encoder=encoder,
    vocab=vocab,
    positive_label='4',
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = optim.Adam(params=model.parameters(), weight_decay=1e-5, lr=1e-4)

In [85]:
# Train for up to 20 epochs with a patience of 10, validating on the dev loader.
# NOTE(review): AllenNLP documents this argument as `cuda_device`; confirm that
# `device=` is actually honoured by the pinned 2.5.0 release.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=20,
    patience=10,
    device=device,
)

# Kept as the last expression so the returned metrics are displayed by the cell.
trainer.train()

accuracy: 0.2603, precision: 0.0000, recall: 0.0000, f1: 0.0000, batch_loss: 1.6158, loss: 1.5835 ||: 100%|##########| 267/267 [00:03<00:00, 73.13it/s]
accuracy: 0.2534, precision: 0.0000, recall: 0.0000, f1: 0.0000, batch_loss: 1.6235, loss: 1.5740 ||: 100%|##########| 35/35 [00:00<00:00, 145.70it/s]
accuracy: 0.2726, precision: 0.0000, recall: 0.0000, f1: 0.0000, batch_loss: 1.6199, loss: 1.5651 ||: 100%|##########| 267/267 [00:03<00:00, 77.25it/s]
accuracy: 0.2652, precision: 0.0000, recall: 0.0000, f1: 0.0000, batch_loss: 1.5839, loss: 1.5714 ||: 100%|##########| 35/35 [00:00<00:00, 151.23it/s]
accuracy: 0.2808, precision: 0.0000, recall: 0.0000, f1: 0.0000, batch_loss: 1.5079, loss: 1.5556 ||: 100%|##########| 267/267 [00:03<00:00, 76.86it/s]
accuracy: 0.2552, precision: 0.0000, recall: 0.0000, f1: 0.0000, batch_loss: 1.5743, loss: 1.5677 ||: 100%|##########| 35/35 [00:00<00:00, 150.94it/s]
accuracy: 0.2952, precision: 0.4706, recall: 0.0062, f1: 0.0123, batch_loss: 1.5487, loss: 

{'best_epoch': 6,
 'best_validation_accuracy': 0.3533151680290645,
 'best_validation_f1': 0.3931204080581665,
 'best_validation_loss': 1.4907133204596383,
 'best_validation_precision': 0.3305785059928894,
 'best_validation_recall': 0.4848484992980957,
 'epoch': 16,
 'peak_gpu_0_memory_MB': 38.533203125,
 'peak_worker_0_memory_MB': 2488.27734375,
 'training_accuracy': 0.7971676029962547,
 'training_duration': '0:01:03.383364',
 'training_f1': 0.8820435404777527,
 'training_gpu_0_memory_MB': 38.533203125,
 'training_loss': 0.5498426571655809,
 'training_precision': 0.8544396162033081,
 'training_recall': 0.9114906787872314,
 'training_worker_0_memory_MB': 2488.27734375,
 'validation_accuracy': 0.35603996366939145,
 'validation_f1': 0.39053255319595337,
 'validation_loss': 2.3423965692520143,
 'validation_precision': 0.3815028965473175,
 'validation_recall': 0.4000000059604645}

In [89]:
# Labels are:
# 0: very negative
# 1: negative
# 2: neutral
# 3: positive
# 4: very positive

def predict_label(sentence: str) -> str:
  """Classify one sentence and return its label token from the vocabulary.

  Args:
    sentence: Raw text handed to the predictor.

  Returns:
    The token stored in the 'labels' namespace at the index of the highest
    logit (a string such as '4', not an integer).
  """
  predictor = SentenceClassifierPredictor(model, reader)
  logits = predictor.predict(sentence)['logits']
  # np.argmax yields a NumPy integer; convert it to a plain int for the vocabulary lookup.
  label_id = int(np.argmax(logits))

  return vocab.get_token_from_index(label_id, 'labels')
  
# Run the classifier on one example sentence and print the predicted label.
to_predict = 'This is the best movie ever'
predicted = predict_label(to_predict)
print(f'Label is: {predicted}')

Label is: 4
