In [2]:
from itertools import chain
from typing import Dict
 
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training import GradientDescentTrainer
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader



In [8]:
# Dataset reader for Stanford Sentiment Treebank files; each instance it yields
# carries a `tokens` text field and a `label` field (see the printed output).
reader = StanfordSentimentTreeBankDatasetReader()

train_dataset = reader.read('data/train.txt')
dev_dataset = reader.read('data/dev.txt')

# Show only a handful of instances: printing every instance of both splits
# floods the notebook output and bloats the saved file.
PREVIEW_COUNT = 3
for _, inst in zip(range(PREVIEW_COUNT), chain(train_dataset, dev_dataset)):
    print(inst)

Instance with fields:
 	 tokens: TextField of length 36 with text: 
 		[The, Rock, is, destined, to, be, the, 21st, Century, 's, new, ``, Conan, '', and, that, he, 's,
		going, to, make, a, splash, even, greater, than, Arnold, Schwarzenegger, ,, Jean-Claud, Van, Damme,
		or, Steven, Segal, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: 3 in namespace: 'labels'. 

Instance with fields:
 	 tokens: TextField of length 37 with text: 
 		[The, gorgeously, elaborate, continuation, of, ``, The, Lord, of, the, Rings, '', trilogy, is, so,
		huge, that, a, column, of, words, can, not, adequately, describe, co-writer\/director, Peter,
		Jackson, 's, expanded, vision, of, J.R.R., Tolkien, 's, Middle-earth, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: 4 in namespace: 'labels'. 

Instance with fields:
 	 tokens: TextField of length 39 with text: 
 		[Singer\/composer, Bryan, Adams, contributes, a, slew,

In [7]:
# Locations of the training and validation splits.
train_path = 'data/train.txt'
dev_path = 'data/dev.txt'

# A single bucket sampler (batches of 32, keyed on the "tokens" field)
# is handed to both loaders.
sampler = BucketBatchSampler(
    batch_size=32,
    sorting_keys=["tokens"],
)

train_data_loader = MultiProcessDataLoader(reader, train_path,
                                           batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path,
                                         batch_sampler=sampler)


loading instances: 0it [00:00, ?it/s]

loading instances: 0it [00:00, ?it/s]

In [9]:
# Build the vocabulary from the instances of both loaders, with a
# min_count of 3 applied to the 'tokens' namespace.
vocab = Vocabulary.from_instances(
    chain(
        train_data_loader.iter_instances(),
        dev_data_loader.iter_instances(),
    ),
    min_count={'tokens': 3},
)


building vocab: 0it [00:00, ?it/s]

In [13]:
# Dimensionality of each token embedding vector.
EMBEDDING_DIM = 128

# Embedding table sized to the 'tokens' namespace of the vocabulary.
token_embedding = Embedding(
    embedding_dim=EMBEDDING_DIM,
    num_embeddings=vocab.get_vocab_size('tokens'),
)

In [14]:
# Size of the LSTM hidden state.
HIDDEN_DIM = 128

# Wrap a batch-first PyTorch LSTM so it can be used as a Seq2Vec encoder.
encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(
        input_size=EMBEDDING_DIM,
        hidden_size=HIDDEN_DIM,
        batch_first=True,
    )
)

In [12]:
# exist_ok=True keeps this cell re-runnable: without it, executing the cell a
# second time in the same kernel fails because the name is already registered.
@Model.register("lstm_classifier", exist_ok=True)
class LstmClassifier(Model):
    """Sentence classifier: embed tokens, encode them to one vector, project to label logits.

    Tracks categorical accuracy plus precision/recall/F1 for a single
    "positive" label, and computes a cross-entropy loss when gold labels
    are supplied.
    """

    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        """Assemble the model.

        Args:
            embedder: Maps the indexed ``tokens`` field to embeddings.
            encoder: Reduces the embedded sequence to a single vector; its
                ``get_output_dim()`` sets the input width of the output layer.
            vocab: Vocabulary whose 'labels' namespace defines the output
                classes.
            positive_label: Label (looked up in the 'labels' namespace) that
                the F1 measure treats as the positive class.
        """
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder

        # One logit per entry of the 'labels' namespace.
        self.linear = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))

        positive_index = vocab.get_token_index(
            positive_label, namespace='labels')
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)

        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Args:
            tokens: Indexed text field for the batch.
            label: Gold label indices; optional. When given, the metrics are
                updated and a loss is added to the output.

        Returns:
            A dict with ``"logits"`` and, if ``label`` was provided,
            ``"loss"``.
        """
        mask = get_text_field_mask(tokens)

        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        output = {"logits": logits}
        if label is not None:
            # Metrics accumulate across calls until get_metrics(reset=True).
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return accuracy merged with the entries reported by the F1 measure.

        Args:
            reset: Forwarded to each metric's ``get_metric``.
        """
        return {'accuracy': self.accuracy.get_metric(reset),
                **self.f1_measure.get_metric(reset)}


In [18]:
# End-to-end setup in one cell: reader, data loaders, vocabulary, model, optimizer.
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

reader = StanfordSentimentTreeBankDatasetReader()

train_path = 'data/train.txt'
dev_path = 'data/dev.txt'

# Both loaders use the same bucket sampler configuration (batches of 32).
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
train_data_loader = MultiProcessDataLoader(
    reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(
    reader, dev_path, batch_sampler=sampler)

vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(),
                                        dev_data_loader.iter_instances()),
                                  min_count={'tokens': 3})

# The loaders need the vocabulary before they can produce indexed batches.
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)

token_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size('tokens'),
    embedding_dim=EMBEDDING_DIM)

# The key "tokens" matches the indexer name shown in the printed instances.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

encoder = PytorchSeq2VecWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, encoder, vocab)
# Use the GPU when one is visible, otherwise stay on CPU so the notebook
# still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = optim.Adam(model.parameters())


loading instances: 0it [00:00, ?it/s]

loading instances: 0it [00:00, ?it/s]

building vocab: 0it [00:00, ?it/s]

In [19]:
# Train for up to 20 epochs with a patience of 10, validating on the dev loader.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=20,
    # GPU 0 when CUDA is available; -1 tells the trainer to use the CPU.
    cuda_device=0 if torch.cuda.is_available() else -1)

# Last expression of the cell: the returned metrics dict is displayed.
trainer.train()

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'best_epoch': 1,
 'peak_worker_0_memory_MB': 5611.02734375,
 'peak_gpu_0_memory_MB': 25.67041015625,
 'training_duration': '0:00:33.287449',
 'epoch': 11,
 'training_accuracy': 0.9405430711610487,
 'training_precision': 0.9643140435218811,
 'training_recall': 0.9440993666648865,
 'training_f1': 0.9540995955467224,
 'training_loss': 0.17581858462043637,
 'training_worker_0_memory_MB': 5611.02734375,
 'training_gpu_0_memory_MB': 25.66259765625,
 'validation_accuracy': 0.3542234332425068,
 'validation_precision': 0.35211268067359924,
 'validation_recall': 0.4545454680919647,
 'validation_f1': 0.3968254029750824,
 'validation_loss': 3.6808590275900706,
 'best_validation_accuracy': 0.3814713896457766,
 'best_validation_precision': 0.43918919563293457,
 'best_validation_recall': 0.39393940567970276,
 'best_validation_f1': 0.415335476398468,
 'best_validation_loss': 1.3965405327933176}

In [28]:
import os

# Persist the trained weights and the vocabulary side by side under out_dir.
out_dir = 'models/sent-classifier'
os.makedirs(out_dir, exist_ok=True)
# `path` is kept as an alias of the output directory for any later cell that uses it.
path = out_dir

with open(os.path.join(out_dir, 'model.th'), 'wb') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files(os.path.join(out_dir, 'vocab'))