### Assignment 1 Text Classification
---

In [1]:
!wget http://phontron.com/data/topicclass-v1.tar.gz

--2020-02-02 12:09:04--  http://phontron.com/data/topicclass-v1.tar.gz
Resolving phontron.com (phontron.com)... 208.113.196.149
Connecting to phontron.com (phontron.com)|208.113.196.149|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15665160 (15M) [application/gzip]
Saving to: ‘topicclass-v1.tar.gz’


2020-02-02 12:09:15 (1.65 MB/s) - ‘topicclass-v1.tar.gz’ saved [15665160/15665160]



In [2]:
!tar -xvzf topicclass-v1.tar.gz topicclass

x topicclass/
x topicclass/topicclass_valid.txt
x topicclass/topicclass_test.txt
x topicclass/topicclass_train.txt


In [5]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

In [6]:
class PosDatasetReader(DatasetReader):
    
    def __init__(self, token_indexers=None):
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        
    def text_to_instance(self, tokens, label):
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"sentence": sentence_field}
        
        if label:
            label_field = LabelField(label=label)
            fields["label"] = label_field
            
        return Instance(fields)
    
    def _read(self, file_path):
        with open(file_path) as f:
            for line in f:
                label, sent = line.strip().split("|||")
                sent, label = sent.strip(), label.strip()
                yield self.text_to_instance([Token(word) for word in sent.split(" ")], label)

In [7]:
class LstmTagger(Model):
    def __init__(self, word_embeddings, encoder, vocab):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.maxpool = torch.nn.AdaptiveMaxPool1d(1)
        self.out = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                   out_features=vocab.get_vocab_size("labels"))
        self.accuracy = CategoricalAccuracy()
        self.criterion = torch.nn.CrossEntropyLoss()
        
    def forward(self, sentence, label):
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        encoder_out = encoder_out.transpose(1,2)
        out = self.maxpool(encoder_out).transpose(1,2).squeeze() # max pool across time
        out = self.out(out)
        output = {"logits": out}
        
#         print("---- out ----")
#         print(out)
        
#         print("---- label ----")
#         print(label)
        if label is not None:
            self.accuracy(out, label)
            output["loss"] = self.criterion(out, label)
        return output
    
    def get_metrics(self, reset):
        return {"accuracy": self.accuracy.get_metric(reset)}

In [8]:
reader = PosDatasetReader()

In [9]:
TRAIN = "topicclass/topicclass_train.txt"
VALID = "topicclass/topicclass_valid.txt"
TEST = "topicclass/topicclass_test.txt"

train_dataset = reader.read(cached_path(TRAIN))
valid_dataset = reader.read(cached_path(VALID))
test_dataset = reader.read(cached_path(TEST))

253909it [00:17, 14487.33it/s]
643it [00:00, 25154.48it/s]
697it [00:00, 17998.64it/s]


In [10]:
valid_dataset[0].__dict__

{'fields': {'sentence': <allennlp.data.fields.text_field.TextField at 0x1aa3a7e3c8>,
  'label': <allennlp.data.fields.label_field.LabelField at 0x1aa3a7eb38>},
 'indexed': False}

In [11]:
valid_dataset[0]["label"].__dict__

{'label': 'Sports and recreation',
 '_label_namespace': 'labels',
 '_label_id': None}

In [12]:
valid_dataset[0].fields["sentence"].__dict__

{'tokens': [The,
  Māori,
  players,
  initially,
  provoked,
  curiosity,
  due,
  to,
  their,
  race,
  ,,
  but,
  the,
  British,
  press,
  subsequently,
  expressed,
  some,
  surprise,
  that,
  the,
  side,
  was,
  not,
  as,
  ",
  Māori,
  ",
  as,
  they,
  had,
  expected,
  .],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x11f066cf8>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

In [13]:
vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

02/02/2020 10:25:44 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 254552/254552 [00:04<00:00, 55615.87it/s]


In [14]:
vocab.print_statistics()

02/02/2020 10:25:48 - INFO - allennlp.data.vocabulary -   Printed vocabulary statistics are only for the part of the vocabulary generated from instances. If vocabulary is constructed by extending saved vocabulary with dataset instances, the directly loaded portion won't be considered here.




----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'tokens':
	Token: the		Frequency: 379873
	Token: ,		Frequency: 329376
	Token: .		Frequency: 254552
	Token: of		Frequency: 195666
	Token: and		Frequency: 188073
	Token: in		Frequency: 156502
	Token: a		Frequency: 115405
	Token: to		Frequency: 105341
	Token: was		Frequency: 84184
	Token: The		Frequency: 67019

Top 10 longest tokens in namespace 'tokens':
	Token: 71828182845904523536028747135266249775724709369995		length: 50	Frequency: 1
	Token: GlennBeckRapedAndMurderedAYoungGirlIn1990.com		length: 45	Frequency: 1
	Token: Andriantsimitoviaminandriandrazaka		length: 34	Frequency: 1
	Token: Southernplayalisticadillacmuzik		length: 31	Frequency: 4
	Token: Andriantsimitoviaminiandriana		length: 29	Frequency: 5
	Token: Kollektivtransportproduksjon		length: 28	Frequency: 5
	Token: 113423713055421844361000443		length: 27	Frequency: 1
	Token: Landesversicherungsanstalt		length: 26	Frequency: 2
	Token: Rabodoandrianampoini

In [15]:
EMBED_DIM = 300
HIDDEN_DIM = 300

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                            embedding_dim=EMBED_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [16]:
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBED_DIM, HIDDEN_DIM, batch_first=True))
model = LstmTagger(word_embeddings, lstm, vocab)

In [17]:
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=16, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

In [18]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=valid_dataset,
                  validation_dataset=valid_dataset,
                  patience=5,
                  num_epochs=9,
                  cuda_device=cuda_device)
trainer.train()

02/02/2020 10:25:49 - INFO - allennlp.training.trainer -   Beginning training.
02/02/2020 10:25:49 - INFO - allennlp.training.trainer -   Epoch 0/8
02/02/2020 10:25:49 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 2547.073024
02/02/2020 10:25:49 - INFO - allennlp.training.trainer -   Training
accuracy: 0.1120, loss: 2.5567 ||: 100%|██████████| 41/41 [00:19<00:00,  2.22it/s]
02/02/2020 10:26:09 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.1291, loss: 2.4158 ||: 100%|██████████| 41/41 [00:00<00:00, 67.78it/s]
02/02/2020 10:26:09 - INFO - allennlp.training.trainer -                     Training |  Validation
02/02/2020 10:26:09 - INFO - allennlp.training.trainer -   cpu_memory_MB |  2547.073  |       N/A
02/02/2020 10:26:09 - INFO - allennlp.training.trainer -   accuracy      |     0.112  |     0.129
02/02/2020 10:26:09 - INFO - allennlp.training.trainer -   loss          |     2.557  |     2.416
02/02/2020 10:26:09 - INFO - allennlp.training.trainer - 

{'peak_cpu_memory_MB': 2706.026496,
 'training_duration': '00:03:12',
 'training_start_epoch': 0,
 'training_epochs': 8,
 'epoch': 8,
 'training_accuracy': 0.9704510108864697,
 'training_loss': 0.14491085235665485,
 'training_cpu_memory_MB': 2706.026496,
 'validation_accuracy': 0.9797822706065319,
 'validation_loss': 0.08918036529567183,
 'best_epoch': 8,
 'best_validation_accuracy': 0.9797822706065319,
 'best_validation_loss': 0.08918036529567183}

In [19]:
vocab.get_vocab_size("labels")

17