### Assignment 1: Text Classification
---

In [None]:
# Download the topic-classification dataset archive into the working directory.
!wget http://phontron.com/data/topicclass-v1.tar.gz

--2020-02-02 18:26:20--  http://phontron.com/data/topicclass-v1.tar.gz
Resolving phontron.com (phontron.com)... 208.113.196.149
Connecting to phontron.com (phontron.com)|208.113.196.149|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15665160 (15M) [application/gzip]
Saving to: ‘topicclass-v1.tar.gz’


2020-02-02 18:26:21 (43.6 MB/s) - ‘topicclass-v1.tar.gz’ saved [15665160/15665160]



In [None]:
# Unpack only the `topicclass` directory from the downloaded archive (verbose listing).
!tar -xvzf topicclass-v1.tar.gz topicclass

topicclass/
topicclass/topicclass_valid.txt
topicclass/topicclass_test.txt
topicclass/topicclass_train.txt


In [None]:
!pip3 install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/bb/bb/041115d8bad1447080e5d1e30097c95e4b66e36074277afce8620a61cee3/allennlp-0.9.0-py3-none-any.whl (7.6MB)
[K     |████████████████████████████████| 7.6MB 3.4MB/s 
[?25hCollecting flaky
  Downloading https://files.pythonhosted.org/packages/fe/12/0f169abf1aa07c7edef4855cca53703d2e6b7ecbded7829588ac7e7e3424/flaky-3.6.1-py2.py3-none-any.whl
Collecting parsimonious>=0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/02/fc/067a3f89869a41009e1a7cdfb14725f8ddd246f30f63c645e8ef8a1c56f4/parsimonious-0.8.1.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 9.0MB/s 
[?25hCollecting flask-cors>=3.0.7
  Downloading https://files.pythonhosted.org/packages/78/38/e68b11daa5d613e3a91e4bf3da76c94ac9ee0d9cd515af9c1ab80d36f709/Flask_Cors-3.0.8-py2.py3-none-any.whl
Collecting word2number>=1.1
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcb

In [1]:
import os
from typing import Iterator, List, Dict

import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.data.iterators import BasicIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor, TextClassifierPredictor
from allennlp.training.checkpointer import Checkpointer

In [2]:
class PosDatasetReader(DatasetReader):
    """Dataset reader for files with one ``label ||| sentence`` example per line.

    Every example becomes an ``Instance`` holding a ``sentence`` TextField and,
    when a label is known, a ``label`` LabelField. Lines whose label is the
    literal ``UNK`` (or empty) produce unlabeled instances.
    """

    def __init__(self, token_indexers=None):
        """
        Args:
            token_indexers: mapping of indexer name to ``TokenIndexer``;
                defaults to a single ``SingleIdTokenIndexer`` under ``"tokens"``.
        """
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, tokens, label=None) -> Instance:
        """Build one ``Instance`` from a tokenized sentence.

        Args:
            tokens: sequence of ``Token`` objects or plain strings (strings are
                wrapped in ``Token``). May be empty.
            label: class label, or ``None`` for an unlabeled example.

        Returns:
            An ``Instance`` with a ``sentence`` field and, if ``label`` is not
            ``None``, a ``label`` field.
        """
        # Materialise a real list: a lazy ``map`` object can only be iterated
        # once, and indexing ``tokens[0]`` would fail on an empty sentence.
        tokens = [Token(tok) if isinstance(tok, str) else tok for tok in tokens]
        fields = {"sentence": TextField(tokens, self.token_indexers)}

        # Compare against None explicitly so falsy-but-valid labels (e.g. 0)
        # are not silently dropped.
        if label is not None:
            fields["label"] = LabelField(label=label)

        return Instance(fields)

    def _read(self, file_path) -> Iterator[Instance]:
        """Yield one ``Instance`` per non-blank line of ``file_path``.

        Raises:
            ValueError: if a non-blank line contains no ``|||`` separator.
        """
        # The corpus contains non-ASCII text, so do not rely on the platform's
        # default encoding.
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines instead of failing the unpack
                # Split on the first separator only, so a sentence that itself
                # contains "|||" stays intact.
                label, sent = line.split("|||", 1)
                sent, label = sent.strip(), label.strip()
                # "UNK" marks an unlabeled example; an empty label is treated
                # the same way.
                if not label or label == "UNK":
                    label = None
                yield self.text_to_instance([Token(word) for word in sent.split(" ")], label)


In [3]:
class LstmTagger(Model):
    """Sentence classifier: embed -> sequence encoder -> max-pool over time -> linear.

    Despite the name, this produces one label per sentence, not per token.
    """

    def __init__(self, word_embeddings, encoder, vocab):
        """
        Args:
            word_embeddings: text-field embedder applied to the ``sentence`` input.
            encoder: sequence encoder called as ``encoder(embeddings, mask)``;
                its ``get_output_dim()`` sizes the output layer.
            vocab: vocabulary; the size of its ``"labels"`` namespace is the
                number of output classes.
        """
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.maxpool = torch.nn.AdaptiveMaxPool1d(1)
        self.out = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                   out_features=vocab.get_vocab_size("labels"))
        self.accuracy = CategoricalAccuracy()
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, sentence, label=None):
        """Compute class logits and, when ``label`` is given, the loss.

        Returns:
            dict with ``"logits"`` and, if ``label`` is not ``None``, ``"loss"``.
            The accuracy metric is updated as a side effect when labeled.
        """
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        # AdaptiveMaxPool1d pools over the last axis, so move the time axis
        # there first (assumes encoder output is (batch, time, dim)).
        pooled = self.maxpool(encoder_out.transpose(1, 2))  # -> (batch, dim, 1)
        # Drop only the pooled axis. A bare squeeze() would also remove the
        # batch axis when the batch holds a single example, which breaks the
        # loss computation and any later concatenation of batch outputs.
        out = self.out(pooled.squeeze(-1))
        output = {"logits": out}

        if label is not None:
            self.accuracy(out, label)
            output["loss"] = self.criterion(out, label)
        return output

    def get_metrics(self, reset=False):
        """Return the running accuracy; ``reset=True`` clears the accumulator.

        ``reset`` defaults to ``False`` so the method can also be called with
        no arguments.
        """
        return {"accuracy": self.accuracy.get_metric(reset)}

In [4]:
# Reader with the default single-id token indexer under the "tokens" key.
reader = PosDatasetReader()

In [5]:
# Paths of the three splits extracted from the archive.
TRAIN = "topicclass/topicclass_train.txt"
VALID = "topicclass/topicclass_valid.txt"
TEST = "topicclass/topicclass_test.txt"

# Read every split eagerly (the reader is constructed with lazy=False).
# Lines labeled "UNK" are read as unlabeled instances.
train_dataset = reader.read(cached_path(TRAIN))
valid_dataset = reader.read(cached_path(VALID))
test_dataset = reader.read(cached_path(TEST))

253909it [00:20, 12606.28it/s]
643it [00:00, 14982.07it/s]
697it [00:00, 13994.26it/s]


In [6]:
# Sanity check: show the raw attributes of the first test instance.
test_dataset[0].__dict__

{'fields': {'sentence': <allennlp.data.fields.text_field.TextField at 0x1a89b8b0f0>},
 'indexed': False}

In [8]:
# Sanity check: show the tokens and indexers stored on the first validation sentence field.
valid_dataset[0].fields["sentence"].__dict__

{'tokens': [The,
  Māori,
  players,
  initially,
  provoked,
  curiosity,
  due,
  to,
  their,
  race,
  ,,
  but,
  the,
  British,
  press,
  subsequently,
  expressed,
  some,
  surprise,
  that,
  the,
  side,
  was,
  not,
  as,
  ",
  Māori,
  ",
  as,
  they,
  had,
  expected,
  .],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x1a326bb8d0>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index_to_indexer_name': None}

In [9]:
# Build the vocabulary from the training and validation instances together;
# the test split is not included.
vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

100%|██████████| 254552/254552 [00:07<00:00, 32282.07it/s]


In [13]:
# Print summary statistics of the vocabulary that was just built.
vocab.print_statistics()



----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'tokens':
	Token: the		Frequency: 379873
	Token: ,		Frequency: 329376
	Token: .		Frequency: 254552
	Token: of		Frequency: 195666
	Token: and		Frequency: 188073
	Token: in		Frequency: 156502
	Token: a		Frequency: 115405
	Token: to		Frequency: 105341
	Token: was		Frequency: 84184
	Token: The		Frequency: 67019

Top 10 longest tokens in namespace 'tokens':
	Token: 71828182845904523536028747135266249775724709369995		length: 50	Frequency: 1
	Token: GlennBeckRapedAndMurderedAYoungGirlIn1990.com		length: 45	Frequency: 1
	Token: Andriantsimitoviaminandriandrazaka		length: 34	Frequency: 1
	Token: Southernplayalisticadillacmuzik		length: 31	Frequency: 4
	Token: Andriantsimitoviaminiandriana		length: 29	Frequency: 5
	Token: Kollektivtransportproduksjon		length: 28	Frequency: 5
	Token: 113423713055421844361000443		length: 27	Frequency: 1
	Token: Landesversicherungsanstalt		length: 26	Frequency: 2
	Token: Rabodoandrianampoini

In [14]:
# Hyperparameters for the LSTM baseline.
# NOTE: these globals are reassigned by later cells in this notebook.
EMBED_DIM = 300
HIDDEN_DIM = 300

# One embedding row per entry of the "tokens" vocabulary namespace,
# wrapped so it can be applied to the "tokens" key of a text field.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                            embedding_dim=EMBED_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [15]:
# Single-layer, batch-first LSTM (EMBED_DIM -> HIDDEN_DIM) wrapped as an AllenNLP encoder.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBED_DIM, HIDDEN_DIM, batch_first=True))
# Baseline classifier built from the embedder and encoder defined above.
model = LstmTagger(word_embeddings, lstm, vocab)

In [16]:
# Use the first GPU when one is available; -1 means "CPU" downstream.
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.Adam(model.parameters())
# Batch sentences of similar length together.
iterator = BucketIterator(batch_size=64, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
# Create the checkpoint directory up front so a fresh run does not depend on
# it having been made by hand outside the notebook.
os.makedirs("./checkpoints/ckpt-1", exist_ok=True)
# NOTE(review): the second positional argument is believed to be
# `keep_serialized_model_every_num_seconds` in AllenNLP 0.9 — confirm that
# this is the intended meaning of 30.
checkpointer = Checkpointer("./checkpoints/ckpt-1", 30)

In [35]:
# Train the LSTM baseline for 3 epochs, validating on the validation split
# and checkpointing through `checkpointer`.
# NOTE(review): patience (5) is larger than num_epochs (3), so early stopping
# presumably cannot trigger in this run — confirm this is intended.
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=valid_dataset,
                  patience=5,
                  num_epochs=3,
                  checkpointer=checkpointer,
                  cuda_device=cuda_device)
trainer.train()

{'best_epoch': 0,
 'best_validation_accuracy': 0.8040435458786936,
 'best_validation_loss': 0.92332116717642}

In [17]:
from typing import Iterator, List, Dict

from allennlp.data.iterators import DataIterator
from tqdm import tqdm
from scipy.special import expit # the sigmoid function
 
def tonp(tsr):
    """Convert a tensor to a NumPy array: detach from autograd, move to CPU, then export."""
    detached = tsr.detach()
    on_host = detached.cpu()
    return on_host.numpy()
 
class Predictor:
    """Runs a model over a dataset batch by batch and collects class probabilities.

    Rows of the result follow the order in which ``iterator`` yields batches.
    If the iterator reorders instances (for example by sorting on length), the
    rows will not line up with the input dataset — pass a non-reordering
    iterator when alignment matters.
    """

    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        """
        Args:
            model: model whose forward output dict contains ``"logits"``.
            iterator: data iterator used to batch the dataset.
            cuda_device: device index batches are moved to (-1 is the default).
        """
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        """Run one batch through the model and return per-class probabilities."""
        out_dict = self.model(**batch)
        # The models in this notebook are trained with CrossEntropyLoss, i.e.
        # a softmax over classes. Softmax therefore gives probabilities that
        # sum to 1 per example; an element-wise sigmoid does not.
        return tonp(torch.softmax(out_dict["logits"], dim=-1))

    def predict(self, ds) -> np.ndarray:
        """Predict over every instance in ``ds``.

        Puts the model in eval mode, iterates once without shuffling and
        without gradient tracking, and concatenates per-batch outputs along
        axis 0.
        """
        # Imported here so the class does not depend on a later notebook cell
        # having been executed first.
        from allennlp.nn import util as nn_util

        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(ds))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)

In [18]:
from allennlp.nn import util as nn_util

In [None]:
# TextClassifierPredictor.predict expects a single sentence string, not a
# dataset, so use the batch Predictor defined above instead.
# A plain BasicIterator keeps instances in dataset order, whereas the
# length-bucketing training iterator would return rows in sorted order.
predict_iterator = BasicIterator(batch_size=64)
predict_iterator.index_with(vocab)
predictor = Predictor(model, predict_iterator, cuda_device=cuda_device)
# Despite the name, these are the post-activation scores returned by
# Predictor.predict, one row per test instance, not raw logits.
logits = predictor.predict(test_dataset)

#### TCN
---

In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import weight_norm

class Chomp1d(nn.Module):
    """Drop the last ``chomp_size`` steps along the final axis of a 3-D tensor.

    Used after a convolution that padded both ends of the sequence, to remove
    the extra trailing outputs.
    """

    def __init__(self, chomp_size):
        """
        Args:
            chomp_size: number of trailing positions to remove (>= 0).
        """
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """Return ``x`` without its last ``chomp_size`` positions on axis 2."""
        # ``x[:, :, :-0]`` is an EMPTY slice, so zero padding (e.g. a kernel of
        # size 1) must be handled explicitly as a no-op.
        if self.chomp_size == 0:
            return x.contiguous()
        return x[:, :, :-self.chomp_size].contiguous()
  

class TemporalBlock(nn.Module):
    """Residual block of two weight-normalised dilated 1-D convolutions.

    Each convolution is followed by a chomp of ``padding`` trailing steps, a
    ReLU and dropout. The block input is added back to the result (through a
    1x1 convolution when the channel counts differ) before a final ReLU.
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super().__init__()
        conv_options = dict(stride=stride, padding=padding, dilation=dilation)

        # First conv stage.
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size, **conv_options))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        # Second conv stage (keeps the channel count).
        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size, **conv_options))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(
            self.conv1, self.chomp1, self.relu1, self.dropout1,
            self.conv2, self.chomp2, self.relu2, self.dropout2,
        )

        # A 1x1 conv on the skip path is needed only when channel counts differ.
        if n_inputs != n_outputs:
            self.downsample = nn.Conv1d(n_inputs, n_outputs, 1)
        else:
            self.downsample = None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        """Re-draw the convolution weights from a normal with mean 0 and std 0.01."""
        # NOTE(review): conv1/conv2 are wrapped in weight_norm — confirm that
        # writing to ``.weight.data`` affects the weights used in forward.
        convs = [self.conv1, self.conv2]
        if self.downsample is not None:
            convs.append(self.downsample)
        for conv in convs:
            conv.weight.data.normal_(0, 0.01)

    def forward(self, x):
        """Apply the two-stage conv branch, add the skip connection, then ReLU."""
        branch = self.net(x)
        if self.downsample is None:
            shortcut = x
        else:
            shortcut = self.downsample(x)
        return self.relu(branch + shortcut)


class TemporalConvNet(nn.Module):
    """Stack of ``TemporalBlock`` levels whose dilation doubles at each level.

    Level ``i`` uses dilation ``2 ** i`` and padding ``(kernel_size - 1) * 2 ** i``.
    ``num_channels[i]`` is the output width of level ``i``; the first level
    takes ``num_inputs`` input channels.
    """

    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        blocks = []
        width_in = num_inputs
        for level, width_out in enumerate(num_channels):
            dilation = 2 ** level
            blocks.append(
                TemporalBlock(width_in, width_out, kernel_size,
                              stride=1,
                              dilation=dilation,
                              padding=(kernel_size - 1) * dilation,
                              dropout=dropout)
            )
            # The next level consumes what this one produced.
            width_in = width_out

        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through every level in order."""
        return self.network(x)


In [41]:
class TCNClassifier(Model):
    """Sentence classifier: embed -> temporal conv net -> max-pool over time -> linear.

    No padding mask is applied in ``forward``, so padded timesteps take part
    in the convolutions and in the max-pool.
    """

    def __init__(self, word_embeddings, vocab, dropout=0.5,
                 embed_dim=None, hidden_dim=None, n_layers=None, kernel_size=None):
        """
        Args:
            word_embeddings: text-field embedder applied to the ``sentence`` input.
            vocab: vocabulary; the size of its ``"labels"`` namespace is the
                number of output classes.
            dropout: dropout rate passed to the temporal conv net.
            embed_dim: input channels of the conv net; defaults to the
                notebook-level ``EMBED_DIM``.
            hidden_dim: channels of every conv level; defaults to ``HIDDEN_DIM``.
            n_layers: number of conv levels; defaults to ``N_LAYERS``.
            kernel_size: convolution kernel size; defaults to ``KERNEL_SIZE``.
        """
        super().__init__(vocab)
        # Fall back to the notebook globals when a value is not given, so
        # existing ``TCNClassifier(word_embeddings, vocab)`` calls are unchanged.
        # Passing values explicitly lets a model be rebuilt without relying on
        # the globals' current values.
        embed_dim = EMBED_DIM if embed_dim is None else embed_dim
        hidden_dim = HIDDEN_DIM if hidden_dim is None else hidden_dim
        n_layers = N_LAYERS if n_layers is None else n_layers
        kernel_size = KERNEL_SIZE if kernel_size is None else kernel_size

        self.word_embeddings = word_embeddings
        self.encoder = TemporalConvNet(embed_dim, [hidden_dim] * n_layers,
                                       kernel_size=kernel_size,
                                       dropout=dropout)
        self.maxpool = torch.nn.AdaptiveMaxPool1d(1)
        self.out = torch.nn.Linear(in_features=hidden_dim,
                                   out_features=vocab.get_vocab_size("labels"))
        self.accuracy = CategoricalAccuracy()
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, sentence, label=None):
        """Compute class logits and, when ``label`` is given, the loss.

        ``label`` is optional so unlabeled batches can be scored.

        Returns:
            dict with ``"logits"`` and, if ``label`` is not ``None``, ``"loss"``.
            The accuracy metric is updated as a side effect when labeled.
        """
        embeddings = self.word_embeddings(sentence)
        # Conv1d wants channels on axis 1, so swap the last two axes
        # (assumes embeddings are (batch, time, dim)).
        encoder_out = self.encoder(embeddings.transpose(1, 2))
        # Max-pool over the last axis, then drop only that axis. A bare
        # squeeze() would also remove the batch axis for a single-example batch.
        out = self.out(self.maxpool(encoder_out).squeeze(-1))
        output = {"logits": out}

        if label is not None:
            self.accuracy(out, label)
            output["loss"] = self.criterion(out, label)
        return output

    def get_metrics(self, reset=False):
        """Return the running accuracy; ``reset=True`` clears the accumulator."""
        return {"accuracy": self.accuracy.get_metric(reset)}

In [42]:
from allennlp.training.learning_rate_schedulers import learning_rate_scheduler
from torch.optim.lr_scheduler import MultiStepLR
from allennlp.training.checkpointer import Checkpointer

In [None]:
# TCN run 1: 3 levels of 300 channels, kernel size 5.
# TCNClassifier reads these globals when it is constructed, so they must be
# set before the model is built.
EMBED_DIM = 300
HIDDEN_DIM = 300
N_LAYERS = 3
KERNEL_SIZE = 5
WD = 0
# Fresh embeddings for this model (not shared with the earlier LSTM model).
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                            embedding_dim=EMBED_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

model = TCNClassifier(word_embeddings, vocab)

# Use the first GPU when one is available; -1 means "CPU" downstream.
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=WD)
iterator = BucketIterator(batch_size=64, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
# Create the checkpoint directory up front so a fresh run does not depend on
# it having been made by hand outside the notebook.
os.makedirs("./checkpoints/ckpt-2", exist_ok=True)
checkpointer = Checkpointer("./checkpoints/ckpt-2", 30)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=valid_dataset,
                  patience=5,
                  num_epochs=3,
                  grad_clipping=1.0,
                  cuda_device=cuda_device,
                  checkpointer=checkpointer)
trainer.train()

In [14]:
# TCN run 2: 5 levels of 100 channels, kernel size 3.
# TCNClassifier reads these globals when it is constructed, so they must be
# set before the model is built.
# NOTE: this leaves HIDDEN_DIM at 100 for every later cell.
EMBED_DIM = 300
HIDDEN_DIM = 100
N_LAYERS = 5
KERNEL_SIZE = 3
WD = 0
# Fresh embeddings for this model (not shared with earlier models).
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                            embedding_dim=EMBED_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

model = TCNClassifier(word_embeddings, vocab)

# Use the first GPU when one is available; -1 means "CPU" downstream.
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=WD)
iterator = BucketIterator(batch_size=64, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)
# Create the checkpoint directory up front so a fresh run does not depend on
# it having been made by hand outside the notebook.
os.makedirs("./checkpoints/ckpt-3", exist_ok=True)
checkpointer = Checkpointer("./checkpoints/ckpt-3", 30)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=valid_dataset,
                  patience=5,
                  num_epochs=3,
                  grad_clipping=1.0,
                  cuda_device=cuda_device,
                  checkpointer=checkpointer)
trainer.train()

accuracy: 0.4711, loss: 1.5759 ||: 100%|██████████| 3968/3968 [01:33<00:00, 42.31it/s]
accuracy: 0.6719, loss: 1.2086 ||: 100%|██████████| 11/11 [00:00<00:00, 158.74it/s]
accuracy: 0.7106, loss: 0.9358 ||: 100%|██████████| 3968/3968 [01:26<00:00, 45.79it/s]
accuracy: 0.7589, loss: 1.0635 ||: 100%|██████████| 11/11 [00:00<00:00, 210.23it/s]
accuracy: 0.7921, loss: 0.6982 ||: 100%|██████████| 3968/3968 [01:26<00:00, 45.71it/s]
accuracy: 0.7621, loss: 1.0758 ||: 100%|██████████| 11/11 [00:00<00:00, 171.38it/s]


{'best_epoch': 1,
 'peak_cpu_memory_MB': 5321.216,
 'peak_gpu_0_memory_MB': 2474,
 'training_duration': '0:04:28.852414',
 'training_start_epoch': 0,
 'training_epochs': 2,
 'epoch': 2,
 'training_accuracy': 0.7920514830116302,
 'training_loss': 0.698194510836695,
 'training_cpu_memory_MB': 5321.216,
 'training_gpu_0_memory_MB': 2474,
 'validation_accuracy': 0.7620528771384136,
 'validation_loss': 1.0757639489390634,
 'best_validation_accuracy': 0.7589424572317263,
 'best_validation_loss': 1.0634593584320762}

#### Load and evaluate
---

In [81]:
from allennlp.predictors import TextClassifierPredictor

In [82]:
# Rebuild the LSTM classifier from FRESH modules before loading ckpt-1.
# By this point `word_embeddings` is the embedder owned by the TCN model and
# `lstm` is the already-trained encoder object. Reusing either would make
# load_state_dict overwrite weights that another model still holds.
# The sizes below are those the ckpt-1 model was trained with (300/300);
# the HIDDEN_DIM global has since been reassigned and must not be used here.
LSTM_EMBED_DIM = 300
LSTM_HIDDEN_DIM = 300
token_embedding1 = Embedding(num_embeddings=vocab.get_vocab_size("tokens"),
                             embedding_dim=LSTM_EMBED_DIM)
word_embeddings1 = BasicTextFieldEmbedder({"tokens": token_embedding1})
lstm1 = PytorchSeq2SeqWrapper(torch.nn.LSTM(LSTM_EMBED_DIM, LSTM_HIDDEN_DIM, batch_first=True))
model1 = LstmTagger(word_embeddings1, lstm1, vocab)

with open("checkpoints/ckpt-1/best.th", "rb") as f:
    # Load onto the CPU first so a checkpoint written on a GPU machine can
    # also be restored where no GPU is present.
    model1.load_state_dict(torch.load(f, map_location="cpu"))

# Use the first GPU when one is available; -1 means "CPU" downstream.
if torch.cuda.is_available():
    cuda_device = 0
    model1 = model1.cuda(cuda_device)
else:
    cuda_device = -1
# This model is only used for evaluation from here on.
model1.eval()

In [83]:
# Wrap the restored LSTM model in AllenNLP's text-classifier predictor, reusing the dataset reader.
predictor1 = TextClassifierPredictor(model1, dataset_reader=reader)