In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util

In [3]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=True,
    seed=1,
    batch_size=64,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)

In [4]:
from allennlp.common.checks import ConfigurationError

In [5]:
USE_GPU = torch.cuda.is_available()

In [6]:
DATA_ROOT = Path("../data") / "jigsaw"

Set random seed manually to replicate results

In [7]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x10f4fd670>

# Load Data

In [8]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader

### Prepare dataset

In [9]:
label_cols = ["toxic", "severe_toxic", "obscene",
              "threat", "insult", "identity_hate"]

In [10]:
from allennlp.data.fields import TextField, MetadataField, ArrayField

class JigsawDatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], id: str,
                         labels: np.ndarray) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}
        
        id_field = MetadataField(id)
        fields["id"] = id_field
        
        label_field = ArrayField(array=labels)
        fields["label"] = label_field

        return Instance(fields)
    
    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing: df = df.head(1000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["comment_text"])],
                row["id"], row[label_cols].values,
            )

### Prepare token handlers

We will use the spacy tokenizer here

In [11]:
from allennlp.data.token_indexers import PretrainedBertIndexer

token_indexer = PretrainedBertIndexer(
    pretrained_model="bert-base-uncased",
    max_pieces=config.max_seq_len,
    do_lowercase=True,
 )
# apparently we need to truncate the sequence here, which is a stupid design decision
def tokenizer(s: str):
    return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len - 2]

01/28/2019 22:27:45 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/keitakurita/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [12]:
reader = JigsawDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [13]:
train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"])
val_ds = None

1000it [00:03, 325.71it/s]
1000it [00:02, 439.67it/s]


In [14]:
len(train_ds)

1000

In [15]:
train_ds

[<allennlp.data.instance.Instance at 0x1a29170400>,
 <allennlp.data.instance.Instance at 0x1a29170ef0>,
 <allennlp.data.instance.Instance at 0x1a29177588>,
 <allennlp.data.instance.Instance at 0x1a2917dd68>,
 <allennlp.data.instance.Instance at 0x1a291825f8>,
 <allennlp.data.instance.Instance at 0x1a29182cf8>,
 <allennlp.data.instance.Instance at 0x1a29174048>,
 <allennlp.data.instance.Instance at 0x1a29174c50>,
 <allennlp.data.instance.Instance at 0x1a2918c390>,
 <allennlp.data.instance.Instance at 0x1a2918c978>,
 <allennlp.data.instance.Instance at 0x1a291980f0>,
 <allennlp.data.instance.Instance at 0x1a29198908>,
 <allennlp.data.instance.Instance at 0x1a291917f0>,
 <allennlp.data.instance.Instance at 0x1a291a00f0>,
 <allennlp.data.instance.Instance at 0x1a291a3390>,
 <allennlp.data.instance.Instance at 0x1a291a4550>,
 <allennlp.data.instance.Instance at 0x10c14f0f0>,
 <allennlp.data.instance.Instance at 0x10c14f358>,
 <allennlp.data.instance.Instance at 0x10c14fcc0>,
 <allennlp.data

In [16]:
vars(train_ds[0].fields["tokens"])

{'tokens': [[UNK],
  [UNK],
  the,
  edit,
  ##s,
  made,
  under,
  my,
  user,
  ##name,
  [UNK],
  [UNK],
  [UNK],
  were,
  reverted,
  ##?,
  [UNK],
  weren,
  ##',
  ##t,
  van,
  ##dal,
  ##isms,
  ##,,
  just,
  closure,
  on,
  some,
  [UNK],
  after,
  [UNK],
  voted,
  at,
  [UNK],
  [UNK],
  [UNK],
  [UNK],
  [UNK],
  please,
  don,
  ##',
  ##t,
  remove,
  the,
  template,
  from,
  the,
  talk,
  page,
  since,
  [UNK],
  retired,
  now,
  ##.,
  ##8,
  ##9,
  ##.,
  ##20,
  ##5,
  ##.,
  ##38,
  ##.,
  ##27],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.wordpiece_indexer.PretrainedBertIndexer at 0x1a28656470>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

### Prepare vocabulary

In [17]:
vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size)

01/28/2019 22:27:51 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 1000/1000 [00:00<00:00, 83610.17it/s]


### Prepare iterator

The iterator is responsible for batching the data and preparing it for input into the model. We'll use the BucketIterator that batches text sequences of smilar lengths together.

In [18]:
from allennlp.data.iterators import BucketIterator

In [19]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

We need to tell the iterator how to numericalize the text data. We do this by passing the vocabulary to the iterator. This step is easy to forget so be careful! 

In [20]:
iterator.index_with(vocab)

### Read sample

In [21]:
batch = next(iter(iterator(train_ds)))

In [22]:
batch

{'tokens': {'tokens': tensor([[  101,   100,  1011,  ...,     0,     0,     0],
          [  101,   100,   100,  ...,     0,     0,     0],
          [  101,  1000,   100,  ...,     0,     0,     0],
          ...,
          [  101,  1998,   100,  ...,   102,     0,     0],
          [  101,   100,  2013,  ..., 29625,  1011,   102],
          [  101,   100,   100,  ...,   102,     0,     0]]),
  'tokens-offsets': tensor([[ 1,  2,  3,  ...,  0,  0,  0],
          [ 1,  2,  3,  ...,  0,  0,  0],
          [ 1,  2,  3,  ...,  0,  0,  0],
          ...,
          [ 1,  2,  3,  ..., 41,  0,  0],
          [ 1,  2,  3,  ..., 41, 42, 43],
          [ 1,  2,  3,  ..., 41,  0,  0]]),
  'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          ...,
          [1, 1, 1,  ..., 1, 0, 0],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 0, 0]])},
 'id': ['012db3deb39d94ca',
  '0073059e6433db47',
  '02ae218e901a58f0',
  

In [23]:
batch["tokens"]["tokens"]

tensor([[  101,   100,  1011,  ...,     0,     0,     0],
        [  101,   100,   100,  ...,     0,     0,     0],
        [  101,  1000,   100,  ...,     0,     0,     0],
        ...,
        [  101,  1998,   100,  ...,   102,     0,     0],
        [  101,   100,  2013,  ..., 29625,  1011,   102],
        [  101,   100,   100,  ...,   102,     0,     0]])

In [24]:
batch["tokens"]["tokens"].shape

torch.Size([64, 45])

# Prepare Model

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim

In [26]:
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder

class BaselineModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = nn.BCEWithLogitsLoss()
        
    def forward(self, tokens: Dict[str, torch.Tensor],
                id: Any, label: torch.Tensor) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        
        output = {"class_logits": class_logits}
        output["loss"] = self.loss(class_logits, label)

        return output

### Prepare embeddings

In [27]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder

bert_embedder = PretrainedBertEmbedder(
        pretrained_model="bert-base-uncased",
        top_layer_only=True, # conserve memory
)
word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                            # we'll be ignoring masks so we'll need to set this to True
                                                           allow_unmatched_keys = True)

01/28/2019 22:27:52 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/keitakurita/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
01/28/2019 22:27:52 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/keitakurita/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/hy/1czs1y5j2d58zgkqx6w_wnpw0000gn/T/tmp4seeko5v
01/28/2019 22:27:56 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads"

In [28]:
BERT_DIM = word_embeddings.get_output_dim()

class BertSentencePooler(Seq2VecEncoder):
    def forward(self, embs: torch.tensor, 
                mask: torch.tensor=None) -> torch.tensor:
        # extract first token tensor
        return embs[:, 0]
    
    @overrides
    def get_output_dim(self) -> int:
        return BERT_DIM
    
encoder = BertSentencePooler(vocab)

Notice how simple and modular the code for initializing the model is. All the complexity is delegated to each component.

In [29]:
model = BaselineModel(
    word_embeddings, 
    encoder, 
)

In [30]:
if USE_GPU: model.cuda()
else: model

# Basic sanity checks

In [31]:
batch = nn_util.move_to_device(batch, 0 if USE_GPU else -1)

In [32]:
tokens = batch["tokens"]
labels = batch

In [33]:
tokens

{'tokens': tensor([[  101,   100,  1011,  ...,     0,     0,     0],
         [  101,   100,   100,  ...,     0,     0,     0],
         [  101,  1000,   100,  ...,     0,     0,     0],
         ...,
         [  101,  1998,   100,  ...,   102,     0,     0],
         [  101,   100,  2013,  ..., 29625,  1011,   102],
         [  101,   100,   100,  ...,   102,     0,     0]]),
 'tokens-offsets': tensor([[ 1,  2,  3,  ...,  0,  0,  0],
         [ 1,  2,  3,  ...,  0,  0,  0],
         [ 1,  2,  3,  ...,  0,  0,  0],
         ...,
         [ 1,  2,  3,  ..., 41,  0,  0],
         [ 1,  2,  3,  ..., 41, 42, 43],
         [ 1,  2,  3,  ..., 41,  0,  0]]),
 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 0, 0]])}

In [34]:
mask = get_text_field_mask(tokens)
mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 0, 0]])

In [35]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)
class_logits

tensor([[ 4.6029e-01,  1.0634e-01,  4.2201e-01,  2.4401e-03,  3.0394e-01,
          4.6073e-02],
        [ 5.8157e-01,  2.3274e-01,  1.7322e-01, -1.8333e-01,  1.6538e-01,
         -8.5568e-02],
        [ 3.6025e-01,  1.9680e-01,  1.3414e-01, -2.9591e-01, -5.9149e-02,
          8.8200e-03],
        [ 5.9896e-01,  6.4951e-02,  3.8953e-01, -1.4752e-02,  1.8939e-02,
         -6.5000e-02],
        [ 3.8613e-01,  1.7148e-01,  4.1873e-02, -1.5916e-01,  3.0099e-02,
          3.7296e-02],
        [ 5.6207e-01,  9.9249e-02, -1.8001e-02, -1.9459e-01, -5.0729e-02,
         -2.9151e-03],
        [ 4.1298e-01,  1.2324e-01,  3.6444e-02, -3.0327e-01, -2.3738e-01,
         -3.7295e-01],
        [ 6.1968e-01,  3.2144e-01,  1.1170e-01, -1.8397e-01,  1.5092e-02,
          3.6151e-03],
        [ 3.9946e-01,  3.1188e-01,  1.2117e-01, -4.2895e-02, -2.6264e-02,
         -6.9874e-02],
        [ 4.3297e-01,  4.6959e-02,  8.8077e-02, -1.2706e-01,  5.6333e-02,
          9.3291e-04],
        [ 6.5024e-01,  2.8164e

In [36]:
model(**batch)

{'class_logits': tensor([[ 4.4547e-01,  3.1156e-02,  2.4213e-01, -1.5971e-01, -5.9123e-02,
          -2.1293e-01],
         [ 5.4509e-01,  2.0774e-01,  1.5047e-01, -1.2310e-01,  2.0172e-01,
          -1.3649e-01],
         [ 4.0834e-01,  2.5043e-01,  1.9857e-02, -2.2050e-01, -2.7475e-04,
          -9.1278e-02],
         [ 5.8635e-01,  2.3285e-01,  2.6801e-01, -9.2788e-02,  6.1501e-02,
          -2.2691e-01],
         [ 3.4393e-01,  2.9836e-02,  9.7074e-02, -3.1715e-01,  6.2563e-02,
           1.4879e-01],
         [ 3.9581e-01,  1.6111e-01,  1.0803e-01, -2.8581e-01,  4.4001e-02,
           2.6459e-02],
         [ 4.9049e-01, -4.6142e-03,  1.8441e-01, -2.1090e-01, -9.4795e-02,
          -2.0507e-01],
         [ 5.1780e-01,  1.9256e-01,  1.8620e-01, -2.5154e-01,  1.0383e-01,
          -1.4371e-01],
         [ 4.1739e-01,  3.1606e-01,  5.6454e-02, -1.6621e-01,  1.0353e-01,
          -1.4039e-01],
         [ 3.3645e-01,  1.2266e-01, -1.0355e-01, -1.1724e-01, -6.0761e-02,
           3.5463e

In [37]:
loss = model(**batch)["loss"]

In [38]:
loss

tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [39]:
loss.backward()

In [40]:
[x.grad for x in list(model.encoder.parameters())]

[]

# Train

In [41]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [42]:
from allennlp.training.trainer import Trainer

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,
    num_epochs=config.epochs,
)

In [43]:
metrics = trainer.train()

01/28/2019 22:28:19 - INFO - allennlp.training.trainer -   Beginning training.
01/28/2019 22:28:19 - INFO - allennlp.training.trainer -   Epoch 0/1
01/28/2019 22:28:19 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 1533.68576
01/28/2019 22:28:19 - INFO - allennlp.training.trainer -   Training
loss: 0.6063 ||: 100%|██████████| 16/16 [02:07<00:00,  6.36s/it]
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -                     Training |  Validation
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -   loss          |     0.606  |       N/A
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -   cpu_memory_MB |  1533.686  |       N/A
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -   Epoch duration: 00:02:07
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:02:07
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -   Epoch 1/1
01/28/2019 22:30:26 - INFO - allennlp.training.trainer -   Peak CPU memo