In [1]:
from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

In [2]:
import time
from contextlib import contextmanager

class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')
    
import functools
import traceback
def get_ref_free_exc_info():
    "Free traceback from references to locals/globals to avoid circular reference leading to gc.collect() unable to reclaim memory"
    type, val, tb = sys.exc_info()
    traceback.clear_frames(tb)
    return (type, val, tb)

def gpu_mem_restore(func):
    "Reclaim GPU RAM if CUDA out of memory happened, or execution was interrupted"
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except:
            type, val, tb = get_ref_free_exc_info() # must!
            raise type(val).with_traceback(tb) from None
    return wrapper

In [3]:
# for papermill
testing = True
seed = 1
computational_batch_size = 256
batch_size = 256
lr = 0.001
epochs = 15
hidden_sz = 64
dataset = "jigsaw"
n_classes = 6
max_seq_len = 512
download_data = False
ft_model_path = "../data/jigsaw/ft_model.bin"
max_vocab_size = 300000
dropoute = 0.5
run_id = None

In [4]:
import subprocess
if download_data:
    for fname in ["train.csv", "test_proced.csv"]:
        subprocess.run(["aws", "s3", "cp", f"s3://nnfornlp/data/jigsaw/{fname}"], 
                       shell=True, check=True)

In [5]:
# TODO: Can we make this play better with papermill?
config = Config(
    testing=testing,
    seed=seed,
    computational_batch_size=computational_batch_size,
    batch_size=batch_size,
    lr=lr,
    epochs=epochs,
    hidden_sz=hidden_sz,
    dataset=dataset,
    n_classes=n_classes,
    max_seq_len=max_seq_len, # necessary to limit memory usage
    ft_model_path=ft_model_path,
    max_vocab_size=max_vocab_size,
    dropoute=dropoute,
    run_id=run_id,
)

In [6]:
from allennlp.common.checks import ConfigurationError

In [7]:
import datetime
now = datetime.datetime.now()
RUN_ID = config.run_id if config.run_id is not None else now.strftime("%m_%d_%H:%M:%S")

In [8]:
USE_GPU = torch.cuda.is_available()

In [9]:
DATA_ROOT = Path("../data") / config.dataset

Set random seed manually to replicate results

In [10]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x11861cf30>

# Load Data

In [11]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader, StanfordSentimentTreeBankDatasetReader

### Prepare dataset

In [12]:
reader_registry = {}

In [13]:
def register(name: str):
    def dec(x: Callable):
        reader_registry[name] = x
        return x
    return dec

In [14]:
label_cols = ["toxic", "severe_toxic", "obscene",
              "threat", "insult", "identity_hate"]

from enum import IntEnum
ColIdx = IntEnum('ColIdx', [(x.upper(), i) for i, x in enumerate(label_cols)])

In [15]:
@register("jigsaw")
class JigsawDatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None, # TODO: Handle mapping from BERT
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token],
                         toxic: int, severe_toxic: int, obscene: int,
                         threat: int, insult: int, identity_hate: int) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}

        toxic_field = LabelField(label=toxic, skip_indexing=True)
        fields["toxic"] = toxic_field
        
        severe_toxic_field = LabelField(label=severe_toxic, skip_indexing=True)
        fields["severe_toxic"] = severe_toxic_field
        
        obscene_field = LabelField(label=obscene, skip_indexing=True)
        fields["obscene"] = obscene_field
        
        threat_field = LabelField(label=threat, skip_indexing=True)
        fields["threat"] = threat_field
        
        insult_field = LabelField(label=insult, skip_indexing=True)
        fields["insult"] = insult_field
        
        identity_hate_field = LabelField(label=identity_hate, skip_indexing=True)
        fields["identity_hate"] = identity_hate_field

        return Instance(fields)
    
    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing: df = df.head(10000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["comment_text"])],
                row["toxic"], row["severe_toxic"], row["obscene"], 
                row["threat"], row["insult"], row["identity_hate"],
            )

### Prepare token handlers

In [16]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers import WordpieceIndexer, SingleIdTokenIndexer

In [17]:
token_indexer = SingleIdTokenIndexer(
    lowercase_tokens=False,  # don't lowercase by default
)
def tokenizer(x):
    return [w.text for w in
            SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

In [18]:
reader = JigsawDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [19]:
train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"])
val_ds = None

10000it [00:24, 412.34it/s]
10000it [00:31, 316.64it/s]


In [20]:
len(train_ds)

10000

### Prepare vocabulary

In [21]:
vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size)

01/26/2019 18:45:41 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 10000/10000 [00:00<00:00, 17207.79it/s]


### Prepare iterator

In [22]:
from allennlp.data.iterators import BucketIterator, DataIterator

In [23]:
# TODO: Allow for customization
iterator = BucketIterator(batch_size=config.batch_size, 
                          biggest_batch_first=True,
                          sorting_keys=[("tokens", "num_tokens")],
                         )
iterator.index_with(vocab)

### Read sample

In [24]:
batch = next(iter(iterator(train_ds)))

In [25]:
batch

{'tokens': {'tokens': tensor([[    5,     7,    26,  ...,     0,     0,     0],
          [    5, 23538,  3225,  ...,     0,     0,     0],
          [    5,    24, 11937,  ...,     0,     0,     0],
          ...,
          [  167,    69,     3,  ...,     0,     0,     0],
          [    7,    82,   139,  ...,     0,     0,     0],
          [   32,   178,   221,  ...,     7,   124,     2]])},
 'toxic': tensor([0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1]),
 'severe_toxic': tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1]),
 'obscene': tensor([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1]),
 'threat': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
 'insult': tensor([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1]),
 'identity_hate': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [26]:
batch["tokens"]["tokens"]

tensor([[    5,     7,    26,  ...,     0,     0,     0],
        [    5, 23538,  3225,  ...,     0,     0,     0],
        [    5,    24, 11937,  ...,     0,     0,     0],
        ...,
        [  167,    69,     3,  ...,     0,     0,     0],
        [    7,    82,   139,  ...,     0,     0,     0],
        [   32,   178,   221,  ...,     7,   124,     2]])

In [27]:
batch["tokens"]["tokens"].shape

torch.Size([16, 2086])

# Prepare Model

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim

In [29]:
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.token_embedders.bert_token_embedder import BertEmbedder, PretrainedBertEmbedder
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.stacked_bidirectional_lstm import StackedBidirectionalLstm
from allennlp.nn.util import get_text_field_mask

In [30]:
class Attention(nn.Module):
    def __init__(self, inp_sz, dim=1, eps=1e-9):
        super().__init__()
        self.inp_sz, self.dim, self.eps = inp_sz, dim, eps
        self.l1 = nn.Linear(inp_sz, inp_sz)
        nn.init.xavier_uniform_(self.l1.weight.data)
        nn.init.zeros_(self.l1.bias.data)
        
        vw = torch.zeros(inp_sz, 1)
        nn.init.xavier_uniform_(vw)        
        self.vw = nn.Parameter(vw)
        
    def forward(self, x, mask=None):
        e = torch.tanh(self.l1(x))
        e = torch.einsum("bij,jk->bi", [e, self.vw])            
        a = torch.exp(e)
        
        if mask is not None: a = a.masked_fill(mask, 0)

        a = a / (torch.sum(a, dim=self.dim, keepdim=True) + self.eps)

        weighted_input = x * a.unsqueeze(-1)
        return torch.sum(weighted_input, dim=1), a

In [31]:
def init_gru_weights(gru: nn.GRU):
    """Applies orthogonal and xavier uniform initialization according to best practices"""
    for nm, param in gru.named_parameters():
        if "weight_hh" in nm:
            nn.init.orthogonal_(param.data)
        elif "weight_ih" in nm:
            nn.init.xavier_uniform_(param.data)

In [32]:
class BiGRUAttentionEncoder(Seq2VecEncoder):
    def __init__(self, embed_sz: int, hidden_sz: int, num_layers=2):
        super().__init__()
        self.embed_sz = embed_sz
        self.hidden_sz = hidden_sz
        self.gru = nn.GRU(self.embed_sz, self.hidden_sz,
                          num_layers=num_layers, bidirectional=True)
        init_gru_weights(self.gru)
        self.attention = Attention(self.hidden_sz * 2, dim=1)
        
    @overrides
    def get_input_dim(self) -> int:
        return self.embed_sz
    
    @overrides
    def get_output_dim(self) -> int:
        return self.hidden_sz * 2
    
    @overrides
    def forward(self, x: torch.tensor, 
                mask: Optional[torch.tensor]=None) -> torch.tensor:
        x, _ = self.gru(x, None)
        x, _ = self.attention(x, mask=mask)
        return x

In [33]:
from allennlp.training.metrics import CategoricalAccuracy, BooleanAccuracy, Metric

def prod(x: Iterable):
    acc = 1
    for v in x: acc *= v
    return acc

class MultilabelAccuracy(Metric):
    def __init__(self, thres=0.5):
        self.thres = 0.5
        self.correct_count = 0
        self.total_count = 0
    
    def __call__(self, logits: torch.FloatTensor, 
                 t: torch.LongTensor) -> float:
        logits = logits.detach().cpu().numpy()
        t = t.detach().cpu().numpy()
        cc = ((logits >= self.thres) == t).sum()
        tc = prod(logits.shape)
        self.correct_count += cc
        self.total_count += tc
        return cc / tc
    
    def get_metric(self, reset: bool=False):
        acc = self.correct_count / self.total_count
        if reset:
            self.reset()
        return acc
    
    @overrides
    def reset(self):
        self.correct_count = 0
        self.total_count = 0
    
class MultilabelCrossEntropyLoss(nn.Module):
    def forward(self, lgt, tgt: torch.LongTensor):
        neg_abs = -lgt.abs()
        loss = lgt.clamp(min=0) - lgt * tgt.float() + (1 + neg_abs.exp()).log()
        return loss.mean()

In [34]:
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder

class BaselineModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=config.n_classes,
                 multilabel: bool=True):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.multilabel = multilabel
        # TODO: Handle multiclass case
        if self.multilabel:
            self.accuracy = MultilabelAccuracy()
            self.per_label_acc = {c: MultilabelAccuracy() for c in label_cols}
            self.loss = MultilabelCrossEntropyLoss()
        else:
            self.loss = nn.CrossEntropyLoss()
            self.accuracy = CategoricalAccuracy()
        
    def forward(self, tokens: Dict[str, torch.Tensor],
                **labels: torch.Tensor) -> torch.Tensor:
        mask = get_text_field_mask(tokens) == 0
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        
        output = {"class_logits": class_logits}
        if len(labels) > 0:
            # This is grossly inefficient...
            label = torch.cat([labels[c].unsqueeze(-1) for c in label_cols], dim=1)
            output["accuracy"] = self.accuracy(class_logits, label)
            for i, c in enumerate(label_cols):
                output[f"{c}_acc"] = self.per_label_acc[c](class_logits[:, i], 
                                                          labels[c])
            output["loss"] = self.loss(class_logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

### Prepare embeddings

In [35]:
import fastText

def get_fasttext_embeddings(model_path: str, vocab: Vocabulary):
    vocab_size = min(vocab.get_vocab_size(), config.max_vocab_size)
    ft_model = fastText.load_model(config.ft_model_path)
    embedding_dim = ft_model.get_dimension()

    # register parameters
    config.set("vocab_size", vocab_size)
    config.set("embedding_dim", embedding_dim)
    
    embeddings = np.zeros((vocab_size + 5, embedding_dim))
    for idx, token in vocab.get_index_to_token_vocabulary().items():
        embeddings[idx, :] = ft_model.get_word_vector(token)
    
    return embeddings

In [36]:
with timer("Loading embeddings"):
    embedding_weights = get_fasttext_embeddings(config.ft_model_path, vocab)

[Loading embeddings] done in 1 s


In [37]:
class CustomEmbedding(nn.Module):
    def __init__(self, num_embeddings, embedding_dim,
                 padding_index=None, max_norm=None,
                 weight=None, dropout=0., scale=None):
        super().__init__()
        self.dropout = dropout
        self.scale = scale
        self.padding_idx = padding_index
        self.embed = Embedding(num_embeddings, embedding_dim,
                               padding_index=padding_index, max_norm=max_norm,
                               weight=weight)
    
    def forward(self, words):
        weight = self.embed.weight
        if self.dropout > 0.0 and self.training:
            mask = self.embed.weight.data.new().resize_((weight.size(0), 1)).bernoulli_(1 - self.dropout).expand_as(weight) / (1 - self.dropout)
            masked_embed_weight = mask * weight
        else:
            masked_embed_weight = weight
        if self.scale:
            masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

        padding_idx = self.padding_idx
        if padding_idx is None:
            padding_idx = -1

        X = torch.nn.functional.embedding(words, masked_embed_weight,
            padding_idx, self.embed.max_norm, self.embed.norm_type,
            self.embed.scale_grad_by_freq, self.embed.sparse
          )
        return X

In [38]:
token_embedding = CustomEmbedding(num_embeddings=config.vocab_size + 5,
                                  embedding_dim=config.embedding_dim,
                                  weight=torch.tensor(embedding_weights, dtype=torch.float),
                                  dropout=config.dropoute, padding_index=0)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
encoder = BiGRUAttentionEncoder(
    config.embedding_dim, 
    config.hidden_sz,
)

In [39]:
model = BaselineModel(
    word_embeddings, 
    encoder, 
    out_sz=config.n_classes,
)

Initialize bias according to prior

In [40]:
train_labels = pd.read_csv(DATA_ROOT / "train.csv")[label_cols].values
if config.testing: 
    train_labels = train_labels[:10000, :]
    
class_bias = torch.zeros(len(label_cols))
for i, _ in enumerate(label_cols):
    p = train_labels[:, i].mean()
    class_bias[i] = np.log(p / (1-p))

In [41]:
model.projection.bias.data = class_bias

In [42]:
if USE_GPU: model.cuda()
else: model

### Basic sanity checks

In [43]:
np.isnan(list(model.word_embeddings.parameters())[0].detach().numpy()).any()

False

In [44]:
[np.isnan(x.detach().numpy()).any() for x in list(model.encoder.parameters())]

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [45]:
[np.isinf(x.detach().numpy()).any() for x in list(model.encoder.parameters())]

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [46]:
tokens = batch["tokens"]
labels = batch

mask = get_text_field_mask(tokens) == 0
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)

In [47]:
tokens

{'tokens': tensor([[    5,     7,    26,  ...,     0,     0,     0],
         [    5, 23538,  3225,  ...,     0,     0,     0],
         [    5,    24, 11937,  ...,     0,     0,     0],
         ...,
         [  167,    69,     3,  ...,     0,     0,     0],
         [    7,    82,   139,  ...,     0,     0,     0],
         [   32,   178,   221,  ...,     7,   124,     2]])}

In [48]:
mask

tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.uint8)

In [49]:
mask

tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.uint8)

In [50]:
model(**batch)

{'class_logits': tensor([[-2.2214, -4.5855, -2.9599, -5.7243, -2.9847, -4.7656],
         [-2.2176, -4.5764, -2.9719, -5.7270, -2.9942, -4.7660],
         [-2.2172, -4.5728, -2.9734, -5.7256, -3.0006, -4.7664],
         [-2.2138, -4.5756, -2.9653, -5.7184, -3.0054, -4.7701],
         [-2.2163, -4.5725, -2.9607, -5.7154, -3.0111, -4.7737],
         [-2.2163, -4.5768, -2.9561, -5.7147, -3.0129, -4.7753],
         [-2.2187, -4.5790, -2.9527, -5.7206, -3.0119, -4.7732],
         [-2.2131, -4.5899, -2.9474, -5.7227, -3.0128, -4.7672],
         [-2.2074, -4.5879, -2.9397, -5.7344, -3.0128, -4.7548],
         [-2.2235, -4.5761, -2.9421, -5.7403, -3.0104, -4.7591],
         [-2.2301, -4.5640, -2.9435, -5.7523, -3.0081, -4.7588],
         [-2.2085, -4.5381, -2.9409, -5.7687, -3.0019, -4.7552],
         [-2.1954, -4.4909, -2.9367, -5.7923, -3.0307, -4.7659],
         [-2.2185, -4.4934, -2.9344, -5.7322, -3.0476, -4.8321],
         [-2.2306, -4.5141, -2.9241, -5.7001, -3.0660, -4.8367],
         

In [51]:
loss = model(**batch)["loss"]

In [52]:
loss

tensor(0.9506, grad_fn=<MeanBackward1>)

In [53]:
loss.backward()

In [54]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[-4.4918e-06,  1.1074e-05,  1.8466e-06,  ...,  6.4138e-06,
          -7.3518e-07, -2.1079e-06],
         [-2.3124e-06,  1.0786e-05,  2.0789e-06,  ...,  4.7372e-06,
          -2.6070e-06,  7.6327e-07],
         [-3.1135e-07, -1.4808e-06, -1.2328e-06,  ...,  3.4717e-07,
           8.6560e-07, -1.0413e-06],
         ...,
         [ 2.8734e-05, -8.5067e-05, -4.6237e-05,  ..., -4.6114e-05,
           3.6131e-05, -8.2640e-05],
         [-6.1587e-05,  1.9873e-04,  1.6565e-04,  ...,  1.2352e-05,
          -2.7602e-05,  1.0193e-04],
         [-5.3004e-04,  1.8178e-03,  2.4155e-04,  ...,  8.0014e-04,
          -1.3717e-04,  2.3603e-04]]),
 tensor([[ 5.2614e-06, -2.8081e-06, -4.9264e-06,  ...,  2.5151e-06,
           2.2152e-06,  2.3523e-06],
         [ 3.0922e-06, -1.1978e-06, -1.0779e-05,  ...,  9.8048e-06,
           7.8398e-06, -2.0084e-07],
         [-9.5489e-07, -1.6294e-06,  2.9049e-06,  ..., -1.9869e-06,
          -8.8423e-07,  5.1661e-07],
         ...,
         [-5.1531e-05,  2

# Train

In [55]:
from allennlp.training import trainer as _trainer
from allennlp.training.trainer import *
import math
logger = _trainer.logger

N_BATCHES_PER_UPDATE = config.batch_size // config.computational_batch_size

class CustomTrainer(Trainer):
    @gpu_mem_restore
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics. Copied from source
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        # Get tqdm for the training batches
        train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        num_training_batches = self.iterator.get_num_batches(self.train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self.optimizer.zero_grad()
            
            ###########
            # Custom  #
            ###########
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            train_loss += loss.item()
            # wait to update
            if (batches_this_epoch % N_BATCHES_PER_UPDATE) != 0: continue
            ###############
            # End Custom  #
            ###############
            
            loss.backward()
            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.model.named_parameters()}
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self.optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                if self._should_log_parameter_statistics:
                    self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                if self._should_log_learning_rate:
                    self._learning_rates_to_tensorboard(batch_num_total)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = self._get_batch_size(batch)
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size/batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch, batch_num_total)
                    self._tensorboard.add_train_scalar("mean_batch_size", average, batch_num_total)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )
        metrics = self._get_metrics(train_loss, batches_this_epoch, reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
        return metrics

In [56]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [57]:
training_options = {
    # TODO: Add appropriate learning rate scheduler
    "should_log_parameter_statistics": True,
    "should_log_learning_rate": True,
    "num_epochs": config.epochs,
}

In [58]:
SER_DIR = DATA_ROOT / "ckpts" / RUN_ID

trainer = CustomTrainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    validation_dataset=val_ds,
    serialization_dir=SER_DIR,
    cuda_device=0 if USE_GPU else -1,
    **training_options,
)

In [59]:
metrics = trainer.train()

01/26/2019 18:47:03 - INFO - allennlp.training.trainer -   Beginning training.
01/26/2019 18:47:03 - INFO - allennlp.training.trainer -   Epoch 0/14
01/26/2019 18:47:03 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 2383.70816
01/26/2019 18:47:03 - INFO - allennlp.training.trainer -   Training
accuracy: 0.9624, loss: 0.1591 ||: 100%|██████████| 40/40 [01:24<00:00,  1.75s/it]
01/26/2019 18:48:28 - INFO - allennlp.training.trainer -                     Training |  Validation
01/26/2019 18:48:28 - INFO - allennlp.training.trainer -   cpu_memory_MB |  2383.708  |       N/A
01/26/2019 18:48:28 - INFO - allennlp.training.trainer -   accuracy      |     0.962  |       N/A
01/26/2019 18:48:28 - INFO - allennlp.training.trainer -   loss          |     0.159  |       N/A
01/26/2019 18:48:28 - INFO - allennlp.training.trainer -   Best validation performance so far. Copying weights to '../data/jigsaw/ckpts/01_26_18:43:28/best.th'.
01/26/2019 18:48:28 - INFO - allennlp.training.tr

01/26/2019 18:57:26 - INFO - allennlp.training.trainer -   Best validation performance so far. Copying weights to '../data/jigsaw/ckpts/01_26_18:43:28/best.th'.
01/26/2019 18:57:26 - INFO - allennlp.training.trainer -   Epoch duration: 00:01:16
01/26/2019 18:57:26 - INFO - allennlp.training.trainer -   Estimated training time remaining: 0:09:04
01/26/2019 18:57:26 - INFO - allennlp.training.trainer -   Epoch 8/14
01/26/2019 18:57:26 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 5579.460608
01/26/2019 18:57:26 - INFO - allennlp.training.trainer -   Training
accuracy: 0.9738, loss: 0.0869 ||: 100%|██████████| 40/40 [01:13<00:00,  3.65s/it]
01/26/2019 18:58:40 - INFO - allennlp.training.trainer -                     Training |  Validation
01/26/2019 18:58:40 - INFO - allennlp.training.trainer -   cpu_memory_MB |  5579.461  |       N/A
01/26/2019 18:58:40 - INFO - allennlp.training.trainer -   accuracy      |     0.974  |       N/A
01/26/2019 18:58:40 - INFO - allennlp.t

In [61]:
metrics

{'peak_cpu_memory_MB': 5741.789184,
 'training_duration': '00:18:46',
 'training_start_epoch': 0,
 'training_epochs': 14,
 'epoch': 14,
 'training_accuracy': 0.9795,
 'training_loss': 0.06179733141325414,
 'training_cpu_memory_MB': 5741.789184,
 'best_epoch': 14}

# Evaluate

In [62]:
from scipy.special import expit

In [63]:
class Predictor:
    def __init__(self, model: Model, iterator: DataIterator) -> None:
        self.model = model
        self.iterator = iterator
        
    def _extract_preds(self, out_dict: dict) -> np.ndarray:
        return out_dict["class_logits"].detach().cpu().numpy()
        
    def _postprocess(self, predictions: List[np.ndarray]) -> np.ndarray:
        return expit(np.concatenate(predictions, axis=0))
        
    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = Tqdm.tqdm(pred_generator,
                                        total=self.iterator.get_num_batches(ds))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                out_dict = self.model(**batch)
                preds.append(self._extract_preds(out_dict))
        return self._postprocess(preds)

In [64]:
from allennlp.data.iterators import BasicIterator
seq_iterator = BasicIterator(batch_size=64)

In [66]:
predictor = Predictor(model, seq_iterator)
train_preds =predictor.predict(train_ds) 


  0%|          | 0/157 [00:00<?, ?it/s][A
  1%|          | 1/157 [00:00<01:27,  1.79it/s][A
  1%|▏         | 2/157 [00:01<01:45,  1.47it/s][A
  2%|▏         | 3/157 [00:02<01:44,  1.47it/s][A
  3%|▎         | 4/157 [00:02<01:44,  1.47it/s][A
  3%|▎         | 5/157 [00:03<01:27,  1.74it/s][A
  4%|▍         | 6/157 [00:03<01:36,  1.57it/s][A
  4%|▍         | 7/157 [00:04<01:21,  1.83it/s][A
  5%|▌         | 8/157 [00:04<01:23,  1.79it/s][A
  6%|▌         | 9/157 [00:05<01:23,  1.76it/s][A
  6%|▋         | 10/157 [00:06<01:38,  1.49it/s][A
  7%|▋         | 11/157 [00:07<01:39,  1.47it/s][A
  8%|▊         | 12/157 [00:07<01:30,  1.60it/s][A
  8%|▊         | 13/157 [00:08<01:33,  1.55it/s][A
  9%|▉         | 14/157 [00:09<01:39,  1.44it/s][A
 10%|▉         | 15/157 [00:09<01:32,  1.53it/s][A
 10%|█         | 16/157 [00:10<01:18,  1.80it/s][A
 11%|█         | 17/157 [00:10<01:22,  1.70it/s][A
 11%|█▏        | 18/157 [00:10<01:09,  1.99it/s][A
 12%|█▏        | 19/157 [00:1

 99%|█████████▉| 156/157 [01:27<00:00,  1.80it/s][A
100%|██████████| 157/157 [01:27<00:00,  1.79it/s][A

In [74]:
test_preds = predictor.predict(test_ds)



  0%|          | 0/157 [00:00<?, ?it/s][A[A

ConfigurationError: 'You must call .index(vocabulary) on a field before determining padding lengths.'

In [68]:
test_labels = pd.read_csv(DATA_ROOT / "test_proced.csv")[label_cols].values
if config.testing:
    test_labels = test_labels[:10000, :]

In [75]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix

Per label

In [78]:
from collections import defaultdict
def to_metric_dict(t: np.ndarray, y: np.ndarray, thres=0.5):
    tn, fp, fn, tp = confusion_matrix(t, y >= thres).ravel()
    return {"auc": roc_auc_score(t, y),
            "f1": f1_score(t, y >= thres),
            "acc": accuracy_score(t, y >= thres),
            "tnr": tn / len(t), "fpr": fp / len(t),
            "fnr": fn / len(t), "tpr": tp / len(t),
            "precision": tp / (tp + fp), "recall": tp / (tp + fn),
          }

train_label_metrics = {}
label_metrics = {}
for i, lbl in enumerate(label_cols):
    train_label_metrics[lbl] = to_metric_dict(train_labels[:, i], train_preds[:, i])
    label_metrics[lbl] = to_metric_dict(test_labels[:, i], test_preds[:, i])
    print(f"========{lbl}=========")
    print(label_metrics[lbl])

{'auc': 0.9911234642830135, 'f1': 0.7863554757630161, 'acc': 0.9643, 'tnr': 0.8986, 'fpr': 0.0043, 'fnr': 0.0314, 'tpr': 0.0657, 'precision': 0.9385714285714286, 'recall': 0.6766220391349125}
{'auc': 0.9881406162638692, 'f1': 0.0196078431372549, 'acc': 0.99, 'tnr': 0.9899, 'fpr': 0.0, 'fnr': 0.01, 'tpr': 0.0001, 'precision': 1.0, 'recall': 0.009900990099009901}
{'auc': 0.9951877612413269, 'f1': 0.746820809248555, 'acc': 0.9781, 'tnr': 0.9458, 'fpr': 0.0015, 'fnr': 0.0204, 'tpr': 0.0323, 'precision': 0.9556213017751479, 'recall': 0.6129032258064516}
{'auc': 0.9444865024277084, 'f1': 0.0, 'acc': 0.9967, 'tnr': 0.9967, 'fpr': 0.0, 'fnr': 0.0033, 'tpr': 0.0, 'precision': nan, 'recall': 0.0}
{'auc': 0.9903444745317468, 'f1': 0.6601941747572816, 'acc': 0.972, 'tnr': 0.9448, 'fpr': 0.0058, 'fnr': 0.0222, 'tpr': 0.0272, 'precision': 0.8242424242424242, 'recall': 0.5506072874493927}
{'auc': 0.9777920244338155, 'f1': 0.0, 'acc': 0.9916, 'tnr': 0.9916, 'fpr': 0.0, 'fnr': 0.0084, 'tpr': 0.0, 'prec

  'precision', 'predicted', average, warn_for)
  if __name__ == '__main__':
  'precision', 'predicted', average, warn_for)
  if __name__ == '__main__':
  'precision', 'predicted', average, warn_for)
  if __name__ == '__main__':
  'precision', 'predicted', average, warn_for)
  if __name__ == '__main__':


In [79]:
train_label_metrics

{'toxic': {'auc': 0.9911234642830135,
  'f1': 0.7863554757630161,
  'acc': 0.9643,
  'tnr': 0.8986,
  'fpr': 0.0043,
  'fnr': 0.0314,
  'tpr': 0.0657,
  'precision': 0.9385714285714286,
  'recall': 0.6766220391349125},
 'severe_toxic': {'auc': 0.9881406162638692,
  'f1': 0.0196078431372549,
  'acc': 0.99,
  'tnr': 0.9899,
  'fpr': 0.0,
  'fnr': 0.01,
  'tpr': 0.0001,
  'precision': 1.0,
  'recall': 0.009900990099009901},
 'obscene': {'auc': 0.9951877612413269,
  'f1': 0.746820809248555,
  'acc': 0.9781,
  'tnr': 0.9458,
  'fpr': 0.0015,
  'fnr': 0.0204,
  'tpr': 0.0323,
  'precision': 0.9556213017751479,
  'recall': 0.6129032258064516},
 'threat': {'auc': 0.9444865024277084,
  'f1': 0.0,
  'acc': 0.9967,
  'tnr': 0.9967,
  'fpr': 0.0,
  'fnr': 0.0033,
  'tpr': 0.0,
  'precision': nan,
  'recall': 0.0},
 'insult': {'auc': 0.9903444745317468,
  'f1': 0.6601941747572816,
  'acc': 0.972,
  'tnr': 0.9448,
  'fpr': 0.0058,
  'fnr': 0.0222,
  'tpr': 0.0272,
  'precision': 0.8242424242424242,


Global

In [80]:
label_metrics["global"] = {}
for mtrc in label_metrics["toxic"].keys():
    label_metrics["global"][mtrc] = np.mean([label_metrics[col][mtrc] for col in label_cols])

In [81]:
label_metrics["global"]

{'auc': 0.9811791405302467,
 'f1': 0.368829717151018,
 'acc': 0.9821166666666666,
 'tnr': 0.9612333333333334,
 'fpr': 0.0019333333333333331,
 'fnr': 0.015950000000000002,
 'tpr': 0.020883333333333334,
 'precision': nan,
 'recall': 0.30833892374829447}

# Record results and save weights

In [82]:
import sys
sys.path.append("../lib")

In [83]:
import record_experiments

Record summary

In [None]:
if not config.testing:
    experiment_log = dict(config)
    experiment_log.update(metrics)
    experiment_log.update(label_metrics)
    record_experiments.record(experiment_log)

Output tensorboard outputs and training logs to s3

(Remove weights since they take up too much space)

In [None]:
!rm {SER_DIR / "*.th"}

In [None]:
!ls {SER_DIR}

In [None]:
!aws s3 sync {SER_DIR} s3://nnfornlp/ckpts/{RUN_ID}