In [1]:
from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

TODOs:

- Make compatible with GPU
- Try replicating SST results
- Write results to MongoDB Atlas
- Store weights in s3

In [2]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

In [3]:
# for papermill
testing = True
seed = 1
computational_batch_size = 4
batch_size = 16
lr = 5e-5
epochs = 3
embed_dim = 256
hidden_sz = 768
dataset = "sst-2"
n_classes = 2
max_seq_len = 512
bert_model = "bert-base-cased"
run_id = "replicate_0"

In [4]:
# TODO: Can we make this play better with papermill?
config = Config(
    testing=testing,
    seed=seed,
    computational_batch_size=computational_batch_size,
    batch_size=batch_size, # This is probably too large: need to handle effective v.s. machine batch size
    lr=lr,
    epochs=epochs,
    embed_dim=embed_dim,
    hidden_sz=hidden_sz,
    dataset=dataset,
    n_classes=n_classes,
    max_seq_len=max_seq_len, # necessary to limit memory usage
#     bert_model=None,
    bert_model=bert_model,
    run_id=run_id,
)

In [5]:
from allennlp.common.checks import ConfigurationError

In [6]:
if config.computational_batch_size * config.max_seq_len > 32 * 128:
    raise ConfigurationError(f"Batch size {config.computational_batch_size} too large for seq len {config.max_seq_len}")

In [7]:
if config.batch_size % config.computational_batch_size != 0:
    raise ConfigurationError(f"Computational batch size {config.computational_batch_size} "
                     + f"must be mutiple of batch size {config.batch_size}")

In [8]:
if config.batch_size < config.computational_batch_size:
    raise ConfigurationError(f"Computational batch size {config.computational_batch_size} "
                     + f"must be smaller than batch size {config.batch_size}")

In [9]:
import datetime
now = datetime.datetime.now()
RUN_ID = config.run_id if config.run_id is not None else now.strftime("%m_%d_%H:%M:%S")

In [10]:
USE_GPU = torch.cuda.is_available()

In [11]:
DATA_ROOT = Path("../data") / config.dataset

Set random seed manually to replicate results

In [12]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x113f1fe50>

# Load Data

In [13]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader, StanfordSentimentTreeBankDatasetReader

### Prepare dataset

In [14]:
import glob

In [15]:
reader_registry = {}

In [16]:
def register(name: str):
    def dec(x: Callable):
        reader_registry[name] = x
        return x
    return dec

In [17]:
@register("jigsaw")
class JigsawDatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None, # TODO: Handle mapping from BERT
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], label: str = None) -> Instance:
        # TODO: Reimplement
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}

        label_field = LabelField(label=label, skip_indexing=True)
        fields["label"] = label_field

        return Instance(fields)
    
    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing: df = df.head(10000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["comment_text"])],
                row["toxic"]
            )

In [18]:
@register("imdb")
class IMDBDatasetReader(DatasetReader):
    def __init__(self, tokenizer=None, 
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len=None) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len
    
    @overrides
    def text_to_instance(self, tokens: List[Token], label: str = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}
        
        # TODO: Add statistical features?

        label_field = LabelField(label=label)
        fields["label"] = label_field

        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        # TODO: Implement
        for label in ["pos", "neg"]:
            for file in (Path(file_path) / label).glob("*.txt"):
                text = file.open("rt", encoding="utf-8").read()
                yield self.text_to_instance([Token(word) for word in self.tokenizer(text)], 
                                            label)

In [19]:
@register("sst")
class SSTDatasetReader(StanfordSentimentTreeBankDatasetReader):
    def __init__(self, *args, tokenizer=None, **kwargs):
        super().__init__(*args, granularity=f"{config.n_classes}-class", **kwargs)
        self._tokenizer = tokenizer
        
    @overrides
    def text_to_instance(self, tokens: List[str], sentiment: str=None) -> Instance:
        """
        Forcibly re-tokenize the input to be wordpiece tokenized
        """
        tokens = self._tokenizer(" ".join(tokens))
        return super().text_to_instance(tokens, sentiment=sentiment)

In [36]:
@register("sst-2") # different from SST!!
class SST2DatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None, # TODO: Handle mapping from BERT
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], label: str = None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}

        label_field = LabelField(label=label, skip_indexing=True)
        fields["label"] = label_field

        return Instance(fields)
    
    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path, sep="\t")
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["sentence"])],
                row["label"] if "label" in row else None
            )

### Prepare token handlers

In [37]:
from allennlp.data.token_indexers import WordpieceIndexer, SingleIdTokenIndexer
from pytorch_pretrained_bert.tokenization import BertTokenizer

class BertIndexerCustom(WordpieceIndexer):
    """
    Virtually the same as PretrainedWordIndexer, except exposes more options.
    """
    def __init__(self, pretrained_model: str,
                 use_starting_offsets: bool = False,
                 do_lowercase: bool = True,
                 never_lowercase: List[str] = None,
                 max_pieces: int = 512,
                 start_tokens=["[CLS]"],
                 end_tokens=["[SEP]"]) -> None:
        assert not (pretrained_model.endswith("-cased") and do_lowercase)
        assert not (pretrained_model.endswith("-uncased") and not do_lowercase)
        bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                       do_lower_case=do_lowercase)
        super().__init__(vocab=bert_tokenizer.vocab,
                         wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                         namespace="bert",
                         use_starting_offsets=use_starting_offsets,
                         max_pieces=max_pieces,
                         do_lowercase=do_lowercase,
                         never_lowercase=never_lowercase,
                         start_tokens=start_tokens,
                         end_tokens=end_tokens)

In [38]:
if config.bert_model is not None:
    token_indexer = BertIndexerCustom(
        pretrained_model=config.bert_model,
        max_pieces=config.max_seq_len,
        do_lowercase="uncased" in config.bert_model,
     )
    # apparently we need to truncate the sequence here, which is a stupid design decision
    def tokenizer(s: str):
        return token_indexer.wordpiece_tokenizer(s)[:config.max_seq_len - 2]
else:
    token_indexer = SingleIdTokenIndexer(
        lowercase_tokens=False,  # don't lowercase by default
    )
    tokenizer = lambda x: x.split()

01/19/2019 08:51:22 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /Users/keitakurita/.pytorch_pretrained_bert/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1


In [39]:
reader_cls = reader_registry[config.dataset]
reader = reader_cls(tokenizer=tokenizer,
                    token_indexers={"tokens": token_indexer})

In [40]:
if config.dataset == "imdb":
    data_dir = DATA_ROOT / "imdb" / "aclImdb"
    train_ds, test_ds = (reader.read(data_dir / fname) for fname in ["train", "test"])
    val_ds = None
elif config.dataset == "sst":
    data_dir = DATA_ROOT / "trees"
    train_ds, val_ds, test_ds = (reader.read(data_dir / fname) for fname in ["train.txt", "dev.txt", "test.txt"])
elif config.dataset == "jigsaw":
    train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", "test_proced.csv"])
elif config.dataset == "sst-2":
    train_ds, val_ds = (reader.read(DATA_ROOT / fname) for fname in ["train.tsv", "dev.tsv"])
    test_ds = None


0it [00:00, ?it/s][A
6it [00:00, 59.96it/s][A
467it [00:00, 85.18it/s][A
67349it [00:13, 4985.29it/s]
872it [00:00, 4555.28it/s]


In [41]:
len(train_ds)

67349

### Prepare vocabulary

In [42]:
vocab = Vocabulary.from_instances(train_ds)
if config.bert_model is not None: 
    token_indexer._add_encoding_to_vocabulary(vocab)

01/19/2019 08:51:36 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.
100%|██████████| 67349/67349 [00:00<00:00, 252190.56it/s]


### Prepare iterator

In [43]:
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator

In [44]:
# TODO: Allow for customization
iterator = BucketIterator(batch_size=config.batch_size, 
                          biggest_batch_first=True,
                          sorting_keys=[("tokens", "num_tokens")],
                         )
iterator.index_with(vocab)

### Read sample

In [45]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    batch = next(iter(iterator(train_ds)))

In [46]:
batch

{'tokens': {'tokens': tensor([[  101,  1272,  1157, 22593,  6639,  2953, 12788,  1158,  3981,  1116,
             117,  1489, 28137,  4980,  1813, 28137, 11015, 11580,  3740, 23639,
            1605, 11266,  1320,   117,   127, 28137,  4980,  1813, 28137, 11015,
            3583,  2927,  1616,  4982,  1105,  1275, 28137,  4980,  1813, 28137,
           11015,  1119,  1179,  1616, 24438,  7903,  1116,   117,  7627,  1366,
            1104,  1103,  3796,  1104,  1103, 10228,   117,   192,  3708,  3540,
           11972,  1121,   170,  1677,  7138,  5015,   102,     0,     0,     0],
          [  101,   112,  1116,  1662,  1106,  5403,  2256,  7204,  1106,  8991,
             170,  2523,  1136,  1178,  1121, 22572, 26464, 10734,  4703,  2851,
             179,  9899,   176,  7777,  7836,  2328,  1348,  1133,  1145,  1121,
            8587,   184, 26996,  1197,  5222, 28117,  9995, 21718, 13141,  1320,
             117,  6693,  1394, 16358,  3101,  1399,  1105, 16358,  6071, 11151,
       

In [47]:
batch["tokens"]["tokens"]

tensor([[  101,  1272,  1157, 22593,  6639,  2953, 12788,  1158,  3981,  1116,
           117,  1489, 28137,  4980,  1813, 28137, 11015, 11580,  3740, 23639,
          1605, 11266,  1320,   117,   127, 28137,  4980,  1813, 28137, 11015,
          3583,  2927,  1616,  4982,  1105,  1275, 28137,  4980,  1813, 28137,
         11015,  1119,  1179,  1616, 24438,  7903,  1116,   117,  7627,  1366,
          1104,  1103,  3796,  1104,  1103, 10228,   117,   192,  3708,  3540,
         11972,  1121,   170,  1677,  7138,  5015,   102,     0,     0,     0],
        [  101,   112,  1116,  1662,  1106,  5403,  2256,  7204,  1106,  8991,
           170,  2523,  1136,  1178,  1121, 22572, 26464, 10734,  4703,  2851,
           179,  9899,   176,  7777,  7836,  2328,  1348,  1133,  1145,  1121,
          8587,   184, 26996,  1197,  5222, 28117,  9995, 21718, 13141,  1320,
           117,  6693,  1394, 16358,  3101,  1399,  1105, 16358,  6071, 11151,
           117,  1870, 25551,  8468,  7836,   185, 

In [48]:
batch["tokens"]["tokens"].shape

torch.Size([5, 70])

# Prepare Model

In [49]:
import torch
import torch.nn as nn
import torch.optim as optim

In [50]:
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.token_embedders.bert_token_embedder import BertEmbedder, PretrainedBertEmbedder
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.stacked_bidirectional_lstm import StackedBidirectionalLstm
from allennlp.nn.util import get_text_field_mask

In [51]:
class LstmEncoder(nn.Module):
    def __init__(self, lstm):
        super().__init__()
        self.lstm = lstm
        
    def forward(self, x, mask): # TODO: replace with allennlp built in modules
        _, (state, _) = self.lstm(x)
        state = torch.cat([state[0, :, :], state[1, :, :]], dim=1)
        return state

In [52]:
class BertPooler(nn.Module):
    """Source code copied"""
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(768, 768)
        self.activation = nn.Tanh()

    def forward(self, hidden_states, mask):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

In [53]:
class SentimentAnalysisModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: StackedBidirectionalLstm,
                 out_sz=config.n_classes):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
#         self.projection = nn.Linear(encoder.get_output_dim(), out_sz)
        self.projection = nn.Linear(config.hidden_sz, out_sz)
        self.accuracy = CategoricalAccuracy()
        
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens["tokens"])
        state = self.encoder(embeddings, mask)

        class_logits = self.projection(state)
        
        output = {"class_logits": class_logits}
        if label is not None:
            output["accuracy"] = self.accuracy(class_logits, label, None)
            output["loss"] = nn.CrossEntropyLoss()(class_logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

In [54]:
if config.bert_model is None:
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=config.embed_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # encoder = PytorchSeq2VecWrapper(nn.LSTM(config.embed_dim, config.hidden_sz, batch_first=True,
    #                                         bidirectional=True))
    encoder = LstmEncoder(
        nn.LSTM(config.embed_dim, config.hidden_sz, batch_first=True, bidirectional=True)
    )

else:
    word_embeddings = PretrainedBertEmbedder(
        pretrained_model=config.bert_model,
        top_layer_only=True, # conserve memory
    )
    encoder = BertPooler()

01/19/2019 08:51:43 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz from cache at /Users/keitakurita/.pytorch_pretrained_bert/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c
01/19/2019 08:51:43 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/keitakurita/.pytorch_pretrained_bert/a803ce83ca27fecf74c355673c434e51c265fb8a3e0e57ac62a80e38ba98d384.681017f415dfb33ec8d0e04fe51a619f3f01532ecea04edbfd48c5d160550d9c to temp dir /var/folders/hy/1czs1y5j2d58zgkqx6w_wnpw0000gn/T/tmpp8fwfrl1
01/19/2019 08:51:47 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 

In [55]:
model = SentimentAnalysisModel(
    word_embeddings, 
    encoder, 
    out_sz=config.n_classes,
)

In [56]:
if USE_GPU: model.cuda()
else: model

### Basic sanity checks

In [57]:
np.isnan(list(model.word_embeddings.parameters())[0].detach().numpy()).any()

False

In [58]:
[np.isnan(x.detach().numpy()).any() for x in list(model.encoder.parameters())]

[False, False]

In [59]:
[np.isinf(x.detach().numpy()).any() for x in list(model.encoder.parameters())]

[False, False]

In [60]:
tokens = batch["tokens"]
encoder(model.word_embeddings(tokens["tokens"]), get_text_field_mask(tokens))

tensor([[-0.2596,  0.4273, -0.2406,  ..., -0.4589, -0.2846, -0.0843],
        [-0.3921,  0.5188, -0.4173,  ..., -0.7065, -0.3623, -0.0750],
        [ 0.0432,  0.2216, -0.0860,  ..., -0.5040, -0.2837,  0.0297],
        [-0.2084,  0.3396, -0.2487,  ..., -0.6313, -0.2363, -0.0369],
        [-0.0938,  0.4056, -0.3456,  ..., -0.6304, -0.2349,  0.2175]],
       grad_fn=<TanhBackward>)

In [61]:
loss = model(**batch)["loss"]

In [62]:
nn.CrossEntropyLoss()(model(**batch)["class_logits"][:10, :], batch["label"][:10])

tensor(0.6632, grad_fn=<NllLossBackward>)

In [63]:
loss

tensor(0.6479, grad_fn=<NllLossBackward>)

In [64]:
loss.backward()

In [65]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[-4.8972e-03, -8.8047e-04,  2.4230e-04,  ...,  3.4703e-03,
          -1.1962e-03, -4.8932e-04],
         [-7.3226e-04, -1.2156e-04,  3.5136e-05,  ...,  5.1501e-04,
          -1.7994e-04, -7.3413e-05],
         [ 8.6812e-03,  1.7256e-03, -3.5987e-04,  ..., -6.2586e-03,
           2.1314e-03,  1.0785e-03],
         ...,
         [ 3.4397e-03,  6.3277e-04, -1.5470e-04,  ..., -2.4107e-03,
           8.4264e-04,  2.1589e-04],
         [ 9.8634e-04,  1.9345e-04, -5.7689e-05,  ..., -6.8664e-04,
           2.5014e-04,  8.4279e-05],
         [ 8.0626e-03,  1.5895e-03, -3.9801e-04,  ..., -5.7370e-03,
           1.9845e-03,  8.0873e-04]]),
 tensor([-7.6779e-03, -1.1523e-03,  1.3969e-02, -2.6948e-03,  1.2352e-02,
          5.0405e-03,  1.0513e-02, -6.8732e-03, -1.1279e-02, -7.0509e-03,
          2.4282e-03, -1.3056e-03, -5.9292e-03, -6.9618e-03,  9.3582e-03,
         -6.5148e-03, -8.8608e-04, -5.4255e-03,  4.4703e-04, -1.0966e-03,
          1.5481e-02, -7.8649e-04, -8.0571e-03, -8.9866e-0

# Train

In [66]:
from allennlp.training import trainer as _trainer
from allennlp.training.trainer import *
import math
logger = _trainer.logger

N_BATCHES_PER_UPDATE = config.batch_size // config.computational_batch_size

class CustomTrainer(Trainer):
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics. Copied from source
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        # Get tqdm for the training batches
        train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        num_training_batches = self.iterator.get_num_batches(self.train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self.optimizer.zero_grad()
            
            ###########
            # Custom  #
            ###########
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            train_loss += loss.item()
            # wait to update
            if (batches_this_epoch % N_BATCHES_PER_UPDATE) != 0: continue
            ###############
            # End Custom  #
            ###############
            
            loss.backward()
            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.model.named_parameters()}
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self.optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                if self._should_log_parameter_statistics:
                    self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                if self._should_log_learning_rate:
                    self._learning_rates_to_tensorboard(batch_num_total)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = self._get_batch_size(batch)
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size/batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch, batch_num_total)
                    self._tensorboard.add_train_scalar("mean_batch_size", average, batch_num_total)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )
        metrics = self._get_metrics(train_loss, batches_this_epoch, reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
        return metrics

In [67]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [68]:
training_options = {
    # TODO: Add appropriate learning rate scheduler
    "should_log_parameter_statistics": True,
    "should_log_learning_rate": True,
    "num_epochs": config.epochs,
}

In [69]:
SER_DIR = DATA_ROOT / "ckpts" / RUN_ID

trainer = CustomTrainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    validation_dataset=val_ds,
    serialization_dir=SER_DIR,
    cuda_device=0 if USE_GPU else -1,
    **training_options,
)



In [70]:
metrics = trainer.train()

01/19/2019 08:52:31 - INFO - allennlp.training.trainer -   Beginning training.
01/19/2019 08:52:31 - INFO - allennlp.training.trainer -   Epoch 0/2
01/19/2019 08:52:31 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 1479.880704
01/19/2019 08:52:31 - INFO - allennlp.training.trainer -   Training
accuracy: 0.7746, loss: 0.4878 ||: 100%|██████████| 4210/4210 [43:09<00:00,  1.43it/s]  
01/19/2019 09:35:40 - INFO - allennlp.training.trainer -   Validating
accuracy: 0.8222, loss: 0.3901 ||: 100%|██████████| 55/55 [00:52<00:00,  1.65s/it]
01/19/2019 09:36:33 - INFO - allennlp.training.trainer -                     Training |  Validation
01/19/2019 09:36:33 - INFO - allennlp.training.trainer -   cpu_memory_MB |  1479.881  |       N/A
01/19/2019 09:36:33 - INFO - allennlp.training.trainer -   loss          |     0.488  |     0.390
01/19/2019 09:36:33 - INFO - allennlp.training.trainer -   accuracy      |     0.775  |     0.822
01/19/2019 09:36:34 - INFO - allennlp.training.trai

{'peak_cpu_memory_MB': 1479.880704,
 'training_duration': '02:36:48',
 'training_start_epoch': 0,
 'training_epochs': 2,
 'epoch': 2,
 'training_accuracy': 0.8155726142927141,
 'training_loss': 0.40481197644033934,
 'training_cpu_memory_MB': 1479.880704,
 'validation_accuracy': 0.8130733944954128,
 'validation_loss': 0.39849679117852993,
 'best_epoch': 1,
 'best_validation_accuracy': 0.8222477064220184,
 'best_validation_loss': 0.38654813874851573}

In [72]:
metrics

{'peak_cpu_memory_MB': 1479.880704,
 'training_duration': '02:36:48',
 'training_start_epoch': 0,
 'training_epochs': 2,
 'epoch': 2,
 'training_accuracy': 0.8155726142927141,
 'training_loss': 0.40481197644033934,
 'training_cpu_memory_MB': 1479.880704,
 'validation_accuracy': 0.8130733944954128,
 'validation_loss': 0.39849679117852993,
 'best_epoch': 1,
 'best_validation_accuracy': 0.8222477064220184,
 'best_validation_loss': 0.38654813874851573}

# Record results and save weights

In [73]:
import sys
sys.path.append("../lib")

In [74]:
import record_experiments

Record summary

In [76]:
experiment_log = dict(config)
experiment_log.update(metrics)
record_experiments.record(experiment_log)

[INFO] 2019-01-19 12:39:36,084 - record_experiments Inserted results at id 5c4360d772bcea8281b90c83
01/19/2019 12:39:36 - INFO - record_experiments -   Inserted results at id 5c4360d772bcea8281b90c83


<pymongo.results.InsertOneResult at 0x138ee6b08>

Output tensorboard outputs and training logs to s3

(Remove weights since they take up too much space)

In [85]:
!rm {SER_DIR / "*.th"}

In [87]:
!ls {SER_DIR}

best.th              metrics_epoch_0.json metrics_epoch_2.json
[1m[36mlog[m[m                  metrics_epoch_1.json


In [93]:
!aws s3 sync {SER_DIR} s3://nnfornlp/ckpts/{RUN_ID}

upload: ../data/sst-2/ckpts/replicate_0/metrics_epoch_1.json to s3://nnfornlp/ckpts/RUN_ID/metrics_epoch_1.json
upload: ../data/sst-2/ckpts/replicate_0/log/validation/events.out.tfevents.1547905948.Keitas-MacBook-Pro.local to s3://nnfornlp/ckpts/RUN_ID/log/validation/events.out.tfevents.1547905948.Keitas-MacBook-Pro.local
upload: ../data/sst-2/ckpts/replicate_0/metrics_epoch_0.json to s3://nnfornlp/ckpts/RUN_ID/metrics_epoch_0.json
upload: ../data/sst-2/ckpts/replicate_0/metrics_epoch_2.json to s3://nnfornlp/ckpts/RUN_ID/metrics_epoch_2.json
upload: ../data/sst-2/ckpts/replicate_0/log/train/events.out.tfevents.1547905948.Keitas-MacBook-Pro.local to s3://nnfornlp/ckpts/RUN_ID/log/train/events.out.tfevents.1547905948.Keitas-MacBook-Pro.local
upload: ../data/sst-2/ckpts/replicate_0/best.th to s3://nnfornlp/ckpts/RUN_ID/best.th
