In [0]:
!pip install tensorboardX



In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

--2019-12-11 12:58:10--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 54.224.175.112, 34.206.134.194, 3.229.196.117, ...
Connecting to bin.equinox.io (bin.equinox.io)|54.224.175.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13773305 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip.1’


2019-12-11 12:58:11 (18.9 MB/s) - ‘ngrok-stable-linux-amd64.zip.1’ saved [13773305/13773305]

Archive:  ngrok-stable-linux-amd64.zip
replace ngrok? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ngrok                   


In [0]:
import os
import random
import argparse
import logging
import numpy as np
from tensorboardX import SummaryWriter

import data
import model
import utils

import torch
import torch.optim as optim
import torch.nn.functional as F

In [0]:
LOG_DIR = 'runs'
os.makedirs(LOG_DIR, exist_ok=True)
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)
get_ipython().system_raw('./ngrok http 6006 &')

! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

https://e55158fe.ngrok.io


In [0]:
# Logger used by the training cell for progress messages.
log = logging.getLogger("train")

# Root directory for checkpoints; the training cell creates one sub-directory
# per run name underneath it.
SAVES_DIR = "saves"

# Optimisation settings.
MAX_EPOCHS = 100       # number of passes over the training data
BATCH_SIZE = 32        # batch size handed to data.iterate_batches
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimiser

# Probability that a training sample is decoded with teacher forcing
# (net.decode_teacher); otherwise net.decode_chain_argmax is used.
TEACHER_PROB = 0.5

In [0]:
def run_test(test_data, net, end_token, device='cpu'):
  """Calculate the mean BLEU score of `net` over a held-out test dataset.

  For every `(p1, p2)` pair in `test_data`, `p1` is packed with
  `model.pack_input` and encoded, a reply is decoded with
  `net.decode_chain_argmax` (at most `data.MAX_TOKENS` steps, stopping at
  `end_token`) and scored against `p2[1:]` with `utils.calc_bleu`.

  Args:
    test_data: iterable of `(p1, p2)` pairs of encoded phrases.
    net: model exposing `emb`, `encode` and `decode_chain_argmax`.
    end_token: token ID at which decoding stops.
    device: device passed to `model.pack_input`.

  Returns:
    The mean BLEU score over all pairs, or 0.0 when `test_data` is empty.
  """
  bleu_sum = 0.0
  bleu_count = 0
  # Evaluation only: no backward pass follows, so skip autograd bookkeeping.
  with torch.no_grad():
    for p1, p2 in test_data:
      input_seq = model.pack_input(p1, net.emb, device)
      enc = net.encode(input_seq)
      # The first element of the packed input seeds the decoder.
      _, tokens = net.decode_chain_argmax(enc, input_seq.data[0:1], seq_len=data.MAX_TOKENS, stop_at_token=end_token)
      # p2[1:] drops the leading token of the reference
      # (presumably the begin-of-sequence marker - confirm against data).
      bleu_sum += utils.calc_bleu(tokens, p2[1:])
      bleu_count += 1
  if bleu_count == 0:
    # Nothing to evaluate: report 0.0 rather than raising ZeroDivisionError.
    return 0.0
  return bleu_sum / bleu_count

In [0]:
if __name__ == "__main__":
  logging.basicConfig(format="%(asctime)-15s %(levelname)s %(message)s", level=logging.INFO)

  name = "inigo"  # run name: checkpoint sub-directory and SummaryWriter comment suffix
  genre = "comedy"  # specifies the film genre we want to train on
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  saves_path = os.path.join(SAVES_DIR, name)
  os.makedirs(saves_path, exist_ok=True)

  # Load the dataset, save the embeddings dictionary (a mapping from the token's
  # string to the integer ID of the token) and encode the phrase pairs.
  phrase_pairs, emb_dict = data.load_data(genre_filter=genre)
  log.info("Obtained %d phrase pairs with %d uniq words", len(phrase_pairs), len(emb_dict))
  data.save_emb_dict(saves_path, emb_dict)
  end_token = emb_dict[data.END_TOKEN]
  train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict)

  # Split the data into train/test parts. The shuffle uses a fixed random seed
  # so the same shuffle can be repeated at the RL training stage.
  rand = np.random.RandomState(data.SHUFFLE_SEED)
  rand.shuffle(train_data)
  log.info("Training data converted, got %d samples", len(train_data))
  train_data, test_data = data.split_train_test(train_data)
  log.info("Train set has %d phrases, test %d", len(train_data), len(test_data))

  # Create the model, the TensorBoard writer and the optimiser.
  net = model.PhraseModel(emb_size=model.EMBEDDING_DIM, dict_size=len(emb_dict), hid_size=model.HIDDEN_STATE_SIZE).to(device)
  log.info("Model: %s", net)
  writer = SummaryWriter(comment="-" + name)
  optimiser = optim.Adam(net.parameters(), lr=LEARNING_RATE)
  best_bleu = None  # best test BLEU seen so far; None until the first epoch finishes

  # try/finally: close the writer even if training raises or is interrupted.
  try:
    # Each epoch is one iteration over the batches of encoded phrase pairs.
    for epoch in range(MAX_EPOCHS):
      losses = []
      bleu_sum = 0.0
      bleu_count = 0
      for batch in data.iterate_batches(train_data, BATCH_SIZE):
        optimiser.zero_grad()
        # Pack the batch: packed input seq, list of packed output seqs,
        # input indices (unused here) and output indices.
        input_seq, out_seq_list, _, out_idx = model.pack_batch(batch, net.emb, device)
        enc = net.encode(input_seq)

        # Decode every sequence of the batch individually, collecting the
        # decoder outputs and the matching reference token IDs.
        net_results = []
        net_targets = []
        for idx, out_seq in enumerate(out_seq_list):
          ref_indices = out_idx[idx][1:]  # reference tokens without the first one
          enc_item = net.get_encoded_item(enc, idx)
          if random.random() < TEACHER_PROB:
            # Teacher forcing: the decoder is driven by the reference sequence.
            r = net.decode_teacher(enc_item, out_seq)
            bleu_sum += model.seq_bleu(r, ref_indices)
          else:
            # Curriculum learning: the decoder is chained on its own argmax
            # output, seeded with the first element of the output sequence.
            r, seq = net.decode_chain_argmax(enc_item, out_seq.data[0:1], len(ref_indices))
            bleu_sum += utils.calc_bleu(seq, ref_indices)
          net_results.append(r)
          net_targets.extend(ref_indices)
          bleu_count += 1

        # One cross-entropy loss over all decoder outputs of the batch.
        results_v = torch.cat(net_results)
        targets_v = torch.LongTensor(net_targets).to(device)
        loss_v = F.cross_entropy(results_v, targets_v)
        loss_v.backward()
        optimiser.step()
        losses.append(loss_v.item())

      bleu = bleu_sum / bleu_count
      bleu_test = run_test(test_data, net, end_token, device)
      mean_loss = np.mean(losses)
      log.info("Epoch %d: mena loss %.3f, mean BLEU %.3f, test BLEU %.3f", epoch, mean_loss, bleu, bleu_test)
      # add_scalar takes (tag, scalar_value, global_step): value first, epoch second.
      writer.add_scalar("loss", mean_loss, epoch)
      writer.add_scalar("bleu", bleu, epoch)
      writer.add_scalar("bleu_test", bleu_test, epoch)

      # Track the best test BLEU. The first epoch only initialises best_bleu;
      # a checkpoint is written on every later improvement.
      if best_bleu is None or best_bleu < bleu_test:
        if best_bleu is not None:
          out_name = os.path.join(saves_path, "pre_bleu_%.3f_%02d.dat" % (bleu_test, epoch))
          torch.save(net.state_dict(), out_name)
          log.info("Best BLEU updated %.3f", bleu_test)
        best_bleu = bleu_test

      # Periodic checkpoint every 10th epoch, regardless of the score.
      if epoch % 10 == 0:
        out_name = os.path.join(saves_path, "epoch_%03d_%.3f_%.3f_.dat" % (epoch, bleu, bleu_test))
        torch.save(net.state_dict(), out_name)
  finally:
    writer.close()


2019-12-11 13:04:32,317 INFO Loaded 159 movies with genre comedy
2019-12-11 13:04:32,318 INFO Read and tokenise phrases...
2019-12-11 13:04:36,722 INFO Loaded 93039 phrases
2019-12-11 13:04:37,087 INFO Loaded 24716 dialogues with 93039 phrases, generating training pairs
2019-12-11 13:04:37,114 INFO Counting freq of words...
2019-12-11 13:04:37,454 INFO Data has 31774 uniq words, 4913 of them occur more than 10
2019-12-11 13:04:37,639 INFO Obtained 47644 phrase pairs with 4905 uniq words
2019-12-11 13:04:38,029 INFO Training data converted, got 26491 samples
2019-12-11 13:04:38,032 INFO Train set has 25166 phrases, test 1325
2019-12-11 13:04:41,014 INFO Model: PhraseModel(
  (emb): Embedding(4905, 50)
  (encoder): LSTM(50, 512, batch_first=True)
  (decoder): LSTM(50, 512, batch_first=True)
  (output): Sequential(
    (0): Linear(in_features=512, out_features=4905, bias=True)
  )
)
2019-12-11 13:06:47,880 INFO Epoch 0: mena loss 4.998, mean BLEU 0.156, test BLEU 0.055
2019-12-11 13:08:56