In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data
import util

from args import get_train_args
from collections import OrderedDict
from json import dumps
from models import BiDAF
#from tensorboardX import SummaryWriter
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from ujson import load as json_load
from util import collate_fn, SQuAD



In [2]:
from argparse import Namespace

# Notebook stand-in for the command-line arguments: every path and
# hyper-parameter the later cells read lives in this one mapping, which is then
# exposed with attribute access as `args`.
# NOTE(review): `char_out_channels`/`char_kernel_size` are not passed to BiDAF
# in the model-building cell, and the values 5 / 100 look transposed relative to
# the printed `Conv1d(64, 100, kernel_size=(5,))` -- confirm before relying on them.
argDict = dict(
    # Data and embedding files
    train_record_file='./data/train.npz',
    dev_record_file='./data/dev.npz',
    test_record_file='./data/test.npz',
    word_emb_file='./data/word_emb.json',
    char_emb_file='./data/char_emb.json',
    train_eval_file='./data/train_eval.json',
    dev_eval_file='./data/dev_eval.json',
    test_eval_file='./data/test_eval.json',
    # Run identity and runtime settings
    name='devNonPCE',
    max_ans_len=15,
    num_workers=4,
    save_dir='./save/',
    batch_size=16,
    use_squad_v2=True,
    hidden_size=100,
    num_visuals=10,
    load_path=None,
    rnn_type='LSTM',
    char_embeddings=False,
    # Optimisation / evaluation schedule
    eval_steps=5000,
    lr=0.5,
    l2_wd=0,
    num_epochs=3,
    drop_prob=0.2,
    metric_name='F1',
    max_checkpoints=5,
    max_grad_norm=5.0,
    seed=224,
    ema_decay=0.999,
    char_out_channels=5,
    char_kernel_size=100,
    maximize_metric=True,
    # Placeholder only: the setup cell overwrites this with the detected devices.
    gpu_ids=[],
)
args = Namespace(**argDict)


In [3]:
    # Set up logging and devices
    # The run directory and the effective batch size are both derived from the
    # pristine values in `argDict` rather than from `args` itself, so re-running
    # this cell does not nest the save directory inside the previous one or
    # multiply the batch size a second time.
    args.save_dir = util.get_save_dir(argDict['save_dir'], args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    # Replaces the placeholder `gpu_ids` from the config cell with what is detected.
    device, args.gpu_ids = util.get_available_devices()
    # Logged before scaling, so the dump shows the configured batch size.
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    # Scale the batch size by the number of GPUs (factor 1 when none are listed).
    args.batch_size = argDict['batch_size'] * max(1, len(args.gpu_ids))


[06.12.21 14:24:39] Args: {
    "batch_size": 16,
    "char_emb_file": "./data/char_emb.json",
    "char_embeddings": false,
    "char_kernel_size": 100,
    "char_out_channels": 5,
    "dev_eval_file": "./data/dev_eval.json",
    "dev_record_file": "./data/dev.npz",
    "drop_prob": 0.2,
    "ema_decay": 0.999,
    "eval_steps": 5000,
    "gpu_ids": [
        0
    ],
    "hidden_size": 100,
    "l2_wd": 0,
    "load_path": null,
    "lr": 0.5,
    "max_ans_len": 15,
    "max_checkpoints": 5,
    "max_grad_norm": 5.0,
    "maximize_metric": true,
    "metric_name": "F1",
    "name": "devNonPCE",
    "num_epochs": 3,
    "num_visuals": 10,
    "num_workers": 4,
    "rnn_type": "LSTM",
    "save_dir": "./save/train\\devNonPCE-01",
    "seed": 224,
    "test_eval_file": "./data/test_eval.json",
    "test_record_file": "./data/test.npz",
    "train_eval_file": "./data/train_eval.json",
    "train_record_file": "./data/train.npz",
    "use_squad_v2": true,
    "word_emb_file": "./data/word

In [5]:
# Sanity check of the environment and the effective configuration.
# Only a cell's last expression is echoed by the notebook, so the CUDA check is
# printed explicitly; as a bare statement above `print(args)` its result was lost.
print(torch.cuda.is_available())
print(args)

Namespace(batch_size=16, char_emb_file='./data/char_emb.json', char_embeddings=False, char_kernel_size=100, char_out_channels=5, dev_eval_file='./data/dev_eval.json', dev_record_file='./data/dev.npz', drop_prob=0.2, ema_decay=0.999, eval_steps=5000, gpu_ids=[0], hidden_size=100, l2_wd=0, load_path=None, lr=0.5, max_ans_len=15, max_checkpoints=5, max_grad_norm=5.0, maximize_metric=True, metric_name='F1', name='devNonPCE', num_epochs=3, num_visuals=10, num_workers=4, rnn_type='LSTM', save_dir='./save/train\\devNonPCE-01', seed=224, test_eval_file='./data/test_eval.json', test_record_file='./data/test.npz', train_eval_file='./data/train_eval.json', train_record_file='./data/train.npz', use_squad_v2=True, word_emb_file='./data/word_emb.json')


In [6]:
    # Set random seed
    # Feed the same seed to each RNG in turn: Python's `random`, NumPy, torch,
    # and torch's per-device CUDA generators.
    log.info(f'Using random seed {args.seed}...')
    for seed_rng in (random.seed,
                     np.random.seed,
                     torch.manual_seed,
                     torch.cuda.manual_seed_all):
        seed_rng(args.seed)


[06.12.21 14:24:55] Using random seed 224...


In [7]:
    # Get embeddings
    # Word vectors are read first, then character vectors, each from the JSON
    # file named in `args`.
    log.info('Loading embeddings...')
    word_vectors, char_vectors = (
        util.torch_from_json(emb_path)
        for emb_path in (args.word_emb_file, args.char_emb_file)
    )


[06.12.21 14:25:04] Loading embeddings...


In [37]:
    # Get model
    # Construct BiDAF from the loaded embeddings, optionally restore a
    # checkpoint, move it to the selected device and wrap it for EMA tracking.
    log.info('Building model...')
    model = BiDAF(
        word_vectors=word_vectors,
        char_vectors=char_vectors,
        hidden_size=args.hidden_size,
        rnn_type=args.rnn_type,
        drop_prob=args.drop_prob,
    )

    # The model is used directly: the `nn.DataParallel(model, args.gpu_ids)`
    # wrapper and the CPU-only override are disabled in this notebook.
    if not args.load_path:
        # Fresh run: nothing to resume, start counting from zero.
        step = 0
    else:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)

    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)


[06.12.21 14:59:58] Building model...


In [38]:
# Inspect the assembled network and the device selected during setup.
print(model)
# Last expression of the cell, so the notebook echoes its repr as the output.
device

BiDAF(
  (emb): Embedding(
    (word_embed): Embedding(88714, 300)
    (proj): Linear(in_features=300, out_features=100, bias=False)
    (char_embed): Embedding(1376, 64)
    (char_cnn): Conv1d(64, 100, kernel_size=(5,), stride=(1,))
    (hwy): HighwayEncoder(
      (transforms): ModuleList(
        (0): Linear(in_features=200, out_features=200, bias=True)
        (1): Linear(in_features=200, out_features=200, bias=True)
      )
      (gates): ModuleList(
        (0): Linear(in_features=200, out_features=200, bias=True)
        (1): Linear(in_features=200, out_features=200, bias=True)
      )
    )
  )
  (enc): RNNEncoder(
    (rnn): LSTM(200, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (att): BiDAFAttention()
  (mod): RNNEncoder(
    (rnn): LSTM(800, 100, batch_first=True, bidirectional=True)
  )
  (att2): BiDAFAttention()
  (mod2): RNNEncoder(
    (rnn): LSTM(600, 100, batch_first=True, bidirectional=True)
  )
  (out): BiDAFOutput(
    (att_linear_1): 

device(type='cuda', index=0)

In [23]:
    # Get saver
    # Checkpoints go to the run directory; the saver is told which metric to
    # track and whether higher is better, and reports through the shared logger.
    saver = util.CheckpointSaver(
        args.save_dir,
        max_checkpoints=args.max_checkpoints,
        metric_name=args.metric_name,
        maximize_metric=args.maximize_metric,
        log=log,
    )


[06.12.21 14:54:32] Saver will maximize F1...


In [24]:
    # Get optimizer and scheduler
    # Adadelta over all model parameters, with L2 weight decay taken from `args`.
    optimizer = optim.Adadelta(
        model.parameters(),
        args.lr,
        weight_decay=args.l2_wd,
    )
    # The multiplier is always 1, i.e. the learning rate stays constant.
    scheduler = sched.LambdaLR(optimizer, lambda _step: 1.)


In [25]:
    # Get data loader
    # Build both datasets first, then wrap each in a loader that shares the
    # batch size, worker count and collate function.
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)

    # Training batches are reshuffled on every pass over the data.
    train_loader = data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
    # Dev batches keep dataset order.
    dev_loader = data.DataLoader(
        dev_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )



[06.12.21 14:54:35] Building dataset...


In [26]:
# Exploratory peek at the training DataLoader: list its attribute names, then
# show the batch sampler object it uses.
print(train_loader.__dir__())
print(train_loader.batch_sampler)

['dataset', 'num_workers', 'prefetch_factor', 'pin_memory', 'timeout', 'worker_init_fn', '_DataLoader__multiprocessing_context', '_dataset_kind', 'batch_size', 'drop_last', 'sampler', 'batch_sampler', 'generator', 'collate_fn', 'persistent_workers', '_DataLoader__initialized', '_IterableDataset_len_called', '_iterator', '__module__', '__annotations__', '__doc__', '__init__', '_get_iterator', 'multiprocessing_context', '__setattr__', '__iter__', '_auto_collation', '_index_sampler', '__len__', 'check_worker_number_rationality', '__orig_bases__', '__dict__', '__weakref__', '__parameters__', '__slots__', '_is_protocol', '__new__', '__class_getitem__', '__init_subclass__', '__repr__', '__hash__', '__str__', '__getattribute__', '__delattr__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__reduce_ex__', '__reduce__', '__subclasshook__', '__format__', '__sizeof__', '__dir__', '__class__']
<torch.utils.data.sampler.BatchSampler object at 0x0000021317A53D00>


In [39]:
    # get a training input
    # Pull one batch from a fresh iterator over the training loader, then move
    # the four index tensors to the compute device; y1, y2 and ids stay as loaded.
    cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = next(iter(train_loader))
    cw_idxs, cc_idxs, qw_idxs, qc_idxs = (
        idx_tensor.to(device)
        for idx_tensor in (cw_idxs, cc_idxs, qw_idxs, qc_idxs)
    )

In [40]:
# Report the shape of each index tensor in the sample batch; the device is
# shown once, for the context word indices.
print(f'cw_idxs {cw_idxs.shape} {cw_idxs.device}')
print(f'cc_idxs {cc_idxs.shape}')
print(f'qw_idxs {qw_idxs.shape}')
print(f'qc_idxs {qc_idxs.shape}')


cw_idxs torch.Size([16, 204]) cuda:0
cc_idxs torch.Size([16, 204, 16])
qw_idxs torch.Size([16, 22])
qc_idxs torch.Size([16, 22, 16])


In [43]:
    # Write the model graph to TensorBoard, passing the sample batch's four
    # index tensors (already moved to `device`) as the example model input.
    tbx.add_graph(model,[cw_idxs, cc_idxs, qw_idxs, qc_idxs])

In [None]:
    # Train
    # Mini-batch training for up to `args.num_epochs` epochs. Each batch is moved
    # to `device`, the summed start/end NLL is back-propagated with gradient-norm
    # clipping, and the EMA tracker is updated. Every `args.eval_steps` examples
    # the EMA weights are swapped in, the dev set is evaluated, a checkpoint is
    # saved and the results are logged to the console and TensorBoard.
    # NOTE(review): `evaluate` is not defined or imported in the visible part of
    # this notebook; it must be in scope before the first evaluation fires.
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` advances by the batch size (examples seen), so dividing by the
    # dataset length recovers the epoch to resume from.
    epoch = step // len(train_dataset)
    # `<` rather than `!=`: if a resumed checkpoint already puts `epoch` past
    # `num_epochs`, the loop ends immediately instead of never terminating.
    while epoch < args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs,cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                # Total loss is the start-position NLL plus the end-position NLL.
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step() # removed parm step // batch_size per scheduler 1.8 release notes
                # The EMA update is passed the number of batches processed so far.
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)

                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    # Evaluate and checkpoint between `ema.assign` and `ema.resume`.
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)

In [None]:
    # Record the run's hyper-parameters together with the most recent dev
    # metrics. `gpu_ids` is a list, so it is stored as its string form; metric
    # names are prefixed with 'met/'.
    hparms = dict(vars(args), gpu_ids=str(args.gpu_ids))
    metrics = {'met/' + k: v for k, v in results.items()}
    tbx.add_hparams(hparms, metrics)