In [1]:
"""
Train a model on TACRED.
"""

import os
from datetime import datetime
import time
import numpy as np
import random
import argparse
from shutil import copyfile
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from data.loader import DataLoader
from model.rnn import RelationModel
from utils import scorer, constant, helper
from utils.vocab import Vocab

In [2]:
# Print the version string of the PyTorch build imported by this kernel.
print(torch.__version__)

0.3.0b0+591e73e


In [3]:
import argparse  # already imported in the first cell
# argparse reads sys.argv when parse_args() is called without arguments.
# Replacing sys.argv with a single empty entry means no command-line options
# are seen; `del sys` then removes the temporary `sys` name from the notebook
# namespace again.
import sys; sys.argv=['']; del sys  # this has to be done if argparse is used in the notebook
from datetime import datetime  # already imported in the first cell

In [4]:
def _str2bool(value):
    """Convert a command-line token to a bool.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) becomes True. This converter
    accepts the usual spellings; the empty string stays False, as it was
    under ``type=bool``.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}.'.format(value))


# Experiment configuration. In the notebook the defaults below ARE the
# configuration: edit them (in particular --id) before running.
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str, default='dataset/tacred')
parser.add_argument('--vocab_dir', type=str, default='dataset/vocab')
parser.add_argument('--emb_dim', type=int, default=300, help='Word embedding dimension.')
parser.add_argument('--ner_dim', type=int, default=30, help='NER embedding dimension.')
parser.add_argument('--pos_dim', type=int, default=30, help='POS embedding dimension.')
parser.add_argument('--hidden_dim', type=int, default=360, help='RNN hidden state size.')               # 200
parser.add_argument('--num_layers', type=int, default=2, help='Num of RNN layers.')
parser.add_argument('--dropout', type=float, default=0.1, help='Input and RNN dropout rate.')           # 0.5
parser.add_argument('--word_dropout', type=float, default=0.04,
                    help='The rate at which randomly set a word to UNK.'
                   )
# argparse does not run `type` on non-string defaults, so the default must
# already be an int (1e10 on its own is a float).
parser.add_argument('--topn', type=int, default=int(1e10), help='Only finetune top N embeddings.')
parser.add_argument('--lower', dest='lower', action='store_true', help='Lowercase all words.')
parser.add_argument('--no-lower', dest='lower', action='store_false')
parser.set_defaults(lower=False)

# Self-attention is on by default; --no-self-attn is the only way to turn it
# off from the command line (a store_true flag alone could never do that).
parser.add_argument(
    '--self-attn', dest='self_att', action='store_true',
    help='Use self-attention layer instead of LSTM.'
)
parser.add_argument('--no-self-attn', dest='self_att', action='store_false')
parser.set_defaults(self_att=True)

parser.add_argument('--attn', dest='attn', action='store_true', help='Use attention layer.')
parser.add_argument('--no-attn', dest='attn', action='store_false')
parser.set_defaults(attn=True)

parser.add_argument('--attn_dim', type=int, default=360, help='Attention size.')                      # 200
parser.add_argument('--pe_dim', type=int, default=30, help='Position encoding dimension.')

parser.add_argument('--lr', type=float, default=1.0, help='Applies to SGD and Adagrad.')
parser.add_argument('--lr_decay', type=float, default=0.8)
parser.add_argument('--optim', type=str, default='sgd', help='sgd, adagrad, adam or adamax.')  # sgd
parser.add_argument('--num_epoch', type=int, default=100)                                       # 30
parser.add_argument('--batch_size', type=int, default=50)
parser.add_argument('--max_grad_norm', type=float, default=5.0, help='Gradient clipping.')
parser.add_argument('--log_step', type=int, default=20, help='Print log every k steps.')
parser.add_argument('--log', type=str, default='logs.txt', help='Write training log to file.')
parser.add_argument('--save_epoch', type=int, default=5, help='Save model checkpoints every k epochs.')
parser.add_argument('--save_dir', type=str, default='./saved_models', help='Root dir for saving models.')

parser.add_argument(
    '--id', type=str,
    default='07_self_attention',                                 # change model folder output before running
    help='Model ID under which to save models.'
   )


parser.add_argument('--info', type=str, default='', help='Optional info for the experiment.')

parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--cuda', type=_str2bool, default=torch.cuda.is_available())
parser.add_argument('--cpu', action='store_true', help='Ignore CUDA.')

# Parse an explicit empty argument list: the notebook always runs with the
# defaults above, and this keeps the cell independent of whatever sys.argv
# currently holds.
args = parser.parse_args(args=[])

In [5]:
# Seed every RNG this notebook touches (PyTorch, NumPy, Python's `random`)
# from the single --seed option.
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)  # was hard-coded to 1234, which ignored --seed
# --cpu takes precedence over --cuda; the CUDA RNG is only seeded when the
# GPU is going to be used.
if args.cpu:
    args.cuda = False
elif args.cuda:
    torch.cuda.manual_seed(args.seed)

# Options dict handed to the data loaders, the model and the saved config.
# vars() returns the namespace's own dict, so keys added here also appear on `args`.
opt = vars(args)
opt['num_class'] = len(constant.LABEL_TO_ID)

# Vocabulary and embedding matrix, both read from vocab_dir.
vocab_file = opt['vocab_dir'] + '/vocab.pkl'
emb_file = opt['vocab_dir'] + '/embedding.npy'
vocab = Vocab(vocab_file, load=True)
opt['vocab_size'] = vocab.size
emb_matrix = np.load(emb_file)
# The matrix needs one row per vocabulary entry and emb_dim columns.
assert emb_matrix.shape[0] == vocab.size
assert emb_matrix.shape[1] == opt['emb_dim']

# Train and dev batches.
print("Loading data from {} with batch size {}...".format(opt['data_dir'], opt['batch_size']))
train_batch = DataLoader(opt['data_dir'] + '/train.json', opt['batch_size'], opt, vocab, evaluation=False)
dev_batch = DataLoader(opt['data_dir'] + '/dev.json', opt['batch_size'], opt, vocab, evaluation=True)

# Output directory for this run; ids shorter than two characters get a leading '0'.
model_id = opt['id']
if len(model_id) <= 1:
    model_id = '0' + model_id
model_save_dir = opt['save_dir'] + '/' + model_id
opt['model_save_dir'] = model_save_dir
helper.ensure_dir(model_save_dir, verbose=True)

# Persist the configuration and vocabulary next to the checkpoints, and open the per-epoch log.
helper.save_config(opt, model_save_dir + '/config.json', verbose=True)
vocab.save(model_save_dir + '/vocab.pkl')
file_logger = helper.FileLogger(model_save_dir + '/' + opt['log'], header="# epoch\ttrain_loss\tdev_loss\tdev_f1")

# Echo the configuration.
helper.print_config(opt)

# Build the model from the options and the embedding matrix.
model = RelationModel(opt, emb_matrix=emb_matrix)

# Inverse of LABEL_TO_ID: class id -> label string.
id2label = {idx: label for label, idx in constant.LABEL_TO_ID.items()}

# Bookkeeping consumed by the training loop.
dev_f1_history = []
current_lr = opt['lr']
global_step = 0
max_steps = opt['num_epoch'] * len(train_batch)
format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'
global_start_time = time.time()

Vocab size 55950 loaded from file
Loading data from dataset/tacred with batch size 50...
1501 batches created for dataset/tacred/train.json
516 batches created for dataset/tacred/dev.json
Config saved to file ./saved_models/06_self_attention/config.json
Overwriting old vocab file at ./saved_models/06_self_attention/vocab.pkl

Running with the following configs:
	data_dir : dataset/tacred
	vocab_dir : dataset/vocab
	emb_dim : 300
	ner_dim : 30
	pos_dim : 30
	hidden_dim : 360
	num_layers : 2
	dropout : 0.1
	word_dropout : 0.04
	topn : 10000000000.0
	lower : False
	self_att : True
	attn : True
	attn_dim : 360
	pe_dim : 30
	lr : 1.0
	lr_decay : 0.8
	optim : sgd
	num_epoch : 100
	batch_size : 50
	max_grad_norm : 5.0
	log_step : 20
	log : logs.txt
	save_epoch : 5
	save_dir : ./saved_models
	id : 06_self_attention
	info : 
	seed : 1234
	cuda : True
	cpu : False
	num_class : 42
	vocab_size : 55950
	model_save_dir : ./saved_models/06_self_attention


using self-attention
Finetune all embeddings

In [6]:
# start training
# Each epoch: one update per training batch, a full pass over the dev set,
# a checkpoint, and a learning-rate decay when dev F1 did not improve.
epochs_done = 0
for epoch in range(1, opt['num_epoch']+1):
    train_loss = 0
    for batch in train_batch:
        start_time = time.time()
        global_step += 1
        loss = model.update(batch)
        train_loss += loss
        # Progress line every 400 steps (opt['log_step'] is not consulted here).
        if global_step % 400 == 0:
            duration = time.time() - start_time  # time spent on this one batch
            print(
                format_str.format(datetime.now(), global_step, max_steps, epoch,
                opt['num_epoch'], loss, duration, current_lr)
            )

    # eval on dev
    print("Evaluating on dev set...")
    predictions = []
    dev_loss = 0
    for batch in dev_batch:
        preds, _, loss = model.predict(batch)
        predictions += preds
        dev_loss += loss

    # Map predicted class ids back to label strings before scoring.
    predictions = [id2label[p] for p in predictions]
    dev_p, dev_r, dev_f1 = scorer.score(dev_batch.gold(), predictions)

    train_loss = train_loss / train_batch.num_examples * opt['batch_size'] # avg loss per batch
    dev_loss = dev_loss / dev_batch.num_examples * opt['batch_size']
    print("epoch {}: train_loss = {:.6f}, dev_loss = {:.6f}, dev_f1 = {:.4f}".format(
        epoch, train_loss, dev_loss, dev_f1))
    file_logger.log("{}\t{:.6f}\t{:.6f}\t{:.4f}".format(epoch, train_loss, dev_loss, dev_f1))

    # save
    model_file = model_save_dir + '/checkpoint_epoch_{}.pt'.format(epoch)
    model.save(model_file, epoch)
    # Keyed on the history being empty rather than on `epoch == 1`: if this
    # cell is re-run after an interrupt, dev_f1_history still holds earlier
    # scores and the first epoch must not overwrite a better best_model.pt.
    if not dev_f1_history or dev_f1 > max(dev_f1_history):
        copyfile(model_file, model_save_dir + '/best_model.pt')
        print("new best model saved.")
    # Only every save_epoch-th checkpoint is kept on disk.
    if epoch % opt['save_epoch'] != 0:
        os.remove(model_file)

    # lr schedule: after more than 5 recorded epochs, decay whenever dev F1
    # fails to beat the previous epoch's score.
    if len(dev_f1_history) > 5 and dev_f1 <= dev_f1_history[-1] and opt['optim'] in ['sgd', 'adagrad', 'adam']:
        current_lr *= opt['lr_decay']
        model.update_lr(current_lr)

    dev_f1_history += [dev_f1]
    epochs_done = epoch
    print("")

# Report the count tracked above instead of the loop variable, which is
# undefined (or stale from an earlier run) when num_epoch is 0.
print("Training ended with {} epochs.".format(epochs_done))

# REMINDER: change the model output folder (the --id default) before re-running; files in an existing folder with the same id are overwritten.

  result = self.forward(*input, **kwargs)


2018-02-26 19:59:41.979854: step 400/150100 (epoch 1/100), loss = 0.817120 (0.036 sec/batch), lr: 1.000000
2018-02-26 19:59:58.152350: step 800/150100 (epoch 1/100), loss = 0.949206 (0.043 sec/batch), lr: 1.000000
2018-02-26 20:00:14.204526: step 1200/150100 (epoch 1/100), loss = 0.701192 (0.031 sec/batch), lr: 1.000000
Evaluating on dev set...
Precision (micro): 33.595%
   Recall (micro): 41.694%
       F1 (micro): 37.209%
epoch 1: train_loss = 1.261704, dev_loss = 0.945303, dev_f1 = 0.3721
model saved to ./saved_models/06_self_attention/checkpoint_epoch_1.pt
new best model saved.

2018-02-26 20:00:38.429926: step 1600/150100 (epoch 2/100), loss = 0.618000 (0.044 sec/batch), lr: 1.000000
2018-02-26 20:00:54.816989: step 2000/150100 (epoch 2/100), loss = 0.763590 (0.041 sec/batch), lr: 1.000000
2018-02-26 20:01:11.038110: step 2400/150100 (epoch 2/100), loss = 0.638795 (0.041 sec/batch), lr: 1.000000
2018-02-26 20:01:27.217622: step 2800/150100 (epoch 2/100), loss = 0.652719 (0.045 sec

KeyboardInterrupt: 