In [1]:
import sys
import torch
import math
import time
import importlib
from tqdm import tqdm
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.optim import Adam
from tensorboardX import SummaryWriter

from cis700.dataset import DBPediaDataset, count_lines
from cis700 import utils

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Network Architecture

In [10]:
class WordEmbedding(nn.Module):
  def __init__(self, vocab_size, dim_embedding):
    super(WordEmbedding, self).__init__()
    self.embed = nn.Embedding(vocab_size, dim_embedding)
  def forward(self, x):
    return self.embed(x)
  
class PositionEncoding(nn.Module):
  def __init__(self, dim_embedding, max_seq_len):
    super(PositionEncoding, self).__init__()
    self.dim_embedding = dim_embedding
    
    pe = torch.zeros(max_seq_len, dim_embedding)
    for pos in range(max_seq_len):
      for i in range(0, dim_embedding, 2):
        pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / dim_embedding)))
        pe[pos, i+1] = math.cos(pos / (10000 ** ((2 * (i+1)) / dim_embedding)))
        
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)
    
  def forward(self, x):
    x = x * math.sqrt(self.dim_embedding)
    x_len = x.size(1)
    x = x + Variable(self.pe[:,:x_len], requires_grad=False).cuda()
    return x
  
class MultiheadAttention(nn.Module):
  def __init__(self, num_heads, dim_embedding, dropout = 0.1):
    super(MultiheadAttention, self).__init__()
    
    self.dim_embedding = dim_embedding
    self.dim_k = dim_embedding / num_heads
    if int(self.dim_k) != self.dim_k:
      raise ValueError('num_heads should divide dim_embedding evenly! num_heads = %d, dim_embedding = %d' \
                       % (num_heads, dim_embedding))
    self.dim_k = int(self.dim_k)
    self.num_heads = num_heads
    
    self.q_linear = nn.Linear(dim_embedding, dim_embedding)
    self.v_linear = nn.Linear(dim_embedding, dim_embedding)
    self.k_linear = nn.Linear(dim_embedding, dim_embedding)
    
    self.dropout = nn.Dropout(dropout)
    
    self.out = nn.Linear(dim_embedding, dim_embedding)
    
  def attention(self, q, v, k, mask):
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.dim_k)
    # print('q.size = %s' % str(q.size()))
    # print('scores.size = %s' % str(scores.size()))
    mask = mask.unsqueeze(1).unsqueeze(1)
    scores = scores.masked_fill(mask == 0, -1e9)
    
    # print(scores.size())
    scores = F.softmax(scores, dim=-1)
    # print(scores.size())
    scores = self.dropout(scores)
    
    return torch.matmul(scores, v)
    
  def forward(self, q, v, k, mask):
    batch_size = q.size(0)
    
    q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.dim_k)
    v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.dim_k)
    k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.dim_k)
    
    q = q.transpose(1, 2)
    v = v.transpose(1, 2)
    k = k.transpose(1, 2)
    
    scores = self.attention(q, v, k, mask)
    scores = scores.transpose(1, 2).contiguous().view(batch_size, -1, self.dim_embedding)
    return scores
  
class FeedForward(nn.Module):
  def __init__(self, dim_embedding, num_features, dropout = 0.1):
    super(FeedForward, self).__init__()
    self.fc1 = nn.Linear(dim_embedding, num_features)
    self.dropout = nn.Dropout(dropout)
    self.fc2 = nn.Linear(num_features, dim_embedding)
    
  def forward(self, x):
    x = self.fc1(x)
    x = F.relu(x)
    x = self.dropout(x)
    return self.fc2(x)
  
class Normalization(nn.Module):
  def __init__(self, dim_embedding, eps = 1e-6):
    super(Normalization, self).__init__()
    
    self.dim_embedding = dim_embedding
    self.alpha = nn.Parameter(torch.ones(self.dim_embedding))
    self.bias = nn.Parameter(torch.zeros(self.dim_embedding))
    self.eps = eps
    
  def forward(self, x):
    norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
    return norm
  
class EncoderLayer(nn.Module):
  def __init__(self, num_heads, dim_embedding, ff_num_features, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.attention = MultiheadAttention(num_heads, dim_embedding)
    self.norm1 = Normalization(dim_embedding)
    self.ff = FeedForward(dim_embedding, ff_num_features)
    self.norm2 = Normalization(dim_embedding)
    self.drop1 = nn.Dropout(dropout)
    self.drop2 = nn.Dropout(dropout)
    
  def forward(self, x, mask):
    x_ = self.norm1(x)
    x = x + self.drop1(self.attention(x_, x_, x_, mask))
    x_ = self.norm2(x)
    x = x + self.drop2(self.ff(x_))
    return x
  
class Encoder(nn.Module):
  def __init__(self, vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len):
    super(Encoder, self).__init__()
    self.num_encoder_layers = num_encoder_layers
    self.embed = WordEmbedding(vocab_size, dim_embedding)
    self.pe = PositionEncoding(dim_embedding, max_seq_len)
    self.encoder_layers = \
      nn.ModuleList([EncoderLayer(num_heads, dim_embedding, ff_num_features) for _ in range(num_encoder_layers)])
    self.norm = Normalization(dim_embedding)
    
  def forward(self, x, mask):
    x = self.embed(x)
    x = self.pe(x)
    for i in range(self.num_encoder_layers):
      x = self.encoder_layers[i](x, mask)
    return self.norm(x)
  
class Classifier(nn.Module):
  def __init__(self, 
               vocab_size, dim_embedding, num_heads, 
               ff_num_features, num_encoder_layers, 
               max_seq_len, num_classes):
    super(Classifier, self).__init__()
    self.encoder = Encoder(vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len)
    self.fc = nn.Linear(dim_embedding * max_seq_len, num_classes)
    
  def forward(self, x, mask):
    x = self.encoder(x, mask)
    return self.fc(x.view(-1, dim_embedding * max_seq_len))
  
def initialize_model(model):
  for p in model.parameters():
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)
      
def save_model(model, name, epoch, step):
  filename = 'checkpoints/%s.epoch-%d.step-%d.pth' % (name, epoch, step)
  torch.save(model.state_dict(), filename)

### Experiment 1

In [12]:
# configurations
max_seq_len        = 256
dim_embedding      = 200
num_heads          = 5
num_encoder_layers = 6
ff_num_features    = 1024
vocab_size         = count_lines('cis700/vocab/bert-base-uncased-vocab.txt')
batch_size         = 50

network_name = 'transformer-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)

In [4]:
dbpedia_data = DBPediaDataset('/Users/hengchu/Downloads/cis700data/joinedlonabstract_en.nt', max_seq_len=max_seq_len)

100%|██████████| 1384619/1384619 [00:09<00:00, 141536.96it/s]
100%|██████████| 1384619/1384619 [12:32<00:00, 2253.37it/s]


In [13]:
train_portion = 0.8
validation_portion = 0.1

train_size = int(len(dbpedia_data) * train_portion)
validation_size = int(len(dbpedia_data) * validation_portion)
test_size = len(dbpedia_data) - train_size - validation_size

train_set, validation_set, test_set = \
  torch.utils.data.random_split(dbpedia_data, [train_size, validation_size, test_size])

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size)

In [14]:
# gather number of categories
num_fine_classes = dbpedia_data.num_fine_cats()
num_coarse_classes = dbpedia_data.num_coarse_cats()

In [None]:
device = torch.device('cuda:0')
"""
model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_fine_classes)
initialize_model(model)
model = model.cuda()
optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
loss_fun = nn.CrossEntropyLoss()
tb_writer = SummaryWriter('./logs')
epochs = 10

step  = 0
epoch = 0
"""
for i in range(epoch, epochs):
  start_time = time.time()
  loss_val, validation_score, train_score = None, None, None
  for i_batch, batch_data in enumerate(train_loader):
    ids = batch_data[0].type(torch.LongTensor)
    masks = batch_data[1]
    fine_cats = batch_data[2]
    loss_val, train_score = utils.transformer_train(model, ids.cuda(), masks.cuda(), fine_cats.cuda(), loss_fun, optimizer)
    tb_writer.add_scalar('%s/loss_val' % network_name, loss_val, global_step=step)
    tb_writer.add_scalar('%s/train_acc' % network_name, train_score, global_step=step)
    step += 1
    if step % 5000 == 0:
      save_model(model, network_name, epoch, step)
  validation_score = utils.transformer_validate(model, validation_loader, device)
  tb_writer.add_scalar('%s/validation_acc' % network_name, validation_score, global_step=step)
  end_time = time.time()
  epoch = i
  print('Epoch[%d] took %f seconds' % (epoch, end_time - start_time))
  print('train = %f, validation = %f, loss = %f' % (train_score, validation_score, loss_val))

Epoch[0] took 4212.653124 seconds
train = 0.555556, validation = 0.458223, loss = 1.944845
Epoch[1] took 4202.755259 seconds
train = 0.600000, validation = 0.459624, loss = 1.872153
Epoch[2] took 4239.633796 seconds
train = 0.688889, validation = 0.453391, loss = 1.285751
Epoch[3] took 4304.622383 seconds
train = 0.622222, validation = 0.446739, loss = 1.317384


In [None]:
save_model(model, network_name, epoch, step)

### Sanity check (please ignore)

In [13]:
try:
  if model:
    del model
except:
  pass

model = Classifier(vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len, num_fine_classes)
model = model.cuda()
_, first_batch = next(enumerate(train_loader))
ids = first_batch[0].type(torch.LongTensor).cuda()
masks = first_batch[1].cuda()
r = model(ids, masks)
print(r)
print(r.size())
del model

tensor([[ 0.6255, -0.4053, -0.3875,  ..., -0.2081, -0.2709, -0.5915],
        [ 0.4046, -0.6781, -0.9631,  ..., -0.0211, -0.0456, -0.4453],
        [ 0.2149, -0.5845, -0.5949,  ...,  0.0097, -0.9538, -0.2194],
        ...,
        [ 0.4937, -0.8904, -0.3721,  ..., -0.3162, -0.0797, -0.4613],
        [ 0.3535, -0.2306,  0.0034,  ...,  1.3421, -0.1081,  0.0705],
        [ 0.7894, -1.1255, -0.3667,  ..., -0.2754,  0.3495,  0.2130]],
       device='cuda:0', grad_fn=<AddmmBackward>)
torch.Size([50, 370])


### Scratch cells (please ignore)

In [16]:
importlib.reload(utils)

<module 'cis700.utils' from '/Users/hengchu/Documents/fun/cis700project/cis700/utils.py'>

In [12]:
del model

In [11]:
save_model(model, network_name, epoch, step)

In [18]:
utils.transformer_validate(model, validation_loader, device)

KeyboardInterrupt: 

In [20]:
loss_val

tensor(2.0368, device='cuda:0', grad_fn=<NllLossBackward>)

In [21]:
validation_score

In [22]:
test_score

tensor(0.4496, device='cuda:0')