In [1]:
import sys
import torch
import math
import time
import importlib
import random
from tqdm import tqdm
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.optim import Adam
from tensorboardX import SummaryWriter

from cis700.dataset import DBPediaDataset, count_lines
from cis700.tokenizer import build_tokenizer
from cis700 import utils

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### Network Architecture

Here we develop the modules of our attention-based classifier. We had used the blog-post (https://towardsdatascience.com/how-to-code-the-transformer-in-pytorch-24db27c8f9ec) as reference.

`WordEmbedding`: a simple wrapper around `nn.Embedding`

`PositionEncoding`: a module that implements the sinuoid position encoding described in "Attention is All You Need"

`MultiheadAttention`: a module that implmenets scale dot product attention

`FeedForward`: a simple 2-layer feedforward module with sparsity and dropout

`Normalization`: a module that implements batch normalization

`EncoderLayer`: a module that puts together self-attention and feedforward module to form a single layer of encoder

`Encoder`: a module that places N `EncoderLayers` one after another

`Classifier`: encoder plus linear layer

In [2]:
class WordEmbedding(nn.Module):
  def __init__(self, vocab_size, dim_embedding):
    super(WordEmbedding, self).__init__()
    self.embed = nn.Embedding(vocab_size, dim_embedding)
  def forward(self, x):
    return self.embed(x)
  
class PositionEncoding(nn.Module):
  def __init__(self, dim_embedding, max_seq_len):
    super(PositionEncoding, self).__init__()
    self.dim_embedding = dim_embedding
    
    pe = torch.zeros(max_seq_len, dim_embedding)
    for pos in range(max_seq_len):
      for i in range(0, dim_embedding, 2):
        pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / dim_embedding)))
        pe[pos, i+1] = math.cos(pos / (10000 ** ((2 * (i+1)) / dim_embedding)))
        
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)
    
  def forward(self, x):
    x = x * math.sqrt(self.dim_embedding)
    x_len = x.size(1)
    x = x + Variable(self.pe[:,:x_len], requires_grad=False).cuda()
    return x
  
class MultiheadAttention(nn.Module):
  def __init__(self, num_heads, dim_embedding, dropout = 0.1):
    super(MultiheadAttention, self).__init__()
    
    self.dim_embedding = dim_embedding
    self.dim_k = dim_embedding / num_heads
    if int(self.dim_k) != self.dim_k:
      raise ValueError('num_heads should divide dim_embedding evenly! num_heads = %d, dim_embedding = %d' \
                       % (num_heads, dim_embedding))
    self.dim_k = int(self.dim_k)
    self.num_heads = num_heads
    
    self.q_linear = nn.Linear(dim_embedding, dim_embedding)
    self.v_linear = nn.Linear(dim_embedding, dim_embedding)
    self.k_linear = nn.Linear(dim_embedding, dim_embedding)
    
    self.dropout = nn.Dropout(dropout)
    
    self.out = nn.Linear(dim_embedding, dim_embedding)
    
  def attention(self, q, v, k, mask):
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.dim_k)
    # print('q.size = %s' % str(q.size()))
    # print('scores.size = %s' % str(scores.size()))
    mask = mask.unsqueeze(1).unsqueeze(1)
    scores = scores.masked_fill(mask == 0, -1e9)
    
    # print(scores.size())
    scores = F.softmax(scores, dim=-1)
    # print(scores.size())
    scores = self.dropout(scores)
    
    return torch.matmul(scores, v)
    
  def forward(self, q, v, k, mask):
    batch_size = q.size(0)
    
    q = self.q_linear(q).view(batch_size, -1, self.num_heads, self.dim_k)
    v = self.v_linear(v).view(batch_size, -1, self.num_heads, self.dim_k)
    k = self.k_linear(k).view(batch_size, -1, self.num_heads, self.dim_k)
    
    q = q.transpose(1, 2)
    v = v.transpose(1, 2)
    k = k.transpose(1, 2)
    
    scores = self.attention(q, v, k, mask)
    scores = scores.transpose(1, 2).contiguous().view(batch_size, -1, self.dim_embedding)
    return scores
  
class FeedForward(nn.Module):
  def __init__(self, dim_embedding, num_features, dropout = 0.1):
    super(FeedForward, self).__init__()
    self.fc1 = nn.Linear(dim_embedding, num_features)
    self.dropout = nn.Dropout(dropout)
    self.fc2 = nn.Linear(num_features, dim_embedding)
    
  def forward(self, x):
    x = self.fc1(x)
    x = F.relu(x)
    x = self.dropout(x)
    return self.fc2(x)
  
class Normalization(nn.Module):
  def __init__(self, dim_embedding, eps = 1e-6):
    super(Normalization, self).__init__()
    
    self.dim_embedding = dim_embedding
    self.alpha = nn.Parameter(torch.ones(self.dim_embedding))
    self.bias = nn.Parameter(torch.zeros(self.dim_embedding))
    self.eps = eps
    
  def forward(self, x):
    norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
    return norm
  
class EncoderLayer(nn.Module):
  def __init__(self, num_heads, dim_embedding, ff_num_features, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.attention = MultiheadAttention(num_heads, dim_embedding)
    self.norm1 = Normalization(dim_embedding)
    self.ff = FeedForward(dim_embedding, ff_num_features)
    self.norm2 = Normalization(dim_embedding)
    self.drop1 = nn.Dropout(dropout)
    self.drop2 = nn.Dropout(dropout)
    
  def forward(self, x, mask):
    x_ = self.norm1(x)
    x = x + self.drop1(self.attention(x_, x_, x_, mask))
    x_ = self.norm2(x)
    x = x + self.drop2(self.ff(x_))
    return x
  
class Encoder(nn.Module):
  def __init__(self, vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len):
    super(Encoder, self).__init__()
    self.num_encoder_layers = num_encoder_layers
    self.embed = WordEmbedding(vocab_size, dim_embedding)
    self.pe = PositionEncoding(dim_embedding, max_seq_len)
    self.encoder_layers = \
      nn.ModuleList([EncoderLayer(num_heads, dim_embedding, ff_num_features) for _ in range(num_encoder_layers)])
    self.norm = Normalization(dim_embedding)
    
  def forward(self, x, mask):
    x = self.embed(x)
    x = self.pe(x)
    for i in range(self.num_encoder_layers):
      x = self.encoder_layers[i](x, mask)
    return self.norm(x)
  
class Classifier(nn.Module):
  def __init__(self, 
               vocab_size, dim_embedding, num_heads, 
               ff_num_features, num_encoder_layers, 
               max_seq_len, num_classes):
    super(Classifier, self).__init__()
    self.dim_embedding = dim_embedding
    self.max_seq_len = max_seq_len
    self.encoder = Encoder(vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len)
    self.fc = nn.Linear(dim_embedding * max_seq_len, num_classes)
    
  def forward(self, x, mask):
    x = self.encoder(x, mask)
    return self.fc(x.view(-1, self.dim_embedding * self.max_seq_len))
  

### Some utility functions working with models

In [3]:
def initialize_model(model):
  for p in model.parameters():
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)
      
def save_model(model, name, epoch, step):
  filename = 'checkpoints/%s.epoch-%d.step-%d.pth' % (name, epoch, step)
  torch.save(model.state_dict(), filename)

def human_readable_prediction(text, model, max_seq_len, cat_id2text_fun):
  tok = build_tokenizer()
  tokens = tok.tokenize(text)
  ids = tok.convert_tokens_to_ids(tokens)
  if len(ids) > max_seq_len:
    ids = ids[0:max_seq_len]
  masks = [1] * len(ids)
  ids += [0] * (max_seq_len - len(ids))
  masks += [0] * (max_seq_len - len(masks))
  
  model.eval()
  ids = torch.Tensor(ids).type(torch.LongTensor).cuda().unsqueeze(0)
  masks = torch.Tensor(masks).cuda().unsqueeze(0)
  scores = model(ids, masks).squeeze()
  top_k = 5
  _, max_cat_ids = torch.topk(scores, k = top_k, dim = 0)
  top_cats = [cat_id2text_fun(int(max_cat_ids[i])) for i in range(top_k)]
  return top_cats

def swap_linear_layer(model, new_num_classes):
  model.fc = nn.Linear(model.dim_embedding * model.max_seq_len, new_num_classes)
  return model.cuda()

### Experiment 1 (Baseline on fine categories)

In [3]:
# configurations
max_seq_len        = 256
dim_embedding      = 50
num_heads          = 5
num_encoder_layers = 6
ff_num_features    = 1024
vocab_size         = count_lines('cis700/vocab/bert-base-uncased-vocab.txt')
batch_size         = 50

network_name = 'transformer-fine-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)

In [4]:
dbpedia_data = DBPediaDataset('/Users/hengchu/Downloads/cis700data/joinedlonabstract_en.nt', max_seq_len=max_seq_len)

100%|██████████| 1384619/1384619 [00:09<00:00, 148528.73it/s]
100%|██████████| 1384619/1384619 [12:08<00:00, 1901.85it/s]


In [5]:
train_portion = 0.8
validation_portion = 0.1

train_size = int(len(dbpedia_data) * train_portion)
validation_size = int(len(dbpedia_data) * validation_portion)
test_size = len(dbpedia_data) - train_size - validation_size

train_set, validation_set, test_set = \
  torch.utils.data.random_split(dbpedia_data, [train_size, validation_size, test_size])

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size)

In [6]:
# gather number of categories
num_fine_classes = dbpedia_data.num_fine_cats()
num_coarse_classes = dbpedia_data.num_coarse_cats()

In [8]:
device = torch.device('cuda:0')

model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_fine_classes)
initialize_model(model)
model = model.cuda()
optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
loss_fun = nn.CrossEntropyLoss()
tb_writer = SummaryWriter('./logs')
epochs = 10

step  = 0
epoch = 0

for i in range(epochs):
  start_time = time.time()
  loss_val, validation_score, train_score = None, None, None
  for i_batch, batch_data in enumerate(train_loader):
    ids = batch_data[0].type(torch.LongTensor)
    masks = batch_data[1]
    fine_cats = batch_data[2]
    loss_val, train_score = utils.transformer_train(model, ids.cuda(), masks.cuda(), fine_cats.cuda(), loss_fun, optimizer)
    tb_writer.add_scalar('%s/loss_val' % network_name, loss_val, global_step=step)
    tb_writer.add_scalar('%s/train_acc' % network_name, train_score, global_step=step)
    step += 1
    if step % 5000 == 0:
      save_model(model, network_name, epoch, step)
  validation_score = utils.transformer_validate(model, validation_loader, 2, device)
  tb_writer.add_scalar('%s/validation_acc' % network_name, validation_score, global_step=step)
  end_time = time.time()
  epoch = i
  print('Epoch[%d] took %f seconds' % (epoch, end_time - start_time))
  print('train = %f, validation = %f, loss = %f' % (train_score, validation_score, loss_val))

Epoch[0] took 3039.741480 seconds
train = 0.266667, validation = 0.372993, loss = 2.936191
Epoch[1] took 3021.754335 seconds
train = 0.400000, validation = 0.402705, loss = 2.192239
Epoch[2] took 3022.240774 seconds
train = 0.488889, validation = 0.416861, loss = 2.177934
Epoch[3] took 3020.754504 seconds
train = 0.511111, validation = 0.430367, loss = 2.063056
Epoch[4] took 3020.989862 seconds
train = 0.488889, validation = 0.431053, loss = 2.216483
Epoch[5] took 3020.094303 seconds
train = 0.733333, validation = 0.433537, loss = 1.485769
Epoch[6] took 3020.544596 seconds
train = 0.600000, validation = 0.435661, loss = 1.564788
Epoch[7] took 3019.780324 seconds
train = 0.711111, validation = 0.434606, loss = 1.338268
Epoch[8] took 3019.645545 seconds
train = 0.733333, validation = 0.435083, loss = 1.493338
Epoch[9] took 3020.677503 seconds
train = 0.577778, validation = 0.432555, loss = 1.768355


In [9]:
save_model(model, network_name, epoch, step)

### Experiment 2 ((Baseline on f

In [12]:
# configurations
max_seq_len        = 256
dim_embedding      = 50
num_heads          = 5
num_encoder_layers = 6
ff_num_features    = 1024
vocab_size         = count_lines('cis700/vocab/bert-base-uncased-vocab.txt')
batch_size         = 50

network_name = 'transformer-coarse-2-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)

In [5]:
dbpedia_data = DBPediaDataset('/Users/hengchu/Downloads/cis700data/joinedlonabstract_en.nt', max_seq_len=max_seq_len)

100%|██████████| 1384619/1384619 [00:10<00:00, 135355.49it/s]
100%|██████████| 1384619/1384619 [12:02<00:00, 1916.24it/s]


In [6]:
train_portion = 0.8
validation_portion = 0.1

train_size = int(len(dbpedia_data) * train_portion)
validation_size = int(len(dbpedia_data) * validation_portion)
test_size = len(dbpedia_data) - train_size - validation_size

train_set, validation_set, test_set = \
  torch.utils.data.random_split(dbpedia_data, [train_size, validation_size, test_size])

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size)

In [7]:
# gather number of categories
num_fine_classes = dbpedia_data.num_fine_cats()
num_coarse_classes = dbpedia_data.num_coarse_cats()

In [13]:
device = torch.device('cuda:0')

model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_coarse_classes)
initialize_model(model)
model = model.cuda()
optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
loss_fun = nn.CrossEntropyLoss()
tb_writer = SummaryWriter('./logs')
epochs = 10

step  = 0
epoch = 0

for i in range(epochs):
  start_time = time.time()
  loss_val, validation_score, train_score = None, None, None
  for i_batch, batch_data in enumerate(train_loader):
    ids = batch_data[0].type(torch.LongTensor)
    masks = batch_data[1]
    coarse_cats = batch_data[3]
    loss_val, train_score = utils.transformer_train(model, ids.cuda(), masks.cuda(), coarse_cats.cuda(), loss_fun, optimizer)
    tb_writer.add_scalar('%s/loss_val' % network_name, loss_val, global_step=step)
    tb_writer.add_scalar('%s/train_acc' % network_name, train_score, global_step=step)
    step += 1
    if step % 5000 == 0:
      save_model(model, network_name, epoch, step)
  validation_score = utils.transformer_validate(model, validation_loader, 3, device)
  tb_writer.add_scalar('%s/validation_acc' % network_name, validation_score, global_step=step)
  end_time = time.time()
  epoch = i
  print('Epoch[%d] took %f seconds' % (epoch, end_time - start_time))
  print('train = %f, validation = %f, loss = %f' % (train_score, validation_score, loss_val))

Epoch[0] took 2919.140594 seconds
train = 0.355556, validation = 0.382851, loss = 2.669782
Epoch[1] took 2914.652169 seconds
train = 0.488889, validation = 0.412867, loss = 2.143310
Epoch[2] took 2914.346581 seconds
train = 0.488889, validation = 0.425087, loss = 2.048670
Epoch[3] took 2912.895249 seconds
train = 0.466667, validation = 0.433349, loss = 1.624332
Epoch[4] took 2913.289681 seconds
train = 0.488889, validation = 0.439763, loss = 1.862073
Epoch[5] took 2915.485798 seconds
train = 0.466667, validation = 0.442630, loss = 1.840591
Epoch[6] took 2913.253220 seconds
train = 0.644444, validation = 0.443396, loss = 1.529210
Epoch[7] took 2908.874687 seconds
train = 0.644444, validation = 0.443338, loss = 1.066650
Epoch[8] took 2908.982422 seconds
train = 0.444444, validation = 0.443908, loss = 2.040223
Epoch[9] took 2909.085295 seconds
train = 0.577778, validation = 0.442226, loss = 1.453888


In [14]:
save_model(model, network_name, epoch, step)

### Bootstrap experiment

In [4]:
# configurations
max_seq_len        = 256
dim_embedding      = 50
num_heads          = 5
num_encoder_layers = 6
ff_num_features    = 1024
vocab_size         = count_lines('cis700/vocab/bert-base-uncased-vocab.txt')
batch_size         = 50

network_name = 'transformer-bootstrap2-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)

In [5]:
dbpedia_data = DBPediaDataset('/Users/hengchu/Downloads/cis700data/joinedlonabstract_en.nt', max_seq_len=max_seq_len)

100%|██████████| 1384619/1384619 [00:09<00:00, 142777.82it/s]
100%|██████████| 1384619/1384619 [12:58<00:00, 1779.17it/s]


In [6]:
train_portion = 0.8
validation_portion = 0.1

train_size = int(len(dbpedia_data) * train_portion)
validation_size = int(len(dbpedia_data) * validation_portion)
test_size = len(dbpedia_data) - train_size - validation_size

train_set, validation_set, test_set = \
  torch.utils.data.random_split(dbpedia_data, [train_size, validation_size, test_size])

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size)

In [7]:
# gather number of categories
num_fine_classes = dbpedia_data.num_fine_cats()
num_coarse_classes = dbpedia_data.num_coarse_cats()

In [8]:
device = torch.device('cuda:0')

model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_coarse_classes)
initialize_model(model)
model = model.cuda()
optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
loss_fun = nn.CrossEntropyLoss()
tb_writer = SummaryWriter('./logs')
coarse_epochs = 2
fine_epochs = 10

step  = 0
epoch = 0

# train coarse_epochs on coarse labels
for i in range(coarse_epochs):
  start_time = time.time()
  loss_val, validation_score, train_score = None, None, None
  for i_batch, batch_data in enumerate(train_loader):
    ids = batch_data[0].type(torch.LongTensor)
    masks = batch_data[1]
    coarse_cats = batch_data[3]
    loss_val, train_score = utils.transformer_train(model, ids.cuda(), masks.cuda(), coarse_cats.cuda(), loss_fun, optimizer)
    tb_writer.add_scalar('%s/loss_val' % network_name, loss_val, global_step=step)
    tb_writer.add_scalar('%s/train_acc' % network_name, train_score, global_step=step)
    step += 1
    if step % 5000 == 0:
      save_model(model, network_name, epoch, step)
  validation_score = utils.transformer_validate(model, validation_loader, 3, device)
  tb_writer.add_scalar('%s/validation_acc' % network_name, validation_score, global_step=step)
  end_time = time.time()
  epoch = i
  print('Epoch[%d] took %f seconds' % (epoch, end_time - start_time))
  print('train = %f, validation = %f, loss = %f' % (train_score, validation_score, loss_val))
  
model = swap_linear_layer(model, num_fine_classes)

# then train fine_epochs on fine labels, compare accuracy increases to vanilla training on fine labels
for i in range(fine_epochs):
  start_time = time.time()
  loss_val, validation_score, train_score = None, None, None
  for i_batch, batch_data in enumerate(train_loader):
    ids = batch_data[0].type(torch.LongTensor)
    masks = batch_data[1]
    fine_cats = batch_data[2]
    loss_val, train_score = utils.transformer_train(model, ids.cuda(), masks.cuda(), fine_cats.cuda(), loss_fun, optimizer)
    tb_writer.add_scalar('%s/loss_val' % network_name, loss_val, global_step=step)
    tb_writer.add_scalar('%s/train_acc' % network_name, train_score, global_step=step)
    step += 1
    if step % 5000 == 0:
      save_model(model, network_name, epoch, step)
  validation_score = utils.transformer_validate(model, validation_loader, 2, device)
  tb_writer.add_scalar('%s/validation_acc' % network_name, validation_score, global_step=step)
  end_time = time.time()
  epoch = i
  print('Epoch[%d] took %f seconds' % (epoch, end_time - start_time))
  print('train = %f, validation = %f, loss = %f' % (train_score, validation_score, loss_val))

Epoch[0] took 3009.360830 seconds
train = 0.511111, validation = 0.386304, loss = 2.504602
Epoch[1] took 3004.730604 seconds
train = 0.288889, validation = 0.413633, loss = 2.671784
Epoch[0] took 3008.795308 seconds
train = 0.400000, validation = 0.393410, loss = 2.783735
Epoch[1] took 3007.895700 seconds
train = 0.422222, validation = 0.406815, loss = 2.259828
Epoch[2] took 3008.049983 seconds
train = 0.422222, validation = 0.420718, loss = 2.710992
Epoch[3] took 3006.731926 seconds
train = 0.377778, validation = 0.426106, loss = 2.422018
Epoch[4] took 3007.334610 seconds
train = 0.400000, validation = 0.429457, loss = 2.677648
Epoch[5] took 3006.229599 seconds
train = 0.400000, validation = 0.431089, loss = 2.263628
Epoch[6] took 3005.469187 seconds
train = 0.511111, validation = 0.434823, loss = 2.273735
Epoch[7] took 3005.929760 seconds
train = 0.533333, validation = 0.436267, loss = 2.120381
Epoch[8] took 3004.892882 seconds
train = 0.488889, validation = 0.436997, loss = 2.239928

In [9]:
save_model(model, network_name, epoch, step)

### Sanity check (please ignore)

In [9]:
try:
  if model:
    del model
except:
  pass

model = Classifier(vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len, num_fine_classes)
model = model.cuda()
_, first_batch = next(enumerate(train_loader))
ids = first_batch[0].type(torch.LongTensor).cuda()
masks = first_batch[1].cuda()
r = model(ids, masks)
print(r)
print(r.size())
del model

RuntimeError: CUDA error: device-side assert triggered

### Scratch cells (please ignore)

In [9]:
importlib.reload(utils)

<module 'cis700.utils' from '/Users/hengchu/Documents/fun/cis700project/cis700/utils.py'>

In [12]:
del model

In [11]:
save_model(model, network_name, epoch, step)

In [10]:
utils.transformer_validate(model, validation_loader, 3, device)

KeyboardInterrupt: 

In [20]:
loss_val

tensor(2.0368, device='cuda:0', grad_fn=<NllLossBackward>)

In [21]:
validation_score

In [22]:
test_score

tensor(0.4496, device='cuda:0')

In [11]:
# find cases where the network guesses wrong

go = True
while go:
  idx = random.randint(0, len(dbpedia_data))
  features = dbpedia_data.get_feature(idx)
  top_cats = human_readable_prediction(dbpedia_data.get_feature(idx).text,
                                       model,
                                       max_seq_len,
                                       lambda id: dbpedia_data.fine_id2cat(id))
  if features.fine_cat_text in top_cats:
    continue

  print('>>> raw text')
  print(features.text)
  print('\n')
  print('>>> coarse cat:')
  print(features.coarse_cat_text)
  print('\n')
  print('>>> fine cat:')
  print(features.fine_cat_text)
  print('\n')
  print('>>> top predictions: ')
  for c in top_cats:
    print(c)
    
  break

>>> raw text
In fencing, an attack is the first offensive movement of a phrase.
>>> coarse cat:
<http://dbpedia.org/resource/Category:Behavior>
>>> fine cat:
<http://dbpedia.org/resource/Category:Conflict>
>>> top predictions: 
<http://dbpedia.org/resource/Category:Humans>
<http://dbpedia.org/resource/Category:Society>
<http://dbpedia.org/resource/Category:Government>
<http://dbpedia.org/resource/Category:Social_institutions>
<http://dbpedia.org/resource/Category:Politics>


In [17]:
  idx = random.randint(0, len(dbpedia_data))
  features = dbpedia_data.get_feature(idx)
  top_cats = human_readable_prediction(dbpedia_data.get_feature(idx).text,
                                       model,
                                       max_seq_len,
                                       lambda id: dbpedia_data.fine_id2cat(id))

  print('>>> raw text')
  print(features.text)
  print('')
  print('>>> coarse cat:')
  print(features.coarse_cat_text)
  print('')
  print('>>> fine cat:')
  print(features.fine_cat_text)
  print('')
  print('>>> top predictions: ')
  print('')
  for c in top_cats:
    print(c)


>>> raw text
In metadata, property equivalence is the statement that two properties have the same property extension or values. This usually (but not always) implies that the two properties have the same semantics or meaning. Technically it only implies that the data elements have the same values. Property equivalence is one of the three ways that a metadata registry can store equivalence mappings to other metadata registries. Note that property equivalence is not the same as property equality. Equivalent properties have the same \"values\", but may have different intensional meaning (i.e., denote different concepts). Property equality should be expressed with the owl:sameAs construct. As this requires that properties are treated as individuals, such axioms are only allowed in OWL Full.

>>> coarse cat:
<http://dbpedia.org/resource/Category:Technology>

>>> fine cat:
<http://dbpedia.org/resource/Category:Information_technology>

>>> top predictions: 

<http://dbpedia.org/resource/Categ

In [24]:
text = 'The mighty and terrible god of internet typos. The mysterious and all-knowing overlord of the Twitter trolls, and Pepe is his prophet. On this the 30th day of May, he was summoned by the orange king to begin his reign of terror over the lawless wasteland of cyberspace. And it shall be that no blogger, no profile, no Twitter egg shall know peace in the hereafter until they have bowed in reverence and fear at the glorious might of Covfefe.'
top_cats = human_readable_prediction(text,
                                     model,
                                     max_seq_len,
                                     lambda id: dbpedia_data.fine_id2cat(id))
print(text)
print('')
for c in top_cats:
  print(c)

The mighty and terrible god of internet typos. The mysterious and all-knowing overlord of the Twitter trolls, and Pepe is his prophet. On this the 30th day of May, he was summoned by the orange king to begin his reign of terror over the lawless wasteland of cyberspace. And it shall be that no blogger, no profile, no Twitter egg shall know peace in the hereafter until they have bowed in reverence and fear at the glorious might of Covfefe.

<http://dbpedia.org/resource/Category:Categories_for_renaming>
<http://dbpedia.org/resource/Category:Books_by_genre>
<http://dbpedia.org/resource/Category:Categories_by_year>
<http://dbpedia.org/resource/Category:Information>
<http://dbpedia.org/resource/Category:Literature>


In [90]:
dbpedia_data.get_feature(0)

{'text': "-30- (released as Deadline Midnight in the UK) is a 1959 movie starring William Conrad and Jack Webb as the editor and publisher, respectively, of a fictional Los Angeles evening newspaper. As the shift of a typical day starts, in which they don't know what will happen, the newspaper is created before our eyes as different stories are discovered and reported.", 'ids': [1011, 2382, 1011, 1006, 2207, 2004, 15117, 7090, 1999, 1996, 2866, 1007, 2003, 1037, 3851, 3185, 4626, 2520, 10931, 1998, 2990, 10923, 2004, 1996, 3559, 1998, 6674, 1010, 4414, 1010, 1997, 1037, 7214, 3050, 3349, 3944, 3780, 1012, 2004, 1996, 5670, 1997, 1037, 5171, 2154, 4627, 1010, 1999, 2029, 2027, 2123, 1005, 1056, 2113, 2054, 2097, 4148, 1010, 1996, 3780, 2003, 2580, 2077, 2256, 2159, 2004, 2367, 3441, 2024, 3603, 1998, 2988, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [24]:
model.cuda()

RuntimeError: CUDA error: device-side assert triggered

In [31]:
torch.cuda.synchronize()

RuntimeError: cuda runtime error (59) : device-side assert triggered at /Users/hengchu/Documents/work/pytorch/torch/csrc/cuda/Module.cpp:195

In [11]:
model.max_seq_len

256

In [12]:
nn.Linear(100, 10).cuda()

Linear(in_features=100, out_features=10, bias=True)

In [14]:
swap_linear_layer(model, num_coarse_classes)

Classifier(
  (encoder): Encoder(
    (embed): WordEmbedding(
      (embed): Embedding(30522, 50)
    )
    (pe): PositionEncoding()
    (encoder_layers): ModuleList(
      (0): EncoderLayer(
        (attention): MultiheadAttention(
          (q_linear): Linear(in_features=50, out_features=50, bias=True)
          (v_linear): Linear(in_features=50, out_features=50, bias=True)
          (k_linear): Linear(in_features=50, out_features=50, bias=True)
          (dropout): Dropout(p=0.1)
          (out): Linear(in_features=50, out_features=50, bias=True)
        )
        (norm1): Normalization()
        (ff): FeedForward(
          (fc1): Linear(in_features=50, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1)
          (fc2): Linear(in_features=1024, out_features=50, bias=True)
        )
        (norm2): Normalization()
        (drop1): Dropout(p=0.1)
        (drop2): Dropout(p=0.1)
      )
      (1): EncoderLayer(
        (attention): MultiheadAttention(
          (q_line

In [8]:
model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_fine_classes)
model = model.cuda()

In [9]:
model.load_state_dict(torch.load('./checkpoints/transformer-bootstrap2-s256-e50-h5-l6.epoch-9.step-265848.pth'))