In [1]:
#Get dataset files, load dependencies, setup tensorboard, define helper functions

# Mount Google Drive at /content/drive; later cells read data files from "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
  

# Install tensorboardX and fetch the pytorch-pretrained-BERT sources
# (the cloned directory is added to sys.path before the imports that follow).
!pip3 install tensorboardX
!git clone https://github.com/huggingface/pytorch-pretrained-BERT/
  

import sys
sys.path.insert(0, './cis700project')
sys.path.insert(0, './pytorch-pretrained-BERT')
import torch
import math
import subprocess
import re
import linecache
import torch.utils.data as data
import time
import importlib
import random
from tqdm import tqdm
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.autograd import Variable
from torch.optim import Adam
from tensorboardX import SummaryWriter
from pytorch_pretrained_bert.tokenization import BertTokenizer
from multiprocessing import Pool
from cis700 import utils

def build_tokenizer():
    """Create a BertTokenizer from the project's bert-base-uncased vocabulary file."""
    vocab_path = './cis700project/cis700/vocab/bert-base-uncased-vocab.txt'
    return BertTokenizer(vocab_path)

def category_text_to_id(cat_map, cat_text, count):
  """Resolve a category label to its integer id, assigning a new id if needed.

  Args:
    cat_map: dict mapping label text to id. A new entry is added in place when
      cat_text has not been seen before.
    cat_text: the category label to resolve.
    count: the id to hand out to the next unseen label.

  Returns:
    A tuple (category id, next free id). The second element equals count for a
    known label and count + 1 when a new id was just assigned.
  """
  if cat_text not in cat_map:
    cat_map[cat_text] = count
    count += 1
  return cat_map[cat_text], count

def count_lines(filepath):
  """Return the number of lines in the file at filepath.

  The file is streamed in binary mode and split on newline bytes, so no
  external `wc` binary is needed and the whole file is never held in memory.
  A final line that lacks a trailing newline is counted, which matches what
  `for line in f` yields when the same file is iterated.

  Args:
    filepath: path of the file to inspect.

  Returns:
    The line count as an int (0 for an empty file).

  Raises:
    OSError: if the file cannot be opened (e.g. FileNotFoundError).
  """
  with open(filepath, 'rb') as f:
    return sum(1 for _ in f)

# These need to be module-level globals for multiprocessing: convert_to_features
# assigns them before creating its worker Pool, and _truncate/_process read them.
_tokenizer   = None
_max_seq_len = None

class Feature:
  """Plain record for one example: raw text, token ids, masks and category labels."""

  def __init__(self, text, ids, masks, fine_cat, coarse_cat, fine_cat_text, coarse_cat_text):
    # Attributes are assigned in this order because __repr__ prints the
    # instance dict as-is.
    self.text, self.ids, self.masks = text, ids, masks
    self.fine_cat, self.coarse_cat = fine_cat, coarse_cat
    self.fine_cat_text, self.coarse_cat_text = fine_cat_text, coarse_cat_text

  def __repr__(self):
    """Show every attribute as a dict literal."""
    return repr(vars(self))

def _truncate(ids):
  """Cap ids at the module-level _max_seq_len; shorter inputs are returned as-is."""
  return ids[0:_max_seq_len] if len(ids) > _max_seq_len else ids

def _process(datum):
  """Convert one raw data tuple into a Feature with fixed-length ids and masks.

  datum is indexed as (text, fine_cat, coarse_cat, fine_cat_text,
  coarse_cat_text). Relies on the module-level _tokenizer and _max_seq_len.
  """
  ids = _truncate(_tokenizer.convert_tokens_to_ids(_tokenizer.tokenize(datum[0])))
  masks = [1] * len(ids)
  # Right-pad with zeros up to _max_seq_len; a non-positive pad count adds nothing.
  pad = _max_seq_len - len(ids)
  ids.extend([0] * pad)
  masks.extend([0] * pad)
  return Feature(datum[0], ids, masks, datum[1], datum[2], datum[3], datum[4])

def convert_to_features(data_list, max_seq_len, num_processes=6):
  """Tokenize and pad every item of data_list in a pool of worker processes.

  The tokenizer and the maximum sequence length are published through the
  module-level globals _tokenizer and _max_seq_len before the pool is created
  so that _process can read them inside the workers.
  NOTE(review): this presumably relies on a fork-based start method so the
  workers inherit those globals -- confirm before running on a platform that
  spawns fresh interpreters.

  Args:
    data_list: iterable of raw data tuples accepted by _process.
    max_seq_len: length every ids/masks list is truncated or padded to.
    num_processes: number of worker processes (default 6, the previous
      hard-coded value).

  Returns:
    A list of Feature objects in the same order as data_list.
  """
  global _tokenizer
  global _max_seq_len

  _tokenizer = build_tokenizer()
  _max_seq_len = max_seq_len

  try:
    with Pool(processes=num_processes) as pool:
      # list() drains the lazy imap iterator while the pool is still alive.
      return list(pool.imap(_process, tqdm(data_list)))
  finally:
    # Previously these resets sat after the return and never ran; `finally`
    # clears the globals on success and on failure.
    _tokenizer = None
    _max_seq_len = None
  
class DBPediaDataset(Dataset):
  """Dataset of English text literals labelled with a fine and a coarse category.

  Each input line must contain a quoted literal ending in `"@en` and end with
  `. <fine category> <coarse category>`. Category strings are mapped to
  consecutive integer ids in order of first appearance, and every example is
  tokenized and padded to max_seq_len by convert_to_features.
  """

  def __init__(self, filepath, max_seq_len):
    """Parse filepath and build the feature list.

    Args:
      filepath: path to the text file to parse, one example per line.
      max_seq_len: fixed length of the ids/masks lists of every example.

    Raises:
      ValueError: if a line does not contain the expected text literal or
        category pair.
    """
    self.fine_cat_map = {}
    fine_cat_count = 0
    self.coarse_cat_map = {}
    coarse_cat_count = 0
    self.data = []

    text_re = r'(".+"@en)'
    cat_re = r'\. (<[^<>]+>) (<[^<>]+>)$'
    text_suffix = '"@en'

    with open(filepath, 'r') as f:
      for line_no, line in enumerate(tqdm(f, total=count_lines(filepath)), start=1):
        text_match = re.search(text_re, line)
        cat_match = re.search(cat_re, line)
        if text_match is None or cat_match is None:
          raise ValueError('%s:%d: line does not match the expected format' % (filepath, line_no))
        # The match always starts with `"` and ends with `"@en`, so slice those
        # off exactly. str.strip('"@en') treats its argument as a character
        # set and would also eat leading/trailing e, n, @ and " from the text.
        text = text_match.group(1)[1:-len(text_suffix)]
        fine_cat = cat_match.group(1)
        coarse_cat = cat_match.group(2)
        fine_cat_id, fine_cat_count = category_text_to_id(self.fine_cat_map, fine_cat, fine_cat_count)
        coarse_cat_id, coarse_cat_count = category_text_to_id(self.coarse_cat_map, coarse_cat, coarse_cat_count)
        self.data.append((text, fine_cat_id, coarse_cat_id, fine_cat, coarse_cat))

    # Inverse lookups (id -> category text); ids must be unique per map.
    self.reverse_fine_cat_map = {}
    self.reverse_coarse_cat_map = {}
    for k, v in self.fine_cat_map.items():
      assert v not in self.reverse_fine_cat_map
      self.reverse_fine_cat_map[v] = k
    for k, v in self.coarse_cat_map.items():
      assert v not in self.reverse_coarse_cat_map
      self.reverse_coarse_cat_map[v] = k

    # Replace the raw tuples with Feature objects.
    self.data = convert_to_features(self.data, max_seq_len)

  def fine_id2cat(self, id):
    """Return the fine category text for an integer id."""
    return self.reverse_fine_cat_map[id]

  def coarse_id2cat(self, id):
    """Return the coarse category text for an integer id."""
    return self.reverse_coarse_cat_map[id]

  def num_fine_cats(self):
    """Return the number of distinct fine categories."""
    return len(self.reverse_fine_cat_map)

  def num_coarse_cats(self):
    """Return the number of distinct coarse categories."""
    return len(self.reverse_coarse_cat_map)

  def __len__(self):
    return len(self.data)

  def get_feature(self, idx):
    """Return the full Feature record (including raw text) at idx."""
    return self.data[idx]

  def __getitem__(self, idx):
    """Return (ids, masks, fine category id, coarse category id, idx).

    ids and masks are built with torch.Tensor, i.e. as float tensors.
    """
    dt = self.data[idx]
    return torch.Tensor(dt.ids), torch.Tensor(dt.masks), dt.fine_cat, dt.coarse_cat, idx
  
# Launch TensorBoard in the background, reading event files from LOG_DIR and serving on port 6006.
LOG_DIR = './logs'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

# Download and unzip ngrok unless a file named "ngrok" already exists in the working directory.
!if [ -f ngrok ] ; then echo "Ngrok already installed" ; else wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip > /dev/null 2>&1 && unzip ngrok-stable-linux-amd64.zip > /dev/null 2>&1 ; fi
  
  # Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
import tensorflow as tf
import numpy as np
import scipy.misc 
try:
    from StringIO import StringIO  # Python 2.7
except ImportError:
    from io import BytesIO         # Python 3.x


class Logger(object):
    """Thin TensorBoard logger that writes scalar, image and histogram summaries.

    Built on the `tf.summary.FileWriter` / `tf.Summary` API.
    """

    def __init__(self, log_dir):
        """Create a summary writer logging to log_dir."""
        self.writer = tf.summary.FileWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        """Log a scalar variable."""
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def image_summary(self, tag, images, step):
        """Log a list of images.

        Each image is PNG-encoded and stored under the tag '<tag>/<index>'.
        """
        # PNG data is binary, so an in-memory bytes buffer is always the right
        # target. The previous `try: StringIO() except: BytesIO()` relied on a
        # bare except swallowing a NameError for the never-imported StringIO.
        from io import BytesIO

        img_summaries = []
        for i, img in enumerate(images):
            # Write the image to an in-memory buffer
            s = BytesIO()
            # NOTE(review): scipy.misc.toimage is, as far as I know, absent
            # from recent SciPy releases -- confirm the installed version
            # still provides it.
            scipy.misc.toimage(img).save(s, format="png")

            # Create an Image object
            img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
                                       height=img.shape[0],
                                       width=img.shape[1])
            # Create a Summary value
            img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum))

        # Create and write Summary
        summary = tf.Summary(value=img_summaries)
        self.writer.add_summary(summary, step)

    def histo_summary(self, tag, values, step, bins=1000):
        """Log a histogram of the tensor of values."""

        # Create a histogram using numpy
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill the fields of the histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        # Drop the start of the first bin
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary; this is the only method that flushes the writer.
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()

## Required packages
# Installs a fixed PyTorch wheel plus torchvision, then selects the compute device.
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
# NOTE(review): `platform` and `accelerator` are computed above, but the wheel URL below is hard-coded
# (cu100, cp36) and does not reference them.
!pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1-cp36-cp36m-linux_x86_64.whl
!pip3 install torchvision
  
import torch
# Use the first CUDA device when available, otherwise fall back to the CPU.
device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/5c/76/89dd44458eb976347e5a6e75eb79fecf8facd46c1ce259bad54e0044ea35/tensorboardX-1.6-py2.py3-none-any.whl (129kB)
[K    100% |████████████████████████████████| 133kB 3.8MB/s 
Installing collected packages: tensorboardX
Successfully installed tensorboardX-1.6
Cloning into 'cis700project'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[

cuda:0


In [9]:
# Start an ngrok tunnel to port 6006 in the background, then read the tunnel list from
# ngrok's local API (port 4040) and print the first tunnel's public URL.
get_ipython().system_raw('./ngrok http 6006 &')
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print('Tensorboard Link: ' +str(json.load(sys.stdin)['tunnels'][0]['public_url']))"

Tensorboard Link: https://93e95796.ngrok.io


### Network Architecture

In [0]:
import os

class WordEmbedding(nn.Module):
  """Lookup table mapping token ids to vectors of size dim_embedding."""

  def __init__(self, vocab_size, dim_embedding):
    super().__init__()
    self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim_embedding)

  def forward(self, x):
    """Return the embedding vectors for the integer ids in x."""
    embedded = self.embed(x)
    return embedded
  
class PositionEncoding(nn.Module):
  """Adds a fixed sinusoidal position signal to scaled token embeddings.

  The table is precomputed for max_seq_len positions and stored as the
  non-trainable buffer `pe` of shape (1, max_seq_len, dim_embedding), so it is
  saved in the state dict and moves with the module on .to()/.cuda().
  """

  def __init__(self, dim_embedding, max_seq_len):
    """Precompute the position table.

    Args:
      dim_embedding: size of each embedding vector.
      max_seq_len: number of positions to precompute.
    """
    super(PositionEncoding, self).__init__()
    self.dim_embedding = dim_embedding

    # NOTE(review): the exponents use 2*i and 2*(i+1) although i already
    # advances in steps of 2, which looks different from the usual
    # sin/cos(pos / 10000^(2k/d)) formulation. Left unchanged on purpose so the
    # values match the table used by already trained checkpoints -- confirm
    # whether this is intended.
    pe = torch.zeros(max_seq_len, dim_embedding)
    for pos in range(max_seq_len):
      for i in range(0, dim_embedding, 2):
        pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / dim_embedding)))
        # With an odd dim_embedding the last sine has no cosine partner.
        if i + 1 < dim_embedding:
          pe[pos, i+1] = math.cos(pos / (10000 ** ((2 * (i+1)) / dim_embedding)))

    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Scale x by sqrt(dim_embedding) and add the encoding of its first x.size(1) positions.

    x is indexed as (batch, sequence, embedding): dimension 1 selects the
    positions and the table broadcasts over dimension 0.
    """
    x = x * math.sqrt(self.dim_embedding)
    x_len = x.size(1)
    # Follow the input's device instead of forcing CUDA, so the module also
    # works on CPU; buffers never require grad, so no Variable wrapper is needed.
    return x + self.pe[:, :x_len].to(x.device)
  
class MultiheadAttention(nn.Module):
  """Scaled dot-product attention split over num_heads heads.

  The `out` linear layer is created (and therefore part of the state dict)
  but forward() does not apply it.
  """

  def __init__(self, num_heads, dim_embedding, dropout = 0.1):
    super().__init__()

    head_dim, leftover = divmod(dim_embedding, num_heads)
    if leftover != 0:
      raise ValueError('num_heads should divide dim_embedding evenly! num_heads = %d, dim_embedding = %d' \
                       % (num_heads, dim_embedding))
    self.dim_embedding = dim_embedding
    self.dim_k = int(head_dim)
    self.num_heads = num_heads

    # Creation order (q, v, k, dropout, out) is kept so parameter registration
    # and random initialisation are unchanged.
    self.q_linear = nn.Linear(dim_embedding, dim_embedding)
    self.v_linear = nn.Linear(dim_embedding, dim_embedding)
    self.k_linear = nn.Linear(dim_embedding, dim_embedding)
    self.dropout = nn.Dropout(dropout)
    self.out = nn.Linear(dim_embedding, dim_embedding)

  def attention(self, q, v, k, mask):
    """Return the attention-weighted sum of v; positions where mask == 0 get a -1e9 score."""
    logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.dim_k)
    # Two extra axes let the mask broadcast against the 4-D score tensor.
    blocked = mask.unsqueeze(1).unsqueeze(1) == 0
    logits = logits.masked_fill(blocked, -1e9)
    weights = self.dropout(F.softmax(logits, dim=-1))
    return torch.matmul(weights, v)

  def _split_heads(self, projected, batch_size):
    """Reshape (batch, seq, dim_embedding) into (batch, num_heads, seq, dim_k)."""
    return projected.view(batch_size, -1, self.num_heads, self.dim_k).transpose(1, 2)

  def forward(self, q, v, k, mask):
    """Project q, v, k, attend per head and concatenate the heads again."""
    batch_size = q.size(0)

    q = self._split_heads(self.q_linear(q), batch_size)
    v = self._split_heads(self.v_linear(v), batch_size)
    k = self._split_heads(self.k_linear(k), batch_size)

    per_head = self.attention(q, v, k, mask)
    merged = per_head.transpose(1, 2).contiguous()
    return merged.view(batch_size, -1, self.dim_embedding)
  
class FeedForward(nn.Module):
  """Two-layer MLP: Linear -> ReLU -> Dropout -> Linear back to dim_embedding."""

  def __init__(self, dim_embedding, num_features, dropout = 0.1):
    super().__init__()
    self.fc1 = nn.Linear(dim_embedding, num_features)
    self.dropout = nn.Dropout(dropout)
    self.fc2 = nn.Linear(num_features, dim_embedding)

  def forward(self, x):
    """Apply the MLP to the last dimension of x."""
    hidden = self.dropout(F.relu(self.fc1(x)))
    return self.fc2(hidden)
  
class Normalization(nn.Module):
  """Normalizes the last dimension, then applies a learnable scale (alpha) and shift (bias)."""

  def __init__(self, dim_embedding, eps = 1e-6):
    super().__init__()

    self.dim_embedding = dim_embedding
    self.alpha = nn.Parameter(torch.ones(self.dim_embedding))
    self.bias = nn.Parameter(torch.zeros(self.dim_embedding))
    self.eps = eps

  def forward(self, x):
    """Return alpha * (x - mean) / (std + eps) + bias, with statistics over dim -1."""
    centered = x - x.mean(dim=-1, keepdim=True)
    # eps is added to the standard deviation itself to avoid dividing by zero.
    spread = x.std(dim=-1, keepdim=True) + self.eps
    return self.alpha * centered / spread + self.bias
  
class EncoderLayer(nn.Module):
  """One pre-norm encoder block: self-attention and feed-forward, each with a residual connection."""

  def __init__(self, num_heads, dim_embedding, ff_num_features, dropout=0.1):
    """Build the block.

    Args:
      num_heads: number of attention heads.
      dim_embedding: size of the embedding vectors.
      ff_num_features: hidden size of the feed-forward sub-layer.
      dropout: dropout probability. It is now forwarded to the attention and
        feed-forward sub-modules as well; previously those always used their
        own default of 0.1, so behaviour with the default is unchanged.
    """
    super(EncoderLayer, self).__init__()
    self.attention = MultiheadAttention(num_heads, dim_embedding, dropout)
    self.norm1 = Normalization(dim_embedding)
    self.ff = FeedForward(dim_embedding, ff_num_features, dropout)
    self.norm2 = Normalization(dim_embedding)
    self.drop1 = nn.Dropout(dropout)
    self.drop2 = nn.Dropout(dropout)

  def forward(self, x, mask):
    """Return x after the attention and feed-forward residual updates."""
    # Normalisation is applied to the sub-layer input only; the residual path
    # carries the un-normalised x.
    x_ = self.norm1(x)
    x = x + self.drop1(self.attention(x_, x_, x_, mask))
    x_ = self.norm2(x)
    x = x + self.drop2(self.ff(x_))
    return x
  
class Encoder(nn.Module):
  """Token embedding + position encoding, a stack of EncoderLayers and a final Normalization."""

  def __init__(self, vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len):
    super().__init__()
    self.num_encoder_layers = num_encoder_layers
    self.embed = WordEmbedding(vocab_size, dim_embedding)
    self.pe = PositionEncoding(dim_embedding, max_seq_len)
    stack = []
    for _ in range(num_encoder_layers):
      stack.append(EncoderLayer(num_heads, dim_embedding, ff_num_features))
    self.encoder_layers = nn.ModuleList(stack)
    self.norm = Normalization(dim_embedding)

  def forward(self, x, mask):
    """Encode the token ids x; mask is handed to every encoder layer."""
    hidden = self.pe(self.embed(x))
    for layer in self.encoder_layers:
      hidden = layer(hidden, mask)
    return self.norm(hidden)
  
class Classifier(nn.Module):
  """Encoder followed by one linear layer over the flattened sequence output.

  The linear layer takes dim_embedding * max_seq_len inputs, so every input
  must be padded to exactly max_seq_len tokens.
  """

  def __init__(self,
               vocab_size, dim_embedding, num_heads,
               ff_num_features, num_encoder_layers,
               max_seq_len, num_classes):
    super().__init__()
    self.dim_embedding = dim_embedding
    self.max_seq_len = max_seq_len
    self.encoder = Encoder(vocab_size, dim_embedding, num_heads, ff_num_features, num_encoder_layers, max_seq_len)
    self.fc = nn.Linear(dim_embedding * max_seq_len, num_classes)

  def forward(self, x, mask):
    """Return one row of num_classes scores per example."""
    encoded = self.encoder(x, mask)
    flat = encoded.view(-1, self.dim_embedding * self.max_seq_len)
    return self.fc(flat)
  
def initialize_model(model):
  """Apply Xavier-uniform initialisation in place to every parameter with more than one dimension."""
  matrices = (p for p in model.parameters() if p.dim() > 1)
  for weight in matrices:
    nn.init.xavier_uniform_(weight)
      
def save_model(model, name, epoch, step):
  """Save model.state_dict() under ./checkpoints and return the path of the written file.

  The directory is created when missing; the file name encodes name, epoch
  and step.
  """
  checkpoint_dir = "./checkpoints"
  if not os.path.isdir(checkpoint_dir):
    os.mkdir(checkpoint_dir)
  filename = './checkpoints/%s.epoch-%d.step-%d.pth' % (name, epoch, step)
  weights = model.state_dict()
  torch.save(weights, filename)
  return filename

def human_readable_prediction(text, model, max_seq_len, cat_id2text_fun, top_k=5):
  """Return the top-scoring categories for a raw text, best first.

  Args:
    text: raw input string.
    model: classifier called as model(ids, masks) that returns one row of
      class scores per example. It is switched to eval mode and left there.
    max_seq_len: length the token id list is truncated or zero-padded to.
    cat_id2text_fun: callable mapping an integer class id to its label.
    top_k: number of categories to return (default 5, the previous fixed
      value); capped at the number of classes the model scores.

  Returns:
    A list of labels as produced by cat_id2text_fun.
  """
  tok = build_tokenizer()
  ids = tok.convert_tokens_to_ids(tok.tokenize(text))[:max_seq_len]
  padding = [0] * (max_seq_len - len(ids))
  masks = [1] * len(ids) + padding
  ids = ids + padding

  # Run on whatever device the model lives on instead of assuming CUDA.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  model.eval()
  # Build the ids directly as integers rather than via a float tensor.
  ids = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)
  masks = torch.tensor(masks, dtype=torch.float, device=device).unsqueeze(0)
  with torch.no_grad():  # inference only; no autograd graph needed
    scores = model(ids, masks).squeeze(0)
  top_k = min(top_k, scores.size(0))
  _, max_cat_ids = torch.topk(scores, k = top_k, dim = 0)
  return [cat_id2text_fun(int(cat_id)) for cat_id in max_cat_ids]


### Dataset/Loader

In [4]:
# Build label -> id tables from two CSV files on the mounted drive. In each file the first
# comma-separated field of a line is the label and its id is the zero-based line number.
# NOTE(review): the "subcategory" table is read from supercatstats.csv -- confirm the naming is intended.
subcategory_size = 0
subcategory2id = {}
with open('/content/drive/My Drive/supercatstats.csv', 'r') as f:
  # build stats
  for index,line in enumerate(f):
    subcategory2id[line.split(',')[0]]=subcategory_size
    subcategory_size += 1
    
print(subcategory_size)

category_size = 0
category2id = {}
with open('/content/drive/My Drive/catstats.csv', 'r') as f:
  # build stats
  for index,line in enumerate(f):
    category2id[line.split(',')[0]]=category_size
    category_size += 1
    
print(category_size)

# Module-level tokenizer used by AbstractDataset.__getitem__.
tok = build_tokenizer()

class AbstractDataset(data.Dataset):
  """DBpedia abstracts labelled with a category and a subcategory.

  All lines of the input file are read into memory once; tokenization happens
  lazily in __getitem__ using the module-level tokenizer `tok` and the
  module-level label tables `category2id` / `subcategory2id`.
  """

  def __init__(self, file = "joinedlonabstract_en.nt", root_dir = "/content/drive/My Drive", max_seq_len = 256):
    """
    Args:
      file (string): Name of the .nt file inside root_dir.
      root_dir (string): Directory that contains the file.
      max_seq_len (int): Maximum number of tokens kept per abstract
        (default 256, the previous hard-coded value).
    """
    # `with` closes the handle once the lines are read (it used to be leaked).
    # The former linecache.getline() call was dropped: its result was unused.
    with open(root_dir + '/' + file) as f:
      self.fullfile = f.readlines()
    self.fullfile_len = len(self.fullfile)
    self.root_dir = root_dir
    self.max_seq_len = max_seq_len
    self.category2id = category2id
    self.subcategory2id = subcategory2id

  def __len__(self):
    return self.fullfile_len

  def __getitem__(self, index):
    """Return (token ids, subcategory id, category id) for line `index`.

    The token ids come back as a float tensor (torch.Tensor) of at most
    max_seq_len entries; the two labels are 0-dim tensors.
    """
    line = self.fullfile[index]
    # Every "<http://dbpedia.org/..." chunk up to the next "<": chunk 1 holds
    # the abstract, chunk 2 the category and chunk 3 the subcategory.
    match = re.findall(r'(<http://dbpedia.org/[^<]+)', line)
    category = match[2].replace('<http://dbpedia.org/resource/Category:','').replace('>','').replace('\n','').strip()
    subcategory = match[3].replace('<http://dbpedia.org/resource/Category:','').replace('>','').replace('\n','').strip()
    abstract = match[1].replace('<http://dbpedia.org/property/abstract> "','').replace('"@en .',"").replace('\n','').strip()
    tokens = tok.tokenize(abstract)[:self.max_seq_len]
    ids = tok.convert_tokens_to_ids(tokens)
    return torch.Tensor(ids), torch.tensor(self.subcategory2id[subcategory]), torch.tensor(self.category2id[category])

  
def collate_fn(data, max_len=256):
    #Adapted from https://github.com/yunjey/seq2seq-dataloader/blob/master/data_loader.py
    """Creates mini-batch tensors from a list of (src_seq, subcategory, category) tuples.

    A custom collate_fn is needed because the default one cannot merge
    variable-length sequences. Every sequence is zero-padded (and, if longer,
    truncated) to a fixed width of max_len.

    Args:
        data: list of tuples (src_seq, subcategory, category).
            - src_seq: 1D torch tensor of token ids; variable length.
            - subcategory: 0-dim tensor holding the subcategory id.
            - category: 0-dim tensor holding the category id.
          The list itself is not modified.
        max_len: width of the padded batch (default 256, the previous
          hard-coded value).

    Returns:
        src_seqs: long tensor of shape (batch_size, max_len).
        src_masks: long tensor of the same shape; 1 on real tokens, 0 on padding.
        subcategory: stacked subcategory ids, shape (batch_size,).
        category: stacked category ids, shape (batch_size,).
        Rows are ordered by descending sequence length.
    """
    # sorted() keeps the caller's list untouched; like list.sort() it is
    # stable, so ties keep their incoming order.
    ordered = sorted(data, key=lambda x: len(x[0]), reverse=True)

    # separate source sequences and labels
    src_seqs, subcategory, category = zip(*ordered)

    padded_seqs = torch.zeros(len(src_seqs), max_len).long()
    mask = torch.zeros(len(src_seqs), max_len).long()
    for i, seq in enumerate(src_seqs):
        # Clamp so an over-long sequence is truncated instead of raising a
        # shape-mismatch error on assignment.
        end = min(len(seq), max_len)
        padded_seqs[i, :end] = seq[:end]
        mask[i, :end] = 1

    return padded_seqs, mask, torch.stack(subcategory), torch.stack(category)

180
370


In [5]:
#Sanity check
# Load the dataset with its default file, print the first raw item, then pull one
# collated batch of size 1 and print its padded ids (a) and mask (b).
text_dataset = AbstractDataset()
print(text_dataset[0])
loader_train = torch.utils.data.DataLoader(text_dataset,shuffle=False, batch_size=1, collate_fn=collate_fn,num_workers=3,pin_memory=True)
data_iter = iter(loader_train)
# collate_fn returns (ids, mask, subcategory, category).
a,b,c,d = next(data_iter)
print(a)
print(b)

(tensor([ 1011.,  2382.,  1011.,  1006.,  2207.,  2004., 15117.,  7090.,  1999.,
         1996.,  2866.,  1007.,  2003.,  1037.,  3851.,  3185.,  4626.,  2520.,
        10931.,  1998.,  2990., 10923.,  2004.,  1996.,  3559.,  1998.,  6674.,
         1010.,  4414.,  1010.,  1997.,  1037.,  7214.,  3050.,  3349.,  3944.,
         3780.,  1012.,  2004.,  1996.,  5670.,  1997.,  1037.,  5171.,  2154.,
         4627.,  1010.,  1999.,  2029.,  2027.,  2123.,  1005.,  1056.,  2113.,
         2054.,  2097.,  4148.,  1010.,  1996.,  3780.,  2003.,  2580.,  2077.,
         2256.,  2159.,  2004.,  2367.,  3441.,  2024.,  3603.,  1998.,  2988.,
         1012.]), tensor(0), tensor(0))
tensor([[ 1011,  2382,  1011,  1006,  2207,  2004, 15117,  7090,  1999,  1996,
          2866,  1007,  2003,  1037,  3851,  3185,  4626,  2520, 10931,  1998,
          2990, 10923,  2004,  1996,  3559,  1998,  6674,  1010,  4414,  1010,
          1997,  1037,  7214,  3050,  3349,  3944,  3780,  1012,  2004,  1996,
   

### Train base model

In [6]:
import time, datetime
import torch.nn as nn
from google.colab import files

#Hyperparameters
max_seq_len = 256
dim_embedding = 50
num_heads = 5
num_encoder_layers = 6
ff_num_features = 1024
# The vocabulary size is taken as the number of lines of the vocab file.
vocab_size = count_lines('./cis700project/cis700/vocab/bert-base-uncased-vocab.txt')
batch_size = 128
num_epochs = 10
lr = 1e-3

# Used as the checkpoint file-name prefix when the model is saved.
network_name = 'transformer-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)


# A timestamp keeps the TensorBoard log directories of separate runs apart.
now = time.mktime(datetime.datetime.now().timetuple())
logger = Logger(f'./logs/Transformer_{now}/')
logger_val = Logger(f'./logs/Transformer_eval_{now}/')

# gather number of categories
num_fine_classes = category_size
num_coarse_classes = subcategory_size

#Seed
torch.manual_seed(0)

# 80/10/10 train/validation/test split; the test share takes the rounding remainder
# so the three lengths sum to len(text_dataset).
text_dataset = AbstractDataset()
lengths = [int(len(text_dataset)*0.8), int(len(text_dataset)*0.1), int(len(text_dataset)) - 
           int(len(text_dataset)*0.8) - int(len(text_dataset)*0.1)]
text_dataset_train, text_dataset_val, text_dataset_test = torch.utils.data.random_split(text_dataset, lengths)

loader_train = torch.utils.data.DataLoader(text_dataset_train,shuffle=True, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True)
loader_test = torch.utils.data.DataLoader(text_dataset_test,shuffle=True, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True)
torch.backends.cudnn.enabled = True

# The base model predicts the fine-grained label (num_fine_classes outputs).
model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_fine_classes)
initialize_model(model)
model = model.cuda()

#Loss and optimizer
loss_fun = torch.nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

def validate(num_train_steps):
  """Score the model on one randomly drawn validation batch.

  Logs loss and accuracy to logger_val at step num_train_steps + 1, prints the
  accuracy, and puts the model back into training mode afterwards.
  """
  # One batch of batch_size examples sampled with replacement from the validation split.
  sampler = torch.utils.data.RandomSampler(text_dataset_val, num_samples=batch_size, replacement=True)
  loader_val = torch.utils.data.DataLoader(text_dataset_val, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True, sampler=sampler)
  batches = iter(loader_val)
  model.eval()
  with torch.no_grad():
    abstract, mask, subcategory, category = next(batches)
    abstract = abstract.to(device)
    subcategory = subcategory.to(device)
    category = category.to(device)
    mask = mask.to(device)

    result = model(abstract,mask)
    loss = loss_fun(result, category)
    predicted = torch.max(result, 1)[1]
    count_total = category.size(0)
    count_correct = (predicted == category).sum().item()
    accuracy = (category == predicted).float().mean()

    #Tensorboard logging
    logger_val.scalar_summary('loss', loss.item(), num_train_steps+1)
    logger_val.scalar_summary('accuracy', accuracy.item(), num_train_steps+1)

  print('The validation accuracy is: %s%% [%s]' % (count_correct/count_total * 100,batch_size))
  model.train()

def evaluate():
  """Compute and print the model's accuracy over the whole test loader.

  The number in brackets is the count of examples actually evaluated (it used
  to be epoch_length * batch_size, which overstates a partial last batch).
  The model is returned to training mode afterwards, even if evaluation fails.
  """
  model.eval()
  count_correct = 0
  count_total = 0
  try:
    with torch.no_grad():
      for abstract, mask, subcategory, category in loader_test:
        abstract = abstract.to(device)
        category = category.to(device)
        mask = mask.to(device)

        result = model(abstract, mask)
        _, argmax = torch.max(result, 1)
        count_total += category.size(0)
        count_correct += (argmax == category).sum().item()

    # Guard against an empty test loader instead of dividing by zero.
    test_accuracy = count_correct/count_total * 100 if count_total else 0.0
    print('The testing set accuracy is: %s%% [%s]' % (test_accuracy, count_total))
  finally:
    model.train()
  
#Training loop
# Trains the base model on the fine-grained label (`category`) for num_epochs epochs,
# logging every step to TensorBoard and running evaluate() after each epoch.
model.train()
num_train_steps = 0
for epoch in range(num_epochs):
    #Get an epoch
    get_data_step = iter(loader_train)
    epoch_length = len(loader_train)
    for train_step in range(epoch_length):
      #start = time.time()
      num_train_steps += 1
      
      abstract, mask, subcategory, category = next(get_data_step)
      #print(abstract)
      abstract = abstract.to(device)
      subcategory = subcategory.to(device)
      category = category.to(device)
      mask = mask.to(device)
    
      # Do a forward pass
      #start = time.time()
      result = model(abstract, mask)
      #end = time.time()
      loss = loss_fun(result, category)
    
      # Now backpropagate
      optimizer.zero_grad()
      #start = time.time()
      loss.backward()
      #end = time.time()
      
      optimizer.step()

      # Find the accuracy
      _, argmax = torch.max(result,1)
      accuracy = (category == argmax).float().mean()
      
      #Print
      # Fires when num_train_steps is 99, 199, ... because of the "+ 1".
      if (num_train_steps + 1) % 100 == 0: 
        print('Epoch: [% d/% d], Step: [% d/% d], Loss: %.4f, Accuracy: %4f'
          % (epoch + 1, num_epochs, num_train_steps, len(loader_train) * num_epochs, loss.item(), accuracy.item())) 
        validate(num_train_steps+1)
      #print(torch.cuda.memory_allocated(device))
      #print(torch.cuda.memory_cached(device))
      # NOTE(review): `end` is only read by the commented-out timing print below.
      end = time.time()
      #print(end-start)
      # Drop references to this step's tensors and release cached GPU memory every step.
      del abstract, mask, subcategory, category, result, argmax
      torch.cuda.empty_cache()
      
      # "% 1" is always 0, so this logs on every step.
      if num_train_steps % 1 == 0:  
        #Tensorboard logging
        to_log = {'loss': loss.item(), 'accuracy': accuracy.item()}
        for handle, val in to_log.items():
            logger.scalar_summary(handle, val, num_train_steps+1)
            
    evaluate()

Epoch: [ 1/ 10], Step: [ 99/ 86540], Loss: 6.1380, Accuracy: 0.039062
The validation accuracy is: 6.25% [128]
Epoch: [ 1/ 10], Step: [ 199/ 86540], Loss: 5.2292, Accuracy: 0.062500
The validation accuracy is: 12.5% [128]
Epoch: [ 1/ 10], Step: [ 299/ 86540], Loss: 4.5735, Accuracy: 0.195312
The validation accuracy is: 25.0% [128]
Epoch: [ 1/ 10], Step: [ 399/ 86540], Loss: 3.5834, Accuracy: 0.281250
The validation accuracy is: 28.90625% [128]
Epoch: [ 1/ 10], Step: [ 499/ 86540], Loss: 3.4190, Accuracy: 0.281250
The validation accuracy is: 29.6875% [128]
Epoch: [ 1/ 10], Step: [ 599/ 86540], Loss: 4.0454, Accuracy: 0.203125
The validation accuracy is: 27.34375% [128]
Epoch: [ 1/ 10], Step: [ 699/ 86540], Loss: 3.5594, Accuracy: 0.242188
The validation accuracy is: 24.21875% [128]
Epoch: [ 1/ 10], Step: [ 799/ 86540], Loss: 3.0185, Accuracy: 0.281250
The validation accuracy is: 32.03125% [128]
Epoch: [ 1/ 10], Step: [ 899/ 86540], Loss: 2.8580, Accuracy: 0.328125
The validation accuracy

KeyboardInterrupt: ignored

In [0]:
# Save the current weights under ./checkpoints and download the file through the Colab browser session.
# `epoch` and `num_train_steps` hold whatever values the training loop reached when it stopped.
filename = save_model(model, network_name, epoch, num_train_steps)
files.download(filename) 


### Transfer learning on training data

In [8]:
from functools import reduce
from google.colab import files
import datetime
#files.upload()

# gather number of categories
num_fine_classes = category_size
num_coarse_classes = subcategory_size

#Seed
torch.manual_seed(0)

#Settings
# These must match the architecture the checkpoint was trained with for load_state_dict to succeed.
max_seq_len = 256
dim_embedding = 50
num_heads = 5
num_encoder_layers = 6
ff_num_features = 1024
vocab_size = count_lines('./cis700project/cis700/vocab/bert-base-uncased-vocab.txt')
model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers, 
                   max_seq_len, num_fine_classes)
#checkpoint = torch.load('transformer-s256-e50-h5-l6.epoch-0.step-8654.pth')
checkpoint = torch.load('/content/drive/My Drive/transformer-fine-s256-e50-h5-l6.epoch-9.step-221540.pth')
model.load_state_dict(checkpoint)
model.eval()

total_params = sum([param.nelement() for param in model.parameters()])
train_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Original model trainable params %s/%s" % (train_params, total_params))

# new_model is the same object as model (no copy is made), so freezing its
# parameters freezes the loaded model in place.
new_model = model
for param in new_model.parameters():
    param.requires_grad = False

# Replace the classification head with a fresh layer sized for the coarse labels.
# It is created after the freeze, so it is the only part left trainable.
feature_cnt = new_model.fc.in_features
new_model.fc = torch.nn.Linear(feature_cnt, num_coarse_classes)
#print(num_coarse_classes)
total_params = sum([param.nelement() for param in new_model.parameters()])
train_params = sum(p.numel() for p in new_model.parameters() if p.requires_grad)
print("New model trainable params %s/%s" % (train_params, total_params))


#Hyperparameters
batch_size = 128
num_epochs = 5
lr = 1e-3

network_name = 'transformer-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)


now = time.mktime(datetime.datetime.now().timetuple())
logger = Logger(f'./logs/Transformer_transfer_{now}/')
logger_val = Logger(f'./logs/Transformer_transfer_eval_{now}/')

# gather number of categories
num_fine_classes = category_size
num_coarse_classes = subcategory_size

#Seed
torch.manual_seed(0)


# Same 80/10/10 split as the base run; the test share takes the rounding remainder.
text_dataset = AbstractDataset()
lengths = [int(len(text_dataset)*0.8), int(len(text_dataset)*0.1), int(len(text_dataset)) - 
           int(len(text_dataset)*0.8) - int(len(text_dataset)*0.1)]
text_dataset_train, text_dataset_val, text_dataset_test = torch.utils.data.random_split(text_dataset, lengths)

loader_train = torch.utils.data.DataLoader(text_dataset_train,shuffle=True, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True)
loader_test = torch.utils.data.DataLoader(text_dataset_test,shuffle=True, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True)
torch.backends.cudnn.enabled = True

new_model = new_model.cuda()

#Loss and optimizer
loss_fun = torch.nn.CrossEntropyLoss() 
# NOTE(review): the optimizer is handed every parameter of model, frozen ones included;
# presumably only the new fc layer ever receives gradients -- confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def validate(num_train_steps):
  """Score new_model on one randomly drawn validation batch of coarse labels.

  Logs loss and accuracy to logger_val at step num_train_steps + 1, prints the
  accuracy, and puts new_model back into training mode afterwards. The mode
  switches now target new_model, the network that is actually evaluated,
  rather than relying on `model` being an alias of it.
  """
  #Get validation data
  rand_sampler = torch.utils.data.RandomSampler(text_dataset_val, num_samples=batch_size, replacement=True)
  loader_val = torch.utils.data.DataLoader(text_dataset_val, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True, sampler=rand_sampler)
  get_data_step = iter(loader_val)
  new_model.eval()
  count_correct = 0
  count_total = 0
  with torch.no_grad():
    abstract, mask, subcategory, category = next(get_data_step)
    abstract = abstract.to(device)
    subcategory = subcategory.to(device)
    mask = mask.to(device)

    result = new_model(abstract,mask)
    loss = loss_fun(result, subcategory)
    _, argmax = torch.max(result, 1)
    count_total += subcategory.size(0)
    count_correct += (argmax == subcategory).sum().item()
    accuracy = (subcategory == argmax).float().mean()

    #Tensorboard logging
    to_log = {'loss': loss.item(), 'accuracy': accuracy.item()}
    for handle, val in to_log.items():
      logger_val.scalar_summary(handle, val, num_train_steps+1)

  print('The validation accuracy is: %s%% [%s]' % (count_correct/count_total * 100,batch_size))
  new_model.train()

def evaluate():
  """Compute and print new_model's coarse-label accuracy over the whole test loader.

  The number in brackets is the count of examples actually evaluated (it used
  to be epoch_length * batch_size, which overstates a partial last batch).
  Mode switches target new_model, the network being evaluated, and training
  mode is restored even if evaluation fails.
  """
  new_model.eval()
  count_correct = 0
  count_total = 0
  try:
    with torch.no_grad():
      for abstract, mask, subcategory, category in loader_test:
        abstract = abstract.to(device)
        subcategory = subcategory.to(device)
        mask = mask.to(device)

        result = new_model(abstract, mask)
        _, argmax = torch.max(result, 1)
        count_total += subcategory.size(0)
        count_correct += (argmax == subcategory).sum().item()

    # Guard against an empty test loader instead of dividing by zero.
    test_accuracy = count_correct/count_total * 100 if count_total else 0.0
    print('The testing set accuracy is: %s%% [%s]' % (test_accuracy, count_total))
  finally:
    new_model.train()

#Training loop
# Trains new_model for num_epochs passes over loader_train. Every step is
# logged to TensorBoard; roughly every 100 steps the progress is printed and a
# validation batch is scored; the test set is evaluated after each epoch.
new_model.train()
num_train_steps = 0
total_train_steps = len(loader_train) * num_epochs
for epoch in range(num_epochs):
    # Iterating the loader directly yields each batch of the epoch exactly once
    for abstract, mask, subcategory, category in loader_train:
        num_train_steps += 1

        # `category` is not used for this task, so it is left on the host
        abstract = abstract.to(device)
        subcategory = subcategory.to(device)
        mask = mask.to(device)

        # Do a forward pass
        result = new_model(abstract, mask)
        loss = loss_fun(result, subcategory)

        # Now backpropagate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Find the accuracy (from the predictions made before this update)
        _, argmax = torch.max(result, 1)
        accuracy = (subcategory == argmax).float().mean()

        # Read the scalars back once and reuse them for printing and logging
        loss_value = loss.item()
        accuracy_value = accuracy.item()

        #Print
        # The +1 makes this fire on steps 99, 199, 299, ...
        if (num_train_steps + 1) % 100 == 0:
            print('Epoch: [% d/% d], Step: [% d/% d], Loss: %.4f, Accuracy: %4f'
                  % (epoch + 1, num_epochs, num_train_steps, total_train_steps, loss_value, accuracy_value))
            # validate() adds +1 itself, so pass the raw counter: validation
            # scalars then share the x-axis position of the training scalars
            # logged below for this same step.
            validate(num_train_steps)

        # Drop the batch references before asking the allocator to release
        # its cached blocks
        del abstract, mask, subcategory, category, result, argmax
        torch.cuda.empty_cache()

        if num_train_steps % 1 == 0:
            #Tensorboard logging (interval of 1 means every step)
            to_log = {'loss': loss_value, 'accuracy': accuracy_value}
            for handle, val in to_log.items():
                logger.scalar_summary(handle, val, num_train_steps+1)

    evaluate()

Original model trainable params 6945814/6945814
New model trainable params 2304180/4513624
Epoch: [ 1/ 5], Step: [ 99/ 43270], Loss: 2.4529, Accuracy: 0.367188
The validation accuracy is: 32.03125% [128]
Epoch: [ 1/ 5], Step: [ 199/ 43270], Loss: 2.2901, Accuracy: 0.406250
The validation accuracy is: 34.375% [128]
Epoch: [ 1/ 5], Step: [ 299/ 43270], Loss: 2.4178, Accuracy: 0.414062
The validation accuracy is: 38.28125% [128]
Epoch: [ 1/ 5], Step: [ 399/ 43270], Loss: 2.2191, Accuracy: 0.468750
The validation accuracy is: 44.53125% [128]
Epoch: [ 1/ 5], Step: [ 499/ 43270], Loss: 2.1496, Accuracy: 0.468750
The validation accuracy is: 32.8125% [128]
Epoch: [ 1/ 5], Step: [ 599/ 43270], Loss: 2.5263, Accuracy: 0.367188
The validation accuracy is: 32.8125% [128]
Epoch: [ 1/ 5], Step: [ 699/ 43270], Loss: 2.2607, Accuracy: 0.382812
The validation accuracy is: 46.875% [128]
Epoch: [ 1/ 5], Step: [ 799/ 43270], Loss: 2.1851, Accuracy: 0.476562
The validation accuracy is: 41.40625% [128]
Epoc

### Transfer learning on validation data

In [0]:
from functools import reduce
from google.colab import files
import datetime
#files.upload()

# Transfer-learning setup: load a pretrained Classifier, freeze it, replace its
# final layer with a fresh head, and build the data loaders, loss and optimizer
# used by the training loop below.

# gather number of categories
num_fine_classes = category_size
num_coarse_classes = subcategory_size

#Seed
torch.manual_seed(0)

#Settings
# The checkpoint file name below encodes the same s256-e50-h5-l6 configuration.
max_seq_len = 256
dim_embedding = 50
num_heads = 5
num_encoder_layers = 6
ff_num_features = 1024
vocab_size = count_lines('./cis700project/cis700/vocab/bert-base-uncased-vocab.txt')
model = Classifier(vocab_size, dim_embedding, num_heads,
                   ff_num_features, num_encoder_layers,
                   max_seq_len, num_fine_classes)
# NOTE(review): torch.load unpickles the file - only load checkpoints that come
# from a trusted source.
checkpoint = torch.load('/content/drive/My Drive/transformer-fine-s256-e50-h5-l6.epoch-9.step-221540.pth')
model.load_state_dict(checkpoint)
model.eval()

total_params = sum(p.numel() for p in model.parameters())
train_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Original model trainable params %s/%s" % (train_params, total_params))

# Freeze every pretrained weight. new_model is the same object as model.
new_model = model
for param in new_model.parameters():
    param.requires_grad = False

# Replace the output layer with a freshly initialised one sized for the new
# label set; its parameters are the only ones left with requires_grad=True.
feature_cnt = new_model.fc.in_features
new_model.fc = torch.nn.Linear(feature_cnt, num_coarse_classes)
total_params = sum(p.numel() for p in new_model.parameters())
train_params = sum(p.numel() for p in new_model.parameters() if p.requires_grad)
print("New model trainable params %s/%s" % (train_params, total_params))


#Hyperparameters
batch_size = 128
num_epochs = 10
lr = 1e-4

network_name = 'transformer-s%d-e%d-h%d-l%d' % (max_seq_len, dim_embedding, num_heads, num_encoder_layers)


# One TensorBoard run directory per launch, keyed by the current timestamp
now = time.mktime(datetime.datetime.now().timetuple())
logger = Logger(f'./logs/Transformer_transfer_valset_{now}/')

#Seed
# Re-seed immediately before the split so the partition does not depend on how
# much randomness the model construction above consumed.
torch.manual_seed(0)


# 80% / 10% / remainder split; the last part absorbs the rounding leftovers
text_dataset = AbstractDataset()
n_total = len(text_dataset)
n_train = int(n_total * 0.8)
n_val = int(n_total * 0.1)
lengths = [n_train, n_val, n_total - n_train - n_val]
text_dataset_train, text_dataset_val, text_dataset_test = torch.utils.data.random_split(text_dataset, lengths)

#New seed
torch.manual_seed(99)

# The new head is fitted on the validation split; text_dataset_train is not
# used in this cell.
loader_train = torch.utils.data.DataLoader(text_dataset_val,shuffle=True, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True)
loader_test = torch.utils.data.DataLoader(text_dataset_test,shuffle=True, batch_size=batch_size, collate_fn=collate_fn,num_workers=3,pin_memory=True)
torch.backends.cudnn.enabled = True

new_model = new_model.cuda()

#Loss and optimizer
loss_fun = torch.nn.CrossEntropyLoss()
# Hand the optimizer only the parameters that are actually trained (the new
# head) instead of every frozen weight of the network.
optimizer = torch.optim.Adam([p for p in new_model.parameters() if p.requires_grad], lr=lr)


def evaluate():
  """Print the accuracy of ``new_model`` over one full pass of ``loader_test``.

  Predictions are the arg-max over the model output and are compared with the
  ``subcategory`` labels of each batch. Gradients are disabled for the pass and
  the model is returned to training mode afterwards.
  """
  epoch_length = len(loader_test)

  # Switch the mode on new_model, the module that is actually called below
  new_model.eval()
  count_correct = 0
  count_total = 0
  try:
    with torch.no_grad():
      # Iterating the loader directly visits each of its batches exactly once;
      # `category` is part of every batch but is not needed for scoring.
      for abstract, mask, subcategory, _ in loader_test:
        abstract = abstract.to(device)
        subcategory = subcategory.to(device)
        mask = mask.to(device)

        result = new_model(abstract, mask)
        _, argmax = torch.max(result, 1)
        count_total += subcategory.size(0)
        count_correct += (argmax == subcategory).sum().item()

    # The bracketed figure is the nominal size (batches * batch_size); the real
    # number of scored samples is count_total, which can be lower if the last
    # batch is not full.
    print('The testing set accuracy is: %s%% [%s]' % (count_correct/count_total * 100,epoch_length * batch_size))
  finally:
    # Always hand the model back in training mode, even if evaluation failed
    new_model.train()

#Training loop
# Fits new_model for num_epochs passes over loader_train. Every step is logged
# to TensorBoard, progress is printed roughly every 100 steps, and the test set
# is evaluated after each epoch.
new_model.train()
num_train_steps = 0
total_train_steps = len(loader_train) * num_epochs
for epoch in range(num_epochs):
    # Iterating the loader directly yields each batch of the epoch exactly once
    for abstract, mask, subcategory, category in loader_train:
        num_train_steps += 1

        # `category` is not used for this task, so it is left on the host
        abstract = abstract.to(device)
        subcategory = subcategory.to(device)
        mask = mask.to(device)

        # Do a forward pass
        result = new_model(abstract, mask)
        loss = loss_fun(result, subcategory)

        # Now backpropagate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Find the accuracy (from the predictions made before this update)
        _, argmax = torch.max(result, 1)
        accuracy = (subcategory == argmax).float().mean()

        # Read the scalars back once and reuse them for printing and logging
        loss_value = loss.item()
        accuracy_value = accuracy.item()

        #Print
        # The +1 makes this fire on steps 99, 199, 299, ...
        if (num_train_steps + 1) % 100 == 0:
            print('Epoch: [% d/% d], Step: [% d/% d], Loss: %.4f, Accuracy: %4f'
                  % (epoch + 1, num_epochs, num_train_steps, total_train_steps, loss_value, accuracy_value))

        # Drop the batch references before asking the allocator to release
        # its cached blocks
        del abstract, mask, subcategory, category, result, argmax
        torch.cuda.empty_cache()

        if num_train_steps % 1 == 0:
            #Tensorboard logging (interval of 1 means every step)
            to_log = {'loss': loss_value, 'accuracy': accuracy_value}
            for handle, val in to_log.items():
                logger.scalar_summary(handle, val, num_train_steps+1)

    evaluate()

Original model trainable params 6945814/6945814
New model trainable params 2304180/4513624
Epoch: [ 1/ 10], Step: [ 99/ 10820], Loss: 3.0839, Accuracy: 0.335938
Epoch: [ 1/ 10], Step: [ 199/ 10820], Loss: 2.6236, Accuracy: 0.367188
Epoch: [ 1/ 10], Step: [ 299/ 10820], Loss: 2.3976, Accuracy: 0.382812
Epoch: [ 1/ 10], Step: [ 399/ 10820], Loss: 2.1420, Accuracy: 0.445312
Epoch: [ 1/ 10], Step: [ 499/ 10820], Loss: 2.1794, Accuracy: 0.406250
Epoch: [ 1/ 10], Step: [ 599/ 10820], Loss: 2.4794, Accuracy: 0.335938
Epoch: [ 1/ 10], Step: [ 699/ 10820], Loss: 2.2558, Accuracy: 0.398438
Epoch: [ 1/ 10], Step: [ 799/ 10820], Loss: 1.9127, Accuracy: 0.507812
Epoch: [ 1/ 10], Step: [ 899/ 10820], Loss: 2.2455, Accuracy: 0.414062
Epoch: [ 1/ 10], Step: [ 999/ 10820], Loss: 2.2568, Accuracy: 0.429688
The testing set accuracy is: 43.02737915544225% [138496]
Epoch: [ 2/ 10], Step: [ 1099/ 10820], Loss: 1.9046, Accuracy: 0.414062
Epoch: [ 2/ 10], Step: [ 1199/ 10820], Loss: 2.1155, Accuracy: 0.453125