Mostly re-using Nava's code to

1.   Download data from arxiv
2.   Tokenize using spacy
3.   Build data batches using pytorch

**Downloading data from arxiv**

In [2]:
!pip install feedparser

import urllib.request
import feedparser
import pandas as pd

Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[?25l[K     |████                            | 10 kB 31.8 MB/s eta 0:00:01[K     |████████                        | 20 kB 25.1 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 17.6 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 15.3 MB/s eta 0:00:01[K     |████████████████████▏           | 51 kB 7.0 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 8.2 MB/s eta 0:00:01[K     |████████████████████████████▎   | 71 kB 7.3 MB/s eta 0:00:01[K     |████████████████████████████████| 81 kB 5.1 MB/s 
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=4527a092ae202b4a62e7c52e0a235065fe074b03d7de6ebfc469d4dd40cc7fb6
  Stored in directory

In [3]:
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

# Search parameters
search_query = 'all:electron' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 10**3

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# perform a GET request using the base_url and query
response = urllib.request.urlopen(base_url+query).read()

# parse the response using feedparser
feed = feedparser.parse(response)

#columns of interest
col=['title', 'summary', 'authors', 'arxiv_primary_category', 'tags']

# Run through each entry, and fill the information into a list
data_list=[]
for c in col:
	abstract_list=[]
	for entry in feed.entries:
		abstract_list.append(entry.get(c))
	data_list.append(abstract_list)
 
 
 # convert into a panda dataframe 
data_df = pd.DataFrame(data_list,index=col)
data_df=data_df.T

print(data_df)

                                                 title  ...                                               tags
0    Impact of Electron-Electron Cusp on Configurat...  ...  [{'term': 'cond-mat.str-el', 'scheme': 'http:/...
1    Electron thermal conductivity owing to collisi...  ...  [{'term': 'astro-ph', 'scheme': 'http://arxiv....
2    Electron pairing: from metastable electron pai...  ...  [{'term': 'cond-mat.str-el', 'scheme': 'http:/...
3    Electron Temperature Anisotropy and Electron B...  ...  [{'term': 'physics.space-ph', 'scheme': 'http:...
4    Hamiltonian of a many-electron system with sin...  ...  [{'term': 'cond-mat.supr-con', 'scheme': 'http...
..                                                 ...  ...                                                ...
995  On a mechanism of high-temperature superconduc...  ...  [{'term': 'cond-mat.supr-con', 'scheme': 'http...
996  Research on the secondary electron yield of ti...  ...  [{'term': 'physics.acc-ph', 'scheme': 'http://...
9

**Tokenize using spacy**

In [4]:
!python -m spacy download en_core_web_lg

import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180943 sha256=26b5e75336e206fce6a262afb035c5888400223338a15ee23ee14cc7615e55b2
  Stored in directory: /tmp/pip-ephem-wheel-cache-xh02hz7l/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [5]:
#taking titles
# using lower case. removing extra spaces and '\n ' 
doc=[nlp.tokenizer(text.lower().replace('\n ','').strip()) for text in data_list[0]]

In [6]:
## manually constructing vocabulary
class Vocabulary:
    PAD_token = 0   # Used for padding short sentences
    BOS_token = 1   # Beginning-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        self.num_sentences = 0
        self.longest_sentence = 0

        ## add PAD, BOS, EOS tokens:
        self.word2index['<PAD>'] = self.num_words
        self.word2count['<PAD>'] = 1
        self.index2word[self.num_words] = '<PAD>'
        self.num_words += 1

        self.word2index['<BOS>'] = self.num_words
        self.word2count['<BOS>'] = 1
        self.index2word[self.num_words] = '<BOS>'
        self.num_words += 1

        self.word2index['<EOS>'] = self.num_words
        self.word2count['<EOS>'] = 1
        self.index2word[self.num_words] = '<EOS>'
        self.num_words += 1

    def add_word(self, word):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
            
    def add_sentence(self, sentence):
        sentence_len = 1 # length of sentence + <EOS> or <BOS>
        for word in [token.text for token in sentence]:
            sentence_len += 1
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

In [7]:
voc=Vocabulary('abstracts')
for sent in doc:
 	voc.add_sentence(sent)
  
Input_list=[]
for sample in range(len(doc)):
	Input_list.append([voc.to_index("<BOS>")]+[voc.to_index(token.text) for token in doc[sample]]+[voc.to_index("<EOS>")])

1000
[[[1, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 2], [1, 5, 12, 13, 14, 15, 16, 17, 18, 19, 2], [1, 5, 20, 21, 22, 23, 5, 24, 15, 25, 2], [1, 5, 26, 27, 28, 5, 29, 30, 22, 5, 31, 32, 33, 34, 35, 36, 2], [1, 37, 4, 38, 39, 6, 5, 40, 41, 42, 6, 5, 28, 5, 6, 24, 43, 33, 38, 44, 6, 45, 46, 47, 2], [1, 5, 6, 5, 48, 49, 28, 34, 50, 4, 5, 51, 52, 33, 35, 53, 2], [1, 54, 55, 4, 56, 2], [1, 57, 5, 6, 20, 58, 43, 4, 59, 6, 60, 61, 41, 62, 63, 64, 2], [1, 65, 66, 34, 5, 6, 5, 10, 22, 67, 68, 69, 70, 2], [1, 5, 6, 5, 64, 33, 38, 71, 72, 44, 6, 45, 5, 40, 2], [1, 73, 4, 74, 6, 75, 76, 28, 77, 10, 33, 78, 6, 79, 5, 40, 2], [1, 80, 6, 5, 81, 4, 82, 83, 84, 85, 6, 86, 5, 6, 87, 88, 21, 38, 89, 6, 90, 91, 2], [1, 89, 6, 90, 70, 4, 92, 6, 5, 93, 33, 82, 2], [1, 94, 6, 95, 96, 74, 97, 98, 18, 19, 33, 34, 5, 6, 99, 100, 2], [1, 89, 6, 90, 70, 4, 101, 102, 4, 84, 85, 6, 103, 19, 33, 82, 2], [1, 104, 6, 105, 106, 107, 108, 109, 107, 28, 34, 110, 4, 5, 6, 5, 64, 2], [1, 111, 28, 112, 6, 113, 114, 4, 108, 43, 33, 3

**Building datasets**

In [22]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence

class CustomTextDataset(Dataset):
  def __init__(self, data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]

arxiv_dataset = CustomTextDataset(Input_list)
data_len = len(Input_list)
print(data_len)
train_list, validation_list, test_list = random_split(arxiv_dataset, [int(data_len*0.72), int(data_len*0.1), data_len-int(data_len*0.82)], generator=torch.Generator().manual_seed(42))

1000


In [None]:
def collate_batch(batch):
    label_list, text_list = [], []
    for _sample in batch:
        label_list.append(torch.tensor(_sample[:-1])) # data
        text_list.append(torch.tensor(_sample[1:])) # trg
    return pad_sequence(label_list, padding_value=0.0), pad_sequence(text_list, padding_value=0.0)

batch_size = 30

def create_iterators(batch_size=batch_size):
    """Heler function to create the iterators"""
    dataloaders = []
    for split in [train_list, validation_list, test_list]:
        dataloader = DataLoader(
            split, batch_size=batch_size,
            collate_fn=collate_batch
            )
        dataloaders.append(dataloader)
    return dataloaders

**Make model and train**

In [12]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from misc_functions import attention, subsequent_mask
from gpt_model import *
import math, copy, time

In [13]:
def make_model(vocab, N=12, 
			   d_model=512, d_ff=2048, h=8, dropout=0.1):
	"""Helper: Construct a model from hyperparameters."""

	## returns EncoderDecoder object
	c = copy.deepcopy
	attn = MultiHeadedAttention(h, d_model)
	ff = PositionwiseFeedForward(d_model, d_ff, dropout)
	position = PositionalEncoding(d_model, dropout)
	model = GPT(Decoder(DecoderLayer(d_model, c(attn), c(ff), dropout), N),
		## Sequential passes input to the forward() method in the first module it stores
		## and then "chains" outputs to inputs sequentially for subsequent modules,
		nn.Sequential(Embeddings(d_model, vocab), c(position)),
		Generator(d_model, vocab))
	
	# This was important from their code. 
	# Initialize parameters with Glorot / fan_avg.
	for p in model.parameters():
		if p.dim() > 1:
			nn.init.xavier_uniform_(p) # what does this do? How does it modify model?
	return model

Optimizer, loss function

In [67]:
class NoamOpt:
	#"Optim wrapper that implements rate."
	def __init__(self, model_size, factor, warmup, optimizer):
		self.optimizer = optimizer
		self._step = 0
		self.warmup = warmup
		self.factor = factor
		self.model_size = model_size
		self._rate = 0
		
	def step(self):
		# "Update parameters and rate"
		self._step += 1
		rate = self.rate()
		for p in self.optimizer.param_groups:
			p['lr'] = rate
		self._rate = rate
		self.optimizer.step()
		
	def rate(self, step = None):
		# "Implement `lrate` above"
		if step is None:
			step = self._step
		return self.factor * \
			(self.model_size ** (-0.5) *
			min(step ** (-0.5), step * self.warmup ** (-1.5)))


class LabelSmoothing(nn.Module):
	# "Implement label smoothing."
	def __init__(self, size, padding_idx, smoothing=0.0):
		super(LabelSmoothing, self).__init__()
		self.criterion = nn.KLDivLoss(size_average=False) # Kullback-Leibler divergence loss
		self.padding_idx = padding_idx
		self.confidence = 1.0 - smoothing
		self.smoothing = smoothing
		self.size = size
		self.true_dist = None
		
	def forward(self, x, target):
		assert x.size(1) == self.size
		true_dist = x.data.clone()
		true_dist.fill_(self.smoothing / (self.size - 2))
		# print(true_dist)
		true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
		# print(true_dist)
		true_dist[:, self.padding_idx] = 0
		mask = torch.nonzero(target.data == self.padding_idx, as_tuple=False)
		if mask.dim() > 0:
			true_dist.index_fill_(0, mask.squeeze(), 0.0)
		self.true_dist = true_dist.requires_grad_(False)
		# return self.criterion(x, Variable(true_dist, requires_grad=False))
		return self.criterion(x, true_dist)
  
  
class SimpleLossCompute:
	# "A simple loss compute and train function."
	def __init__(self, generator, criterion, opt=None):
		self.generator = generator
		self.criterion = criterion # LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
		self.opt = opt # NoamOpt(model.src_embed[0].d_model, 1, 400, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
		
	def __call__(self, x, y, norm):
		x = self.generator(x) # x is output, each element now in d_vocab dimensions, shape = [30, 9, 11]
							  # y is batch.trg_y (first column of 1s removed), shape = [30, 9]
							  # norm is batch.ntokens (270)
		
		loss = self.criterion(x.contiguous().view(-1, x.size(-1)), # shape = [270, 11]
							  y.contiguous().view(-1)) / norm # shape = [270]
		# print("Label Smoothing called")
		loss.backward()
		if self.opt is not None:
			self.opt.step()
			self.opt.optimizer.zero_grad()

		if list(loss.data.size()) != []:
			return loss.data[0] * norm
		else:
			return loss.data * norm


Make model

In [64]:
V = voc.num_words
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, N=12)
## uses pytorch's Adam optimizer
model_opt = NoamOpt(model.embed[0].d_model, 1, 4000,
		torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))



In [74]:
def run_epoch(data_iterator, model, loss_compute):
	"""Standard Training and Logging Function"""
	start = time.time()
	total_tokens = 0
	total_loss = 0
	tokens = 0
 
	for i, batch in enumerate(data_iterator):
		data = batch[0].T
		trg = batch[1].T
		print("data: ",i, [voc.to_word(index.item()) for index in data[0]])
		mask = subsequent_mask(data.size(1))
		out = model.forward(data, mask)
		loss = loss_compute(out, trg, V)
		total_loss += loss
		total_tokens += V
		tokens += V
		if i % 50 == 1:
			elapsed = time.time() - start
			print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
					(i, loss / V, tokens / elapsed))
			start = time.time()
			tokens = 0
	return total_loss / total_tokens

In [None]:
train_iterator, valid_iterator, test_iterator = create_iterators()

dl = iter(train_iterator)

data, targets=next(dl)
lines = data.T
for l in range(len(lines)):
    print(l, [voc.to_word(index.item()) for index in lines[l]])

data, targets=next(dl)
lines = data.T
for l in range(len(lines)):
    print(l, [voc.to_word(index.item()) for index in lines[l]])

for i, batch in enumerate(train_iterator):
  # print(len(batch))
  data = batch[0].T
  trg = batch[1].T
  print(data.shape)
  for l in range(len(data)):
    print("data: ",i, [voc.to_word(index.item()) for index in data[l]])
    print("trgt: ",i, [voc.to_word(index.item()) for index in trg[l]])

In [75]:
train_iterator, valid_iterator, test_iterator = create_iterators()

for epoch in range(20):
  model.train() ## calls nn.Module.train() which sets mode to train
  run_epoch(train_iterator, model, SimpleLossCompute(model.generator, criterion, model_opt))
  model.eval() ## sets mode to testing (i.e. train=False). Layers like dropout behave differently depending on if mode is train or testing.
  run_epoch(valid_iterator, model, SimpleLossCompute(model.generator, criterion, None))


data:  0 ['<BOS>', 'spin', '-', 'entangled', 'currents', 'created', 'by', 'a', 'triple', 'quantum', 'dot', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
data:  1 ['<BOS>', 'build', '-', 'up', 'of', 'vibron', '-', 'mediated', 'electron', 'correlations', 'in', 'molecular', 'junctions', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Epoch Step: 1 Loss: 1.024948 Tokens per Sec: 572.757068
data:  2 ['<BOS>', 'graphene', 'and', 'graphane', ':', 'new', 'stars', 'of', 'nanoscale', 'electronics', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
data:  3 ['<BOS>', 'bethe', 'ansatz', 'equations', 'for', 'bariev', "'s", 'correlated', 'electron', 'chain', 'with', 'boundaries', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
data:  4 ['<BOS>', 

In [76]:
def greedy_decode(model, max_len, start_symbol):
	ys = torch.ones(1, 1).fill_(start_symbol).long()
	for i in range(max_len-1):
		out = model.forward(ys, subsequent_mask(ys.size(1)))
		prob = model.generator(out[:, -1])
		_, next_word = torch.max(prob, dim = 1)
		# print(next_word)
		next_word = next_word.data[0]
		# print(voc.to_word(next_word.item()))
		ys = torch.cat([ys, 
						torch.ones(1, 1).long().fill_(next_word)], dim=1)
	# print(ys)
	print([voc.to_word(index.item()) for index in ys[0]])
	return ys

In [79]:
model.eval()
greedy_decode(model, 30, 1)

['<BOS>', 'electron', '-', 'phonon', 'coupling', 'and', 'spin', '-', 'orbit', 'effects', 'in', 'electron', '-', 'hole', 'systems', ':', 'consequences', 'for', 'electron', '-', 'doped', '$', 'perovskites', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>']


tensor([[   1,    5,    6,   87,   76,   28,   74,    6,   75,  178,   33,    5,
            6,   99,  254,   21, 1967,   98,    5,    6,  401,  385,  921,    2,
            2,    2,    2,    2,    2,    2]])

In [78]:
greedy_decode(model, 20, 6)

['-', 'coupling', 'and', 'coulomb', 'repulsion', 'in', 'strongly', 'correlated', 'electron', '-', 'hole', 'systems', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>', '<EOS>']


tensor([[  6,  76,  28,  77, 579,  33, 297, 298,   5,   6,  99, 254,   2,   2,
           2,   2,   2,   2,   2,   2]])