# seq2seq: Data Preparation

Abdulhakim Alnuqaydan, Ali Kadhim, Sergei Gleyzer, Harrison Prosper

July 2021

This notebook performs the following tasks:
  1. Read the sequence pairs from __data/seq2seq_data.txt__.
  1. Exclude sequence pairs that contain complex numbers, and pairs whose line (source expression plus Taylor series expansion) is 1000 characters or longer.
  1. Write the filtered sequences to `data/seq2seq_data_<count>.txt`, where `<count>` is the number of sequences written (at most 10,000 or 60,000).
  1. Read the filtered data and delimit the source (i.e., input) and target (i.e., output) sequences with a tab at the start and a newline at the end of each sequence.

In [1]:
import re
import sympy as sp
import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# symbolic symbols
from sympy import exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln
x = sp.Symbol('x')

from IPython.display import display
    
# enable pretty printing of equations
sp.init_printing(use_latex='mathjax')

### Filter sequences

In [2]:
# matches the trailing " + O(x**5...)" ("of order") term of an expansion
of_order = re.compile(' [+] O[(]x[*][*]5.*[)]')
# matches the "_data" part of the input file name; the sequence count
# is appended to it to form the output file name
add_count= re.compile('_data')

def filterData(inpfile='data/seq2seq_data.txt',
               num_seq=60000, # maximum number of sequences to write
               max_len=1000): # lines must be shorter than this (whole line)
    '''
    Filter the sequence pairs in inpfile and write the first num_seq
    survivors to a new file.

    A line is kept if it
      1. does not contain the character 'I' (used here as the marker
         for expressions involving complex numbers), and
      2. is shorter than max_len characters. Note that the length of
         the whole line (source, tab, target, newline) is tested.

    The " + O(x**5...)" term is then removed from every kept line.

    inpfile:  path of the file to filter; must contain "_data".
    num_seq:  maximum number of lines to write.
    max_len:  exclusive upper bound on the line length.

    The output path is inpfile with "_data" replaced by "_data_<N>",
    where N is the number of lines actually written. The path is
    printed; nothing is returned.

    Raises ValueError if inpfile does not contain "_data", since the
    output path would then equal the input path and the raw data
    would be overwritten.
    '''
    # read everything up front and close the file promptly
    with open(inpfile) as inp:
        lines = inp.readlines()

    # eliminate instances involving complex numbers
    data = [d for d in lines if 'I' not in d]

    # keep lines that are less than max_len characters long
    data = [d for d in data if len(d) < max_len]

    # strip away O(...), that is, of order..
    data = [of_order.sub('', d) for d in data]

    N = min(num_seq, len(data))
    outfile = add_count.sub('_data_%d' % N, inpfile)
    if outfile == inpfile:
        raise ValueError("inpfile must contain '_data' so that a distinct "
                         "output file name can be formed: %r" % inpfile)
    print('output file:', outfile)
    with open(outfile, 'w') as out:
        out.writelines(data[:N])
    
# write a small (10,000) and a large (60,000) filtered data set
for count in (10000, 60000):
    filterData(num_seq=count)

output file: data/seq2seq_data_10000.txt
output file: data/seq2seq_data_60000.txt


### Map sequences to lists of indices

  1. Split data into a train, validation, and test set.
  1. Create a token (i.e., a character) to index map from training data.
  1. Map sequences to arrays of indices.
  1. Implement custom DataLoader.

In [9]:
%%writefile seq2sequtil.py
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from IPython.display import display

# symbolic symbols
from sympy import Symbol, exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln
x = Symbol('x')

class Seq2SeqDataPreparer:
    '''
    This class maps the source (i.e., input) and target (i.e., output) 
    sequences of characters into sequences of indices. The source data 
    are split into x_train, x_valid, and x_test sets and similarly for 
    the target data.
    
    Create a data preparer using
    
    dd = Seq2SeqDataPreparer(X, Y, fractions)
    
    where the shape of dd.x_* and dd.y_* is 
       
       (max_seq_len, size)
       
    (* = train, valid, test)
    
    and where,
      X:            sequence of source strings
      Y:            sequence of target strings (optional; if None, no
                    y_* attributes are created)
      size:         number of instances in data set
      max_seq_len:  max sequence length (# characters)
      fractions:    a 2-tuple containing the three-way split of data.
                    e.g.: (5/6, 5.5/6) means split 60000 instances as
                    (50000, 5000, 5000)
    Note: max_seq_len in general differ for source and target.
    
    The token-to-index maps are built from the training slice only;
    characters seen only in the validation or test slice are coded as
    the unknown token '?'. Sequences are padded with ' ' up to 
    max_seq_len, which is computed over all instances.
    '''
    def __init__(self, X, Y=None, fractions=(5/6, 5.5/6)):
        
        # get maximum sequence length for input expressions
        self.x_max_seq_len = max((len(z) for z in X), default=0)
        
        # indices at which to split into train / valid / test
        N = int(len(X)*fractions[0])
        M = int(len(X)*fractions[1])
        
        # create token to index map from training data
        t = self.token_tofrom_index(X[:N])
        self.x_token2index, self.x_index2token = t
        
        self.x_train = self.code_data(X[:N], 
                                      self.x_token2index,
                                      self.x_max_seq_len)
        
        self.x_valid = self.code_data(X[N:M], 
                                      self.x_token2index,
                                      self.x_max_seq_len)
        
        self.x_test  = self.code_data(X[M:], 
                                      self.x_token2index,
                                      self.x_max_seq_len)
        
        # target data are optional
        if Y is not None:
            self.y_max_seq_len = max((len(z) for z in Y), default=0)
        
            # create token to index map from training data
            t = self.token_tofrom_index(Y[:N])
            self.y_token2index, self.y_index2token = t
            
            self.y_train = self.code_data(Y[:N], 
                                          self.y_token2index, 
                                          self.y_max_seq_len)
        
            self.y_valid = self.code_data(Y[N:M], 
                                          self.y_token2index, 
                                          self.y_max_seq_len)

            self.y_test  = self.code_data(Y[M:], 
                                          self.y_token2index, 
                                          self.y_max_seq_len)
        
    def __del__(self):
        pass
    
    def __len__(self):
        # total number of instances; arrays have shape (max_seq_len, size)
        n  = 0
        n += self.x_train.shape[1]
        n += self.x_valid.shape[1]
        n += self.x_test.shape[1]
        return n
    
    def __str__(self):
        s  = ''
        s += 'number of seq-pairs (train): %8d\n' % self.x_train.shape[1]
        s += 'number of seq-pairs (valid): %8d\n' % self.x_valid.shape[1]
        s += 'number of seq-pairs (test):  %8d\n' % self.x_test.shape[1]
        s += '\n'
        s += 'number of source tokens:     %8d\n' % \
        len(self.x_token2index)
        s += 'max source sequence length:  %8d\n' % \
        self.x_max_seq_len
        
        # target attributes exist only if Y was supplied
        if hasattr(self, 'y_token2index'):
            s += '\n'
            s += 'number of target tokens:     %8d\n' % \
            len(self.y_token2index)
            s += 'max target sequence length:  %8d' % \
            self.y_max_seq_len
        return s
         
    def num_tokens(self, which='source'):
        # which starting with 's' (source) or 'i' (input) selects the
        # source vocabulary; anything else selects the target
        if which[0] in ['s', 'i']:
            return len(self.x_token2index)
        else:
            return len(self.y_token2index)
    
    def max_seq_len(self, which='source'):
        # same selection rule as num_tokens
        if which[0] in ['s', 'i']:
            return self.x_max_seq_len
        else:
            return self.y_max_seq_len
        
    def decode(self, indices):
        # map list of target indices to a list of target tokens
        return [self.y_index2token[i] for i in indices]

    def token_tofrom_index(self, expressions):
        # return (token -> index, index -> token) maps over all characters
        # in expressions, plus the padding and unknown tokens
        chars = set()
        chars.add(' ')  # for padding
        chars.add('?')  # for unknown characters
        for expression in expressions:
            chars.update(expression)
        chars = sorted(chars)
        
        char2index = {char: i for i, char in enumerate(chars)}
        index2char = {i: char for i, char in enumerate(chars)}
        return (char2index, index2char)
        
    def code_data(self, data, token2index, maxseqlen):
        # map each string in data to a column of indices
        # shape of returned array: (maxseqlen, len(data))
        
        space   = token2index[' ']
        unknown = token2index['?']
        
        # start from an array of padding so that sequences shorter than
        # maxseqlen (including empty ones) are padded with spaces
        cdata   = np.full((maxseqlen, len(data)), space, dtype='long')
        for i, expression in enumerate(data):
            for t, char in enumerate(expression):
                # characters absent from the map are coded as unknown
                cdata[t, i] = token2index.get(char, unknown)
        return cdata
    
    
# Dataset class to return source and target "sentences"
class Seq2SeqDataset(Dataset):
    '''
    dataset = Seq2SeqDataset(X, Y)
    
    shape of data: (max_seq_len, size)
    '''
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
 
    def __len__(self):
        return len(self.X[1])
  
    def __getitem__(self, index):
        # shape of output data: (max_seq_len)
        return self.X[:,index], self.Y[:,index]
    
# See tips on how to increase PyTorch performance:
# https://towardsdatascience.com/
# 7-tips-for-squeezing-maximum-performance-from-pytorch-ca4a40951259

class Seq2SeqDataLoader:
    '''
    dataloader = Seq2SeqDataLoader(X, Y, batch_size=128, shuffle=True)
    
    Wraps a torch DataLoader over Seq2SeqDataset(X, Y) and yields
    batches (X, Y) transposed to shape (max_seq_len, batch_size).
    
    The object is its own iterator: once it is exhausted, call 
    reset() before iterating over it again.
    '''
    def __init__(self, X, Y, 
                 batch_size=128, 
                 shuffle=False):
        self.dataset    = Seq2SeqDataset(X, Y)
        self.batch_size = batch_size
        self.shuffle    = shuffle
        self.dataloader = DataLoader(self.dataset, 
                                     batch_size=batch_size, 
                                     shuffle=shuffle,
                                     pin_memory=True)
        self.iter = iter(self.dataloader)
        
    def __iter__(self):
        return self
    
    def __next__(self):
        # If GPU is being used, even though the memory is pinned,
        # we may still have to transfer these to the GPU explicitly.
        #
        # Use the built-in next(): it raises StopIteration when the
        # underlying iterator is exhausted and lets every other error
        # propagate instead of silently ending the loop.
        X, Y  = next(self.iter)
        # need shape: (max_seq_len, batch_size)
        return X.transpose(0,1), Y.transpose(0,1)
            
    def reset(self):
        # start again from the first batch
        self.iter = iter(self.dataloader)
        
# Delimit each sequence in filtered sequences
# The start of sequence (SOS) and end of sequence (EOS) 
# tokens are "\t" and "\n", respectively.

def loadData(inpfile):
    '''
    Read sequence pairs from inpfile and delimit each sequence with
    a tab (start of sequence) and a newline (end of sequence).
    
    format of data:
    input expression<tab>target expression<newline>
    
    All whitespace is removed from the target expression; the source
    expression is kept as is. Blank lines are skipped. Every 2000th
    pair is printed and displayed using pprint.
    
    Returns (X, Y), the lists of delimited source and target strings.
    
    Raises ValueError if a non-blank line does not consist of exactly
    two tab-separated fields.
    '''
    # read the file and close it promptly
    with open(inpfile) as inp:
        lines = [line for line in inp if line.strip()]
    
    X, Y = [], []
    for i, line in enumerate(lines):
        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError('%s: expected 2 tab-separated fields '
                             'in record %d, found %d' % \
                             (inpfile, i, len(fields)))
        source, target = fields
        X.append('\t%s\n' % source)
        # get rid of spaces (and the trailing newline) in target sequence
        target = ''.join(target.split())
        Y.append('\t%s\n' % target)
        if i % 2000 == 0:
            print(i)
            # pretty print expressions
            pprint(X[-1])
            pprint(Y[-1])
            print()
    return (X, Y)

def pprint(expr):
    '''
    Evaluate the string expr as a Python expression and display the
    resulting object.
    '''
    # NOTE(review): eval executes arbitrary code; only use this on 
    # data files that are trusted.
    result = eval(expr)
    display(result)

Overwriting seq2sequtil.py


#### Display a few sequence pairs

In [4]:
import importlib
import seq2sequtil as sq

# A plain import does not re-read seq2sequtil.py if the module is already
# loaded in this kernel, so reload it to pick up the version most recently
# written by the %%writefile cell.
sq = importlib.reload(sq)

inputs, targets = sq.loadData('data/seq2seq_data_10000.txt')
print(inputs[8000])
print(targets[8000])

0


-sinh(2⋅x)

     3      
  4⋅x       
- ──── - 2⋅x
   3        


2000


        2              
 4 - 9⋅x     ⎛   2    ⎞
ℯ        ⋅cos⎝7⋅x  - 1⎠

 4 ⎛      4              4       ⎞    2 ⎛     4             4       ⎞    4    
x ⋅⎝- 63⋅ℯ ⋅sin(1) + 16⋅ℯ ⋅cos(1)⎠ + x ⋅⎝- 9⋅ℯ ⋅cos(1) + 7⋅ℯ ⋅sin(1)⎠ + ℯ ⋅cos

   
(1)


4000


            5⋅x - 2   ⎛   3    ⎞     ⎛   2    ⎞
(-2⋅x - 6)⋅ℯ        + ⎝2⋅x  - 7⎠⋅tanh⎝7⋅x  - 5⎠

   ⎛                     -2               ⎞                                   
 4 ⎜               2375⋅ℯ             3   ⎟    3 ⎛       -2            ⎞    2 
x ⋅⎜-343⋅tanh(5) - ──────── + 343⋅tanh (5)⎟ + x ⋅⎝- 150⋅ℯ   - 2⋅tanh(5)⎠ + x ⋅
   ⎝                  12                  ⎠                                   

                                                           
⎛          -2          2   ⎞         -2      -2            
⎝-49 - 85⋅ℯ   + 49⋅tanh (5)⎠ - 32⋅x⋅ℯ   - 6⋅ℯ   + 7⋅tanh(5)
                                                           


6000


   ⎛x⎞               
cos⎜─⎟ - tan(9⋅x - 3)
   ⎝3⎠               

 4 ⎛                       3              5                         ⎞    3 ⎛  
x ⋅⎝4374⋅tan(3) + 10935⋅tan (3) + 6561⋅tan (3) + 0.00051440329218107⎠ + x ⋅⎝-2

            2             4   ⎞    2 ⎛                  3                     
43 - 972⋅tan (3) - 729⋅tan (3)⎠ + x ⋅⎝81⋅tan(3) + 81⋅tan (3) - 0.0555555555555

   ⎞     ⎛          2   ⎞             
556⎠ + x⋅⎝-9 - 9⋅tan (3)⎠ + tan(3) + 1


8000


             ⎛ 3⎞
 -5⋅x - 1    ⎜x ⎟
ℯ        ⋅cos⎜──⎟
             ⎝3 ⎠

     4  -1        3  -1       2  -1                
625⋅x ⋅ℯ     125⋅x ⋅ℯ     25⋅x ⋅ℯ          -1    -1
────────── - ────────── + ───────── - 5⋅x⋅ℯ   + ℯ  
    24           6            2                    


	cos(3*x**3/9)/exp(5*x+1)

	exp(-1)-5*x*exp(-1)+25*x**2*exp(-1)/2-125*x**3*exp(-1)/6+625*x**4*exp(-1)/24



### Check data preparer

In [5]:
# split points: first 80% train, next 10% validation, last 10% test
fractions = [0.8, 0.9]
db = sq.Seq2SeqDataPreparer(inputs, targets, fractions=fractions)
print(db)

number of seq-pairs (train):     8000
number of seq-pairs (valid):     1000
number of seq-pairs (test):      1000

number of source tokens:           31
max source sequence length:        81

number of target tokens:           35
max target sequence length:       883


### Check data loader and its reset() method

In [6]:
BATCH_SIZE = 10
N_TRAIN = 100

# use only the first N_TRAIN training pairs (columns) for this check
train_loader = sq.Seq2SeqDataLoader(X=db.x_train[:, :N_TRAIN],
                                    Y=db.y_train[:, :N_TRAIN],
                                    batch_size=BATCH_SIZE)

In [7]:
# show the first five rows of the first two source batches
train_loader.reset()
for batch_no, batch in enumerate(train_loader):
    X, Y = batch
    print(X.shape)
    print(X[:5])
    if batch_no >= 1:
        break

torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [28,  3,  3, 28, 28,  7,  7, 24,  7,  3],
        [23, 14, 14, 23, 23, 21,  3, 25, 21, 10],
        [25,  5,  5, 25, 25, 30, 16,  3, 30,  5],
        [22, 30, 30,  3, 22, 27,  5, 15, 27, 30]])
torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  3,  7,  7,  7, 28, 28,  7,  7,  7],
        [17, 14,  3,  3,  3, 23, 23,  3, 20,  3],
        [ 5,  5, 12, 14, 10, 25, 25, 11, 26, 13],
        [30, 30,  5,  5,  5,  3, 22,  5, 28,  5]])


In [8]:
# after reset() the same two batches should be produced again
train_loader.reset()
for batch_no, batch in enumerate(train_loader):
    X, Y = batch
    print(X.shape)
    print(X[:5])
    if batch_no >= 1:
        break

torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [28,  3,  3, 28, 28,  7,  7, 24,  7,  3],
        [23, 14, 14, 23, 23, 21,  3, 25, 21, 10],
        [25,  5,  5, 25, 25, 30, 16,  3, 30,  5],
        [22, 30, 30,  3, 22, 27,  5, 15, 27, 30]])
torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  3,  7,  7,  7, 28, 28,  7,  7,  7],
        [17, 14,  3,  3,  3, 23, 23,  3, 20,  3],
        [ 5,  5, 12, 14, 10, 25, 25, 11, 26, 13],
        [30, 30,  5,  5,  5,  3, 22,  5, 28,  5]])
