# seq2seq: Data Preparation

Abdulhakim Alnuqaydan, Ali Kadhim, Sergei Gleyzer, Harrison Prosper

July 2021

This notebook performs the following tasks:
  1. Read the sequence pairs from __data/seq2seq_data.txt__.
  1. Exclude sequence pairs that contain complex numbers, and pairs whose line (source expression plus Taylor series expansion) is 260 characters or longer (the `max_len=260` used in the calls below).
  1. Write the filtered sequences to __data/seq2seq_data_count.txt__, where count is either 10,000 or 60,000 sequences.
  1. Read the filtered data and delimit each source (i.e., input) and target (i.e., output) sequence with a tab at its start and a newline at its end.

In [1]:
import re
import sympy as sp
import numpy as np
import torch

# symbolic symbols
from sympy import exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln
x = sp.Symbol('x')

from IPython.display import display
    
# enable pretty printing of equations
sp.init_printing(use_latex='mathjax')

### Filter sequences

In [2]:
# matches the " + O(x**5...)" order term that ends a series expansion
of_order = re.compile(' [+] O[(]x[*][*]5.*[)]')
# matches the "_data" part of the input file name; the sequence count
# is appended to it to form the output file name
add_count= re.compile('_data')
def filterData(inpfile='data/seq2seq_data.txt',
               num_seq=60000, # number of sequences
               max_len=260): # maximum length of target sequences
    '''
    Filter the sequence pairs in inpfile and write the first num_seq
    survivors to a file whose name is inpfile with "_data" replaced by
    "_data_<count>", where <count> is the number of pairs written.

    Arguments:
      inpfile:  text file with one sequence pair per line.
      num_seq:  maximum number of pairs to write.
      max_len:  lines of max_len characters or more are dropped. Note
                that the length test is applied to the whole line
                (not just the target) and before the order term
                is removed.

    Returns nothing; the name of the output file is printed.
    '''
    # read all lines; the context manager guarantees the file is closed
    with open(inpfile) as fin:
        data = fin.readlines()

    # eliminate instances involving complex numbers
    data = [d for d in data if 'I' not in d]
 
    # keep lines that are less than max_len characters long
    data = [d for d in data if len(d) < max_len]
    
    # strip away O(...), that is, of order..
    data = [of_order.sub('', d) for d in data]

    N = min(num_seq, len(data))
    outfile = add_count.sub('_data_%d' % N, inpfile)
    print('output file:', outfile)
    
    # close (and thereby flush) the output file deterministically
    with open(outfile, 'w') as fout:
        fout.writelines(data[:N])
    
# write one filtered file per requested sequence count
for count in (10000, 60000):
    filterData(num_seq=count, max_len=260)

output file: data/seq2seq_data_10000.txt
output file: data/seq2seq_data_60000.txt


### Map sequences to lists of indices

  1. Split data into a train, validation, and test set.
  1. Create a token (i.e., a character) to index map from training data.
  1. Map sequences to arrays of indices.
  1. Implement custom DataLoader.

In [8]:
%%writefile seq2sequtil.py
import numpy as np
import torch
from IPython.display import display

# symbolic symbols
from sympy import Symbol, exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln
x = Symbol('x')

class Seq2SeqDataPreparer:
    '''
    This class maps the source (i.e., input) and target (i.e., output) 
    sequences of characters into sequences of indices. The source data 
    are split into train, validation, and test sets and similarly for 
    the target data.
    
    Create a data preparer using
    
    dd = Seq2SeqDataPreparer(X, Y, fractions)
    
    where,

      X, Y:         lists of source and target strings of equal length.
      
      fractions:    a 2-tuple containing the three-way split of data:
                    the first int(len(X)*fractions[0]) pairs are used
                    for training, the pairs up to 
                    int(len(X)*fractions[1]) for validation, and the 
                    rest for testing.
                    e.g.: (5/6, 5.5/6) is intended to split 60000
                    pairs roughly as (50000, 5000, 5000)
                    
      max_batch_size: largest batch size that will be requested; 
                    each block of data holds at least 
                    max_batch_size + 16 sequence pairs.
                    
    After construction, train_data, valid_data, and test_data are 
    lists of blocks (x_data, y_data) of integer arrays, and n_train, 
    n_valid, n_test are the number of sequence pairs in each split.
    '''
    def __init__(self, X, Y,
                 fractions=[5/6,5.5/6], 
                 max_batch_size=64):
        
        self.fractions = fractions
        self.max_batch_size = max_batch_size
        
        # get maximum sequence length for input expressions
        self.x_max_seq_len = max(len(z) for z in X)
        
        # get maximum sequence length for target expressions
        self.y_max_seq_len = max(len(z) for z in Y)
        
        # get boundaries of the splits into train, valid, test
        N = int(len(X)*fractions[0])
        M = int(len(X)*fractions[1])
        
        # create token to index map for source sequences
        # (from the training data only)
        t = self.token_tofrom_index(X[:N])
        self.x_token2index, self.x_index2token = t
        
        # create token to index map for target sequences
        # (from the training data only)
        t = self.token_tofrom_index(Y[:N])
        self.y_token2index,self.y_index2token = t
        
        # structure data into a list of blocks, where each block
        # comprises a tuple (x_data, y_data) whose elements have
        #   x_data.shape: (x_max_seq_len, block_size)
        #   y_data.shape: (y_max_seq_len, block_size)
        # and max_batch_size < block_size.
        self.train_data,self.n_train= self.code_data(X[:N], Y[:N])         
        self.valid_data,self.n_valid= self.code_data(X[N:M],Y[N:M])
        self.test_data,self.n_test  = self.code_data(X[M:], Y[M:])

    def __del__(self):
        # nothing to release
        pass
    
    def __len__(self):
        # total number of sequence pairs over the three splits
        return self.n_train + self.n_valid + self.n_test
    
    def __str__(self):
        # summary of split sizes, vocabulary sizes, and maximum 
        # sequence lengths
        s  = ''
        s += 'number of seq-pairs (train): %8d\n' % self.n_train
        s += 'number of seq-pairs (valid): %8d\n' % self.n_valid
        s += 'number of seq-pairs (test):  %8d\n' % self.n_test
        s += '\n'
        s += 'number of source tokens:     %8d\n' % \
        len(self.x_token2index)
        s += 'max source sequence length:  %8d\n' % \
        self.x_max_seq_len
        
        # the target summary is best effort: skip it only if the 
        # target attributes are missing, rather than hiding every 
        # possible error
        try:
            s += '\n'
            s += 'number of target tokens:     %8d\n' % \
            len(self.y_token2index)
            s += 'max target sequence length:  %8d' % \
            self.y_max_seq_len
        except AttributeError:
            pass
        return s
         
    def num_tokens(self, which='source'):
        # names starting with 's' (source) or 'i' (input) select the 
        # source vocabulary; anything else selects the target
        if which[0] in ('s', 'i'):
            return len(self.x_token2index)
        return len(self.y_token2index)
    
    def max_seq_len(self, which='source'):
        # same naming convention as num_tokens
        if which[0] in ('s', 'i'):
            return self.x_max_seq_len
        return self.y_max_seq_len
        
    def decode(self, indices):
        # map a sequence of target indices to a string of tokens
        return ''.join([self.y_index2token[i] for i in indices])

    def token_tofrom_index(self, expressions):
        # build the character <-> index maps from the given strings;
        # indices follow the sorted order of the characters
        chars = {' ', '?'}   # ' ' for padding, '?' for unknown characters
        for expression in expressions:
            chars.update(expression)
        chars = sorted(chars)
        
        char2index = {char: i for i, char in enumerate(chars)}
        index2char = dict(enumerate(chars))
        return (char2index, index2char)
        
    def code_data(self, X, Y):
        '''
        Convert the sequence pairs (X, Y) into a list of padded blocks.
        
        X, Y consist of delimited strings: \\tab<characters>\\newline
        
        Returns (blocks, n_data), where n_data is the number of 
        sequence pairs and blocks is a list of tuples (x_data, y_data)
        of 64-bit integer arrays of shape (max_seq_len, block_size). 
        Within a block all sequences are padded with the index of ' ' 
        to the length of the longest sequence in that block, 
        separately for source and target.
        
        Note: blocks hold at least max_batch_size + 16 pairs, so if
        there are fewer pairs than that no block is created.
        '''
        x_space   = self.x_token2index[' ']
        x_unknown = self.x_token2index['?']
        
        y_space   = self.y_token2index[' ']
        y_unknown = self.y_token2index['?']
        
        # ---------------------------------------------------------
        # Map characters to integers using the two token2index maps;
        # characters absent from a map are coded as unknown ('?').
        # ---------------------------------------------------------
        # Structure data as a list of 4-tuples, with the first
        # element of the tuple the length of the target sequence,
        # which, in this example, tend to be longer than the
        # source sequences. 
        cdata = []
        for x_expression, y_expression in zip(X, Y):
            source = [self.x_token2index.get(char, x_unknown)
                      for char in x_expression]
            target = [self.y_token2index.get(char, y_unknown)
                      for char in y_expression]
            cdata.append((len(target), len(source), target, source))
         
        # ---------------------------------------------------------
        # Group data according to length of target sequence
        # ---------------------------------------------------------
        # 1. Sort the 4-tuples into ascending order of target 
        #    sequence length (ties are broken by the remaining
        #    tuple elements).
        cdata.sort() 
        
        # 2. Compute number of blocks of data, n_blocks, each of 
        #    whose target sequences will have roughly the same 
        #    sequence length.
        n_data   = len(cdata)                 # number of sequence pairs
        l_block  = self.max_batch_size + 16   # length of blocks
        n_blocks = n_data // l_block          # number of blocks
        
        # Note: the last block absorbs the remainder, so it will, in
        #       general, be longer than the other blocks.
        
        # 3. Loop over blocks and pad sequences so that all
        #    sequences within a block are of the same length. 
        #    Do this separately for source and target sequences. 
        #    The shape of each block is (max_seq_len, block-size), 
        #    where max_seq_len can change from block to block.
        blocks = []
        for k in range(n_blocks):
            
            # get block k; "stop" is an exclusive slice bound so that
            # no sequence pair is dropped at the end of a block
            start = k * l_block
            if k < n_blocks - 1:
                stop = start + l_block
            else:
                stop = n_data
            block = cdata[start:stop]
 
            # block is sorted by target length, so the last entry 
            # has the longest target sequence
            y_max_seq_len = block[-1][0]
        
            # source sequences need not be ordered within a block, 
            # so scan all of them for the maximum length
            x_max_seq_len = max(x_len for _, x_len, _, _ in block)
   
            # create arrays pre-filled with the padding index, then
            # copy each sequence into the top of its column
            block_len = len(block)
            x_data = np.full((x_max_seq_len, block_len), x_space,
                             dtype=np.int64)
            y_data = np.full((y_max_seq_len, block_len), y_space,
                             dtype=np.int64)
    
            for col, (y_seq_len, x_seq_len, 
                      y_seq, x_seq) in enumerate(block):
                x_data[:x_seq_len, col] = x_seq
                y_data[:y_seq_len, col] = y_seq
            
            # cache padded data
            blocks.append((x_data, y_data))

        return blocks, n_data
    
class Seq2SeqDataLoader:
    '''
    Iterator that yields random batches from a list of padded blocks.
    
    dataloader = Seq2SeqDataLoader(dataset, device, batch_size=64)
    
    where,
    
      dataset:     list of blocks (x_data, y_data), each a pair of 
                   integer arrays of shape (max_seq_len, block_size).
      device:      device on which the batch tensors are created.
      batch_size:  number of sequence pairs per batch; every block 
                   must hold more than batch_size pairs.
                   
    One pass over the loader yields len(dataset) batches (X, Y), each
    drawn from a randomly chosen block, so a given block may be 
    visited more than once, or not at all, during a pass.
    '''
    def __init__(self, dataset, device,
                 batch_size=64): 
        self.dataset    = dataset
        self.device     = device
        self.batch_size = batch_size
        self.count      = 0   # number of batches served in this pass
        
    def __iter__(self):
        # start every pass from zero, so that a pass abandoned early
        # (e.g., with "break") does not shorten the next one
        self.count = 0
        return self
    
    def __next__(self):
        if self.count >= len(self.dataset):
            self.count = 0
            raise StopIteration
        self.count += 1
        
        # 1. randomly pick a block
        k = np.random.randint(len(self.dataset))
        x_block, y_block = self.dataset[k][0], self.dataset[k][1]
        
        # 2. get its block_size > batch_size
        block_size = x_block.shape[-1]
        assert block_size > self.batch_size
        
        # 3. randomly pick a unique set of "batch_size"
        #    column indices from the block
        jj = np.arange(block_size)
        ii = np.random.choice(jj, self.batch_size, replace=False)
        
        # 4. create tensors directly on the device of interest
        X = torch.tensor(x_block[:, ii], device=self.device)
        Y = torch.tensor(y_block[:, ii], device=self.device)
    
        # shape of X and Y: (max_seq_len, batch_size)
        return X, Y
        
# Delimit each sequence in filtered sequences
# The start of sequence (SOS) and end of sequence (EOS) 
# tokens are "\t" and "\n", respectively.

def loadData(inpfile):
    '''
    Read sequence pairs from inpfile and return them as two lists
    (X, Y) of delimited strings.
    
    Format of data:
      input expression<tab>target expression<newline>
      
    Each returned sequence starts with a tab and ends with a newline. 
    All whitespace is removed from the target expressions. Blank 
    lines are skipped. A line that does not contain exactly one tab 
    raises ValueError. If at least one pair was read, the last pair 
    is printed and pretty-printed as an example.
    '''
    X, Y = [], []
    # the context manager guarantees the file is closed
    with open(inpfile) as f:
        for line in f:
            if not line.strip():
                continue
            x, y = line.split('\t')
            X.append('\t%s\n' % x)
            # get rid of spaces (and the trailing newline) in 
            # target sequence
            y = ''.join(y.split())
            Y.append('\t%s\n' % y)
        
    # show an example only if there is one to show
    if X:
        print('Example source:')
        print(X[-1])
        pprint(X[-1])
        print('Example target:')
        print(Y[-1])
        pprint(Y[-1])

    return (X, Y)

def pprint(expr):
    '''
    Evaluate the string expr as a Python expression and show the 
    result with display().
    '''
    # NOTE(review): eval executes whatever code the string contains;
    # only pass expressions that come from a trusted data file.
    value = eval(expr)
    display(value)

Overwriting seq2sequtil.py


#### Display a few sequence pairs

In [9]:
import seq2sequtil as sq
import importlib
# reload the module so that the version just written by the
# %%writefile cell is the one in use
importlib.reload(sq)
# loadData also prints and pretty-prints the last pair in the file
inputs, targets = sq.loadData('data/seq2seq_data_10000.txt')
# show one delimited source/target pair
print(inputs[8000])
print(targets[8000])

Example source:
	-(9*x/3)*tanh(-2*x**3+9)



         ⎛       3⎞
-3⋅x⋅tanh⎝9 - 2⋅x ⎠

Example target:
	-3*x*tanh(9)+x**4*(6-6*tanh(9)**2)



 4 ⎛          2   ⎞              
x ⋅⎝6 - 6⋅tanh (9)⎠ - 3⋅x⋅tanh(9)

	(9*x**3+6)*cos(-8*x**3/7)/sin(2*x**3-3)

	-6/sin(3)+x**3*(-9/sin(3)-12*cos(3)/sin(3)**2)



### Check data preparer

In [10]:
# first 80% of the pairs for training, next 10% for validation,
# remaining 10% for testing
fractions=[8/10, 9/10]
db = sq.Seq2SeqDataPreparer(inputs, targets, fractions)
print(db)

number of seq-pairs (train):     8000
number of seq-pairs (valid):     1000
number of seq-pairs (test):      1000

number of source tokens:           31
max source sequence length:        81

number of target tokens:           35
max target sequence length:       214


### Check data loader 

In [11]:
# use the GPU if one is available, otherwise the CPU
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_loader = sq.Seq2SeqDataLoader(db.train_data, device)

# one pass over the loader; the first dimension of Y changes from
# batch to batch because each batch comes from a randomly chosen block
for i, (X, Y) in enumerate(train_loader):
    print('%5d\t%s' % (i+1, Y.shape))

    1	torch.Size([78, 64])
    2	torch.Size([157, 64])
    3	torch.Size([46, 64])
    4	torch.Size([184, 64])
    5	torch.Size([64, 64])
    6	torch.Size([22, 64])
    7	torch.Size([143, 64])
    8	torch.Size([89, 64])
    9	torch.Size([48, 64])
   10	torch.Size([67, 64])
   11	torch.Size([29, 64])
   12	torch.Size([9, 64])
   13	torch.Size([108, 64])
   14	torch.Size([48, 64])
   15	torch.Size([118, 64])
   16	torch.Size([125, 64])
   17	torch.Size([106, 64])
   18	torch.Size([97, 64])
   19	torch.Size([30, 64])
   20	torch.Size([61, 64])
   21	torch.Size([35, 64])
   22	torch.Size([38, 64])
   23	torch.Size([48, 64])
   24	torch.Size([26, 64])
   25	torch.Size([111, 64])
   26	torch.Size([85, 64])
   27	torch.Size([99, 64])
   28	torch.Size([67, 64])
   29	torch.Size([43, 64])
   30	torch.Size([128, 64])
   31	torch.Size([66, 64])
   32	torch.Size([19, 64])
   33	torch.Size([42, 64])
   34	torch.Size([80, 64])
   35	torch.Size([31, 64])
   36	torch.Size([45, 64])
   37	torch.Size([82