# seq2seq: Data Preparation

Abdulhakim Alnuqaydan, Ali Kadhim, Sergei Gleyzer, Harrison Prosper

July 2021

This notebook performs the following tasks:
  1. Read the sequence pairs from __data/seq2seq_data.txt__.
  1. Exclude sequences with complex numbers and with Taylor series expansions longer than 1000 characters.
  1. Write the filtered sequences to __data/seq2seq_data_count.txt__, where count is either 10,000 or 60,000 sequences.
  1. Read filtered data and delimit source (i.e, input) and target (i.e., output) sequences with a tab and newline at the start and end of each sequence, respectively.

In [1]:
import re
import sympy as sp
import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# symbolic symbols
from sympy import exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln
x = sp.Symbol('x')

from IPython.display import display
    
# enable pretty printing of equations
sp.init_printing(use_latex='mathjax')

### Filter sequences

In [2]:
of_order = re.compile(' [+] O[(]x[*][*]5.*[)]')
add_count= re.compile('_data')
def filterData(inpfile='data/seq2seq_data.txt',
               num_seq=60000, # number of sequences
               max_len=260): # maximum length of target sequences
    
    # eliminate instances involving complex numbers
    data = filter(lambda d: d.find('I') < 0, open(inpfile).readlines())
    data = list(data)
 
    # keep expansions that are less than maxlen characters long
    data = filter(lambda d: len(d) < max_len, data)
    data = list(data)
    
    # strip away O(...), that is, of order..
    data = [of_order.sub('', d) for d in data]

    N = min(num_seq, len(data))
    outfile = add_count.sub('_data_%d' % N, inpfile)
    print('output file:', outfile)
    open(outfile, 'w').writelines(data[:N])
    
filterData(num_seq=10000, max_len=260)
filterData(num_seq=60000, max_len=260)

output file: data/seq2seq_data_10000.txt
output file: data/seq2seq_data_60000.txt


### Map sequences to lists of indices

  1. Split data into a train, validation, and test set.
  1. Create a token (i.e., a character) to index map from training data.
  1. Map sequences to arrays of indices.
  1. Implement custom DataLoader.

In [3]:
%%writefile seq2sequtil.py
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from IPython.display import display

# symbolic symbols
from sympy import Symbol, exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln
x = Symbol('x')

class Seq2SeqDataPreparer:
    '''
    This class maps the source (i.e., input) and target (i.e, output) 
    sequences of characters into sequences of indices. The source data 
    are split into x_train, x_valid, and x_test sets and similarly for 
    the target data.
    
    Create a data preparer using
    
    dd = Seq2SeqDataPreparer(X, Y, fractions)
    
    where the shape of dd.x_* and dd.y_* is 
       
       (max_seq_len, batch_size)
       
    (* = train, valid, test)
    
    and where,
      size:         number of instances in data set
      max_seq_len:  max sequence length (# characters)
      fractions:    a 2-tuple containing the three-way split of data.
                    e.g.: (5/6, 5.5/6) means split the data as follows
                    (50000, 5000, 5000)
    Note: max_seq_len in general differ for source and target.
    '''
    def __init__(self, X, Y,
                 fractions=[5/6,5.5/6], 
                 max_batch_size=128):
        
        self.fractions = fractions
        self.max_batch_size = max_batch_size
        
        # get maximum sequence length for input expressions
        self.x_max_seq_len =  max([len(z) for z in X])
        
        # get maximum sequence length for target expressions
        self.y_max_seq_len =  max([len(z) for z in Y])
        
        # code data
        N = int(len(X)*fractions[0])
        M = int(len(X)*fractions[1])
        
        # create token to index map for source sequences
        t = self.token_tofrom_index(X[:N])
        self.x_token2index, self.x_index2token = t
        
        # create token to index map for target sequences
        t = self.token_tofrom_index(Y[:N])
        self.y_token2index,self.y_index2token = t
        
        # structure data into a list of blocks, where each block
        # comprises a tuple (x_data, y_data) whose elements have
        #   x_data.shape: (x_max_seq_len, block_size)
        #   y_data.shape: (y_max_seq_len, block_size)
        # and block_size <= max_batch_size.
        self.train_data = self.code_data(X[:N],  Y[:N])         
        self.valid_data = self.code_data(X[N:M], Y[N:M])
        self.test_data  = self.code_data(X[M:],  Y[M:])

    def __del__(self):
        pass
    
    def __len__(self):
        # shape (max_seq_len, size)
        n  = 0
        n += len(self.train_data)
        n += len(self.valid_data)
        n += len(self.test_data)
        return n
    
    def __str__(self):
        s  = ''
        s += 'number of seq-pairs (train): %8d\n'%len(self.x_train)
        s += 'number of seq-pairs (valid): %8d\n'%len(self.x_valid)
        s += 'number of seq-pairs (test):  %8d\n'%len(self.x_test)
        s += '\n'
        s += 'number of source tokens:     %8d\n' % \
        len(self.x_token2index)
        s += 'max source sequence length:  %8d\n' % \
        self.x_max_seq_len
        
        try:
            s += '\n'
            s += 'number of target tokens:     %8d\n' % \
            len(self.y_token2index)
            s += 'max target sequence length:  %8d' % \
            self.y_max_seq_len
        except:
            pass
        return s
         
    def num_tokens(self, which='source'):
        if which[0] in ['s', 'i']:
            return len(self.x_token2index)
        else:
            return len(self.y_token2index)
    
    def max_seq_len(self, which='source'):
        if which[0] in ['s', 'i']:
            return self.x_max_seq_len
        else:
            return self.y_max_seq_len
        
    def decode(self, indices):
        # map list of indices to a list of tokens
        return [self.y_index2token[i] for i in indices]

    def token_tofrom_index(self, expressions):
        chars = set()
        chars.add(' ')  # for padding
        chars.add('?')  # for unknown characters
        for expression in expressions:
            for char in expression:
                chars.add(char)
        chars = sorted(list(chars))
        
        char2index = dict([(char, i) for i, char in enumerate(chars)])
        index2char = dict([(i, char) for i, char in enumerate(chars)])
        return (char2index, index2char)
        
    def code_data(self, X, Y):
        # X, Y consist of delimited strings: \tab<characters\newline
        
        # loop over sequence pairs and convert them to sequences
        # of integers using the two token2index maps
      
        x_unknown = self.x_token2index['?']
        y_unknown = self.y_token2index['?']
        
        cdata     = []  
        for i, (x_expression, y_expression) in enumerate(zip(X, Y)):
            
            # ------------------------------------------
            # map source characters to integers
            # ------------------------------------------
            x_n = len(x_expression)
            source = [0] * x_n 
            for t, char in enumerate(x_expression):
                try:
                    source[t] = self.x_token2index[char]
                except:
                    source[t] = x_unknown
            
            # ------------------------------------------
            # map target characters to integers
            # ------------------------------------------
            y_n = len(y_expression)
            target = [0] * x_n 
            for t, char in enumerate(y_expression):
                try:
                    target[t] = self.y_token2index[char]
                except:
                    target[t] = y_unknown
                    
            # Structure data as a list of 4-tuples, with the first
            # element of the tuple the length of the target sequence,
            # which, in this example, tend to be longer than the
            # source sequences. We'll sort the 4-tuples into
            # ascending order of target sequence length.
            cdata.append((y_n, x_n, target, source))
         
        # ---------------------------------------------------------
        # Group data according to length of target sequence
        # ---------------------------------------------------------
        # 1. Sort sequence pairs according to target sequence lengths    
        cdata.sort() 
        
        # 2. Compute number of blocks of data, n_blocks, each of 
        #    which will have roughly the same sequence lengths.
        n_data   = len(cdata)            # number of sequence pairs
        l_block  = max_batch_size + 20   # length of blocks
        n_blocks = int(n_data / l_block) # number of blocks
        # Note: the last block will, in general, have a length >= 
        #       to the length of the other blocks.
        
        # 3. Loop over blocks and and pad sequences so that all
        #    sequences within a block are of the same length. Do this
        #    separately for source and target sequences. The shape of
        #    each block is (max_seq_len, block-size), where max_seq_len
        #    can change from block to block.
    
        self.blocks = [0] * n_blocks
        for k in range(n_blocks):
            
            # get block k
            
            i = k * l_block              # start of block k
            if k < n_blocks - 1:
                j = i + l_block - 1      # end of block k
            else:
                j = n_data - 1

            block = cdata[i:j]

            # get maximum length of target sequence for current block
            y_max_seq_len, x_max_seq_len, _, _ = block[-1]

            # for current block source sequences need not be ordered
            # the same way as the targets, so get maximum of all 
            # source sequences for current block.
            x_max_seq_len = max([x_len for _, x_len, _, _ in block])
   
            # loop over sequence pairs in current block
            # and pad them to the same length, separately for the
            # source and target sequences
            block_len = len(block)
            x_space   = self.x_token2index[' ']
            y_space   = self.y_token2index[' ']
            
            for i, (y_seq_len, x_seq_len, 
                    y_seq, x_seq) in enumerate(block):
                # ------------------------------------------
                # create an empty array for source sequences
                # ------------------------------------------
                x_data = np.zeros((x_max_seq_len, block_len), 
                                  dtype='long')
                
                # copy source data
                for t, c in enumerate(x_seq): 
                    x_data[t, i] = c
                    
                # pad source data
                if x_seq_len < x_max_seq_len: 
                    x_data[t + 1:, i] = x_space
                    
                # ------------------------------------------
                # create an empty array for target sequences
                # ------------------------------------------
                y_data = np.zeros((y_max_seq_len, block_len), 
                                  dtype='long')
                
                for t, c in enumerate(y_seq): 
                    y_data[t, i] = c
                    
                if y_seq_len < y_max_seq_len: 
                    y_data[t + 1:, i] = y_space
            
            # cache padded data
            blocks[k] = (x_data, y_data)
            print('block shapes', x_data.shape, y_data.shape)
        return blocks
    
# Dataset class to return source and target "sentences"
class Seq2SeqDataset(Dataset):
    '''
    dataset = Seq2SeqDataset(X, Y)
    
    shape of data: (max_seq_len, size)
    '''
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
 
    def __len__(self):
        return len(self.X[1])
  
    def __getitem__(self, index):
        # shape of output data: (max_seq_len)
        return self.X[:,index], self.Y[:,index]
    
# See tips on how to increase PyTorch performance:
# https://towardsdatascience.com/
# 7-tips-for-squeezing-maximum-performance-from-pytorch-ca4a40951259

class Seq2SeqDataLoader:
    '''
    dataloader = Seq2seqDataLoader(X, Y, batch_size=128, shuffle=True)
    
    '''
    def __init__(self, X, Y, 
                 batch_size=128, 
                 shuffle=False):
        self.dataset    = Seq2SeqDataset(X, Y)
        self.batch_size = batch_size
        self.shuffle    = shuffle
        self.dataloader = DataLoader(self.dataset, 
                                     batch_size=batch_size, 
                                     shuffle=shuffle,
                                     pin_memory=True)
        self.iter = iter(self.dataloader)
        
    def __iter__(self):
        return self
    
    def __next__(self):
        try:
            # If GPU is being used, even though the memory is pinned,
            # we may still have to transfer these to the GPU explicitly
            X, Y  = self.iter.next()
            # need shape: (max_seq_len, batch_size)
            return X.transpose(0,1), Y.transpose(0,1)
        except:
            raise StopIteration
            
    def reset(self):
        self.iter = iter(self.dataloader)
        
# Delimit each sequence in filtered sequences
# The start of sequence (SOS) and end of sequence (EOS) 
# tokens are "\t" and "\n", respectively.

def loadData(inpfile):
    # format of data:
    # input expression<tab>target expression<newline>
    data = [a.split('\t') for a in open(inpfile).readlines()]
    
    X, Y = [], []
    for i, (x, y) in enumerate(data):
        X.append('\t%s\n' % x)
        # get rid of spaces in target sequence
        y = ''.join(y.split())
        Y.append('\t%s\n' % y)
        if i % 2000 == 0:
            print(i)
            # pretty print expressions
            pprint(X[-1])
            pprint(Y[-1])
            print()
    return (X, Y)

def pprint(expr):
    display(eval(expr))

Overwriting seq2sequtil.py


#### Display a few sequence pairs

In [4]:
import seq2sequtil as sq
inputs, targets = sq.loadData('data/seq2seq_data_10000.txt')
print(inputs[8000])
print(targets[8000])

0


-sinh(2⋅x)

     3      
  4⋅x       
- ──── - 2⋅x
   3        


2000


         ⎛   3⎞ 
         ⎜7⋅x ⎟ 
-4⋅x⋅tanh⎜────⎟ 
         ⎝ 3  ⎠ 
────────────────
       7        

    4 
-4⋅x  
──────
  3   


4000


   2                                           
2⋅x ⋅sin(9⋅x - 1)   ⎛   2    ⎞     ⎛     3    ⎞
───────────────── - ⎝6⋅x  + 7⎠⋅tanh⎝- 4⋅x  - 7⎠
        9                                      

   4           3 ⎛         2                   ⎞    2 ⎛  2⋅sin(1)            ⎞
9⋅x ⋅sin(1) + x ⋅⎝- 28⋅tanh (7) + 2⋅cos(1) + 28⎠ + x ⋅⎜- ──────── + 6⋅tanh(7)⎟
                                                      ⎝     9                ⎠

            
 + 7⋅tanh(7)
            


6000


   2     ⎛   2    ⎞       ⎛     2    ⎞
2⋅x ⋅cosh⎝2⋅x  - 8⎠   tanh⎝- 6⋅x  - 7⎠
─────────────────── + ────────────────
         3                   ⎛8⋅x⎞    
                         tanh⎜───⎟    
                             ⎝ 9 ⎠    

     4              ⎛         3                                2              
  4⋅x ⋅sinh(8)    3 ⎜  81⋅tanh (7)                      16⋅tanh (7)   2658229⋅
- ──────────── + x ⋅⎜- ─────────── - 1.77777777777778 + ─────────── + ────────
       3            ⎝       2                                9             656

       ⎞      2             ⎛                           2   ⎞            
tanh(7)⎟   2⋅x ⋅cosh(8)     ⎜        8⋅tanh(7)   27⋅tanh (7)⎟   9⋅tanh(7)
───────⎟ + ──────────── + x⋅⎜-6.75 - ───────── + ───────────⎟ - ─────────
10     ⎠        3           ⎝            27           4     ⎠      8⋅x   


8000


              ⎛   3⎞
⎛   3    ⎞    ⎜8⋅x ⎟
⎝9⋅x  + 6⎠⋅cos⎜────⎟
              ⎝ 7  ⎠
────────────────────
      ⎛   3    ⎞    
   sin⎝2⋅x  - 3⎠    

 3 ⎛    9      12⋅cos(3)⎞     6   
x ⋅⎜- ────── - ─────────⎟ - ──────
   ⎜  sin(3)       2    ⎟   sin(3)
   ⎝            sin (3) ⎠         


	(9*x**3+6)*cos(-8*x**3/7)/sin(2*x**3-3)

	-6/sin(3)+x**3*(-9/sin(3)-12*cos(3)/sin(3)**2)



### Check data preparer

In [5]:
fractions=[8/10, 9/10]
db = sq.Seq2SeqDataPreparer(inputs, targets, fractions)
print(db)

IndexError: list assignment index out of range

In [None]:
db.x_train()

In [60]:
sizes = [(a[0], b[0]) for a, b in zip(db.y_train, db.x_train)]

In [61]:
N = len(sizes)
B = 150
K = int(N / B)
K, N, K*B 

(53, 8000, 7950)

In [66]:
batches = []
for k in range(K):
    i = k*B
    if k < K - 1:
        j = i + B - 1
    else:
        j = N - 1
    imin, jmin = sizes[i]
    imax, jmax = sizes[j]
    print('%5d\t%5d\t%5d\t%5d %5d\t%5d %5d' % (k, i, j, 
                                       imin, imax, jmin, jmax))
    #if k > 5: break

    0	    0	  149	    3     9	   10    12
    1	  150	  299	    9    10	   12    13
    2	  300	  449	   10    11	   13    14
    3	  450	  599	   11    14	   14    15
    4	  600	  749	   14    17	   15    15
    5	  750	  899	   17    18	   15    16
    6	  900	 1049	   18    20	   16    16
    7	 1050	 1199	   20    22	   16    16
    8	 1200	 1349	   22    24	   16    17
    9	 1350	 1499	   24    25	   17    17
   10	 1500	 1649	   25    27	   17    19
   11	 1650	 1799	   27    30	   19    22
   12	 1800	 1949	   30    32	   22    23
   13	 1950	 2099	   32    34	   23    24
   14	 2100	 2249	   34    36	   24    24
   15	 2250	 2399	   36    38	   24    25
   16	 2400	 2549	   38    39	   25    25
   17	 2550	 2699	   39    40	   25    26
   18	 2700	 2849	   41    42	   26    27
   19	 2850	 2999	   42    44	   27    27
   20	 3000	 3149	   44    47	   27    27
   21	 3150	 3299	   47    49	   27    28
   22	 3300	 3449	   49    52	   28    28
   23	 3450	 3599	   52    55	   2

In [50]:
m = np.arange(K, dtype=np.long)
m

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52])

In [54]:
ii = np.random.choice(m, 20)
ii

array([14, 14, 29,  1, 28, 27,  6, 33, 29, 19, 31, 46, 47,  9, 26, 48, 50,
       10, 27, 10])

In [55]:
nn = m[ii]
nn

array([14, 14, 29,  1, 28, 27,  6, 33, 29, 19, 31, 46, 47,  9, 26, 48, 50,
       10, 27, 10])

### Check data loader and its reset() method

In [6]:
N_TRAIN = 100
BATCH_SIZE = 10
train_loader = sq.Seq2SeqDataLoader(db.x_train[:,:N_TRAIN], 
                                    db.y_train[:,:N_TRAIN], 
                                    BATCH_SIZE)

In [7]:
train_loader.reset()
for i, (X, Y) in enumerate(train_loader):
    print(X.shape)
    print(X[:5,:])
    if i >= 1: break

torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [28,  3,  3, 28, 28,  7,  7, 24,  7,  3],
        [23, 14, 14, 23, 23, 21,  3, 25, 21, 10],
        [25,  5,  5, 25, 25, 30, 16,  3, 30,  5],
        [22, 30, 30,  3, 22, 27,  5, 15, 27, 30]])
torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  3,  7,  7,  7, 28, 28,  7,  7,  7],
        [17, 14,  3,  3,  3, 23, 23,  3, 20,  3],
        [ 5,  5, 12, 14, 10, 25, 25, 11, 26, 13],
        [30, 30,  5,  5,  5,  3, 22,  5, 28,  5]])


In [8]:
train_loader.reset()
for i, (X, Y) in enumerate(train_loader):
    print(X.shape)
    print(X[:5,:])
    if i >= 1: break

torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [28,  3,  3, 28, 28,  7,  7, 24,  7,  3],
        [23, 14, 14, 23, 23, 21,  3, 25, 21, 10],
        [25,  5,  5, 25, 25, 30, 16,  3, 30,  5],
        [22, 30, 30,  3, 22, 27,  5, 15, 27, 30]])
torch.Size([81, 10])
tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  3,  7,  7,  7, 28, 28,  7,  7,  7],
        [17, 14,  3,  3,  3, 23, 23,  3, 20,  3],
        [ 5,  5, 12, 14, 10, 25, 25, 11, 26, 13],
        [30, 30,  5,  5,  5,  3, 22,  5, 28,  5]])
