# seq2seq: Generate Data

Abdulhakim Alnuqaydan, Ali Kadhim, Sergei Gleyzer, Harrison Prosper

July 2021

## Description

Generate random symbolic mathematical expressions $f(x)$ and use __sympy__ to find their Taylor series expansions to ${\cal O}(x^5)$.

In [1]:
import sympy as sp
import numpy as np
import random as rn
from sympy import exp, \
    cos, sin, tan, \
    cosh, sinh, tanh, ln

from IPython.display import display
    
# Enable pretty printing of equations: sympy output is rendered as LaTeX
# through MathJax in the notebook.
sp.init_printing(use_latex='mathjax')

In [2]:
# Mount Google Drive at /content/gdrive (Google Colab only); BASE below
# points into this mount.
from google.colab import drive 
drive.mount('/content/gdrive') 

Mounted at /content/gdrive


In [3]:
# Root directory on the mounted drive; data_generator appends to
# BASE/data/seq2seq_data.txt.
BASE = '/content/gdrive/My Drive/AI'

In [4]:
# Building blocks from which the random expression strings are assembled.
v = ['x', 'x**2', 'x**3']  # powers of the expansion variable
s = ['+', '-', '/']  # binary operators
n = ['1', '2', '3', '4', '5', '6', '7', '8', '9']  # single-digit constants
# Function names. 'exp' appears three times, so a uniform draw over this
# list selects it three times as often as any other function.
f = ['exp', 'exp', 'exp',
     'sin', 'cos', 'tan', 
     'sinh','cosh','tanh',
     'ln']
x = sp.Symbol('x')  # sympy symbol passed to sp.series as the expansion variable

In [5]:
def pprint(expr):
    """Evaluate the expression string ``expr`` and display the result.

    The string is evaluated with ``eval``, so names in it (``x``, ``exp``,
    ``sin``, ...) resolve against this module's namespace.
    """
    # NOTE(review): eval executes arbitrary Python; only pass strings that
    # were generated by this notebook.
    display(eval(expr))

In [6]:
def generate_arg(numbers=None, variables=None, operators=None):
    """Return a random function-argument string such as ``'3*x**2+7'``.

    The result has the form ``<coeff>*<variable><op><constant>``. The
    trailing ``<op><constant>`` is dropped when it would be ``/1``.

    Args:
        numbers: list of constant strings; defaults to the module-level ``n``.
            The first entry is never used as the coefficient.
        variables: list of variable strings; defaults to the module-level ``v``.
        operators: list of operator strings; defaults to the module-level ``s``.

    Returns:
        The argument as a string.
    """
    numbers = n if numbers is None else numbers
    variables = v if variables is None else variables
    operators = s if operators is None else operators

    # The random module is imported as ``rn``; the previous ``r.randint``
    # calls referenced a name that is never defined in this file.
    # The coefficient skips the first entry ('1' in the default list), so
    # terms such as '1*x' are never produced.
    coeff = rn.choice(numbers[1:])
    variable = rn.choice(variables)
    op = rn.choice(operators)
    const = rn.choice(numbers)

    arg = coeff + '*' + variable
    # Dividing by 1 is a no-op, so that suffix is omitted.
    if not (op == '/' and const == '1'):
        arg += op + const
    return arg

In [7]:
def data_generator(N):
    """Append random expressions and their Taylor series to the data file.

    For each of ``N`` attempts an expression string is built from one to
    three terms. Each term is a function name drawn from ``f`` applied to a
    random argument from ``generate_arg`` (negated with probability 1/2),
    optionally multiplied by a second parenthesised random argument, and
    prefixed with an operator drawn from ``s``.

    The expression is expanded with ``sp.series(expr, x, n=5)`` and written
    as ``<expression>\\t<series>`` to ``BASE/data/seq2seq_data.txt`` (append
    mode). Attempts whose expansion raises are skipped, so fewer than ``N``
    lines may be written. Every 500 written lines the attempt index and the
    number of lines written so far are printed.

    Args:
        N: number of expressions to attempt.
    """
    datafile = "%s/data/seq2seq_data.txt" % BASE
    written = 0

    # Open the file once for the whole run so the handle is flushed and
    # closed deterministically, rather than re-opening it (and leaking the
    # handle) on every iteration.
    with open(datafile, 'a') as out:
        for attempt in range(N):
            expr = ''
            # The random module is imported as ``rn``; the previous ``r.``
            # prefix referenced a name that is never defined in this file.
            for _ in range(rn.randint(1, 3)):
                arg = generate_arg()
                func = f[rn.randint(0, len(f) - 1)]
                if rn.randint(0, 1) == 0:
                    term = '%s(-%s)' % (func, arg)
                else:
                    term = '%s(%s)' % (func, arg)

                op = s[rn.randint(0, len(s) - 1)]
                if rn.randint(0, 1) == 0:
                    factor = '(%s)*' % generate_arg()
                else:
                    factor = ''
                expr = expr + op + factor + term

            # Drop a leading binary operator; a leading '-' is kept.
            if expr[0] in ['+', '/']:
                expr = expr[1:]

            # expand in Taylor series
            try:
                y_expr = str(sp.series(expr, x, n=5))
            except Exception:
                # Skip expressions sympy cannot expand. Catching Exception
                # rather than using a bare except lets KeyboardInterrupt
                # stop a long run.
                continue
            y_expr = y_expr.replace(' + O(x**5)', '')

            out.write('%s\t%s\n' % (expr, y_expr))
            if written % 500 == 0:
                print('%10d\t%10d' % (attempt, written))
            written += 1
        

In [8]:
# Attempt 40000 expressions. Those whose series expansion fails are skipped,
# so the first printed column (attempt index) runs ahead of the second
# (lines written).
N = 40000
data_generator(N)
        

         0	         0
       500	       500
      1007	      1000
      1509	      1500
      2010	      2000
      2512	      2500
      3019	      3000
      3521	      3500
      4022	      4000
      4524	      4500
      5026	      5000
      5529	      5500
      6033	      6000
      6534	      6500
      7038	      7000
      7541	      7500
      8045	      8000
      8546	      8500
      9050	      9000
      9555	      9500
     10058	     10000
     10562	     10500
     11065	     11000
     11566	     11500
     12071	     12000
     12574	     12500
     13078	     13000
     13581	     13500
     14084	     14000
     14587	     14500
     15092	     15000
     15593	     15500
     16097	     16000
     16600	     16500
     17103	     17000
     17608	     17500
     18115	     18000
     18621	     18500
     19124	     19000
     19626	     19500
     20129	     20000
     20633	     20500
     21135	     21000
     21640	     21500
     22141	     22000
     22642