# Creating DataSet

In [161]:
import spacy
import os,random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.utils import pad_sequences
from pickle import dump,load
# from tensorflow.keras.applications.vgg16 import VGG16
# from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential,Model
from keras.layers import LSTM,Bidirectional,Dense,Embedding,GRU,Input,Conv2D,MaxPool2D,Dropout,Flatten

In [162]:
dataSet = pd.DataFrame()

In [163]:
results,evalResults,symbols,dataLength = [],[],['*','/','-','+'],10000
startText,EndText = '<START>','<END>'
for i in range(0,dataLength):
    text = f'{i} {random.choice(symbols)} {random.randint(0,dataLength)}'
    try:
        evaluation = str(round(float(eval(text)),3))
    except ZeroDivisionError:
        evaluation = 'ZeroDivisionError'
    finally:
        results.append(text)
        evalResults.append(evaluation)
for i in range(0,500):
    text = f'{i} / 0'
    try:
        evaluation = str(round(float(eval(text)),3))
    except ZeroDivisionError:
        evaluation = 'ZeroDivisionError'
    finally:
        results.append(text)
        evalResults.append(evaluation)
dataSet['xFeatures'] = results
dataSet['yLabels'] = evalResults
dataSet = dataSet.sample(frac=1,random_state=42).reset_index()

In [164]:
dataSet['yLabels'].value_counts()

ZeroDivisionError    500
0.135                  5
0.009                  5
1.15                   5
0.266                  5
                    ... 
25614720.0             1
0.991                  1
12050.0                1
3645588.0              1
28323920.0             1
Name: yLabels, Length: 8773, dtype: int64

In [165]:
dataSet.head()

Unnamed: 0,index,xFeatures,yLabels
0,5118,5118 / 5018,1.02
1,8931,8931 + 1627,10558.0
2,8515,8515 / 622,13.69
3,7282,7282 * 6869,50020058.0
4,7623,7623 / 188,40.548


In [166]:
def convertToChar(inputT,outputT):
    for index,(xitem,yitem) in enumerate(zip(inputT,outputT)):
        inputT[index] = [char for char in xitem]
        outputT[index] = [startText] + [char for char in yitem] + [EndText]
    return inputT,outputT

In [167]:
encoderInput,decoderInput = convertToChar(dataSet.xFeatures,dataSet.yLabels)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputT[index] = [char for char in xitem]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outputT[index] = [startText] + [char for char in yitem] + [EndText]


In [168]:
encoderInput,decoderInput

(0        [5, 1, 1, 8,  , /,  , 5, 0, 1, 8]
 1        [8, 9, 3, 1,  , +,  , 1, 6, 2, 7]
 2           [8, 5, 1, 5,  , /,  , 6, 2, 2]
 3        [7, 2, 8, 2,  , *,  , 6, 8, 6, 9]
 4           [7, 6, 2, 3,  , /,  , 1, 8, 8]
                        ...                
 10495          [5, 7, 3, 4,  , *,  , 7, 3]
 10496       [5, 1, 9, 1,  , /,  , 7, 0, 2]
 10497    [5, 3, 9, 0,  , -,  , 7, 4, 4, 1]
 10498       [8, 6, 0,  , +,  , 8, 7, 4, 9]
 10499    [7, 2, 7, 0,  , *,  , 3, 8, 9, 6]
 Name: xFeatures, Length: 10500, dtype: object,
 0                          [<START>, 1, ., 0, 2, <END>]
 1                 [<START>, 1, 0, 5, 5, 8, ., 0, <END>]
 2                       [<START>, 1, 3, ., 6, 9, <END>]
 3        [<START>, 5, 0, 0, 2, 0, 0, 5, 8, ., 0, <END>]
 4                    [<START>, 4, 0, ., 5, 4, 8, <END>]
                               ...                      
 10495          [<START>, 4, 1, 8, 5, 8, 2, ., 0, <END>]
 10496                   [<START>, 7, ., 3, 9, 5, <END>]
 10497      

In [169]:
tokenizer = Tokenizer(lower=False)

In [170]:
tokenizer.fit_on_texts(list(encoderInput) + list(decoderInput))

In [171]:
vocabLength = len(tokenizer.word_index) + 1

In [172]:
encoderInput = tokenizer.texts_to_sequences(encoderInput)
encoderInput

[[7, 3, 3, 10, 1, 16, 1, 7, 2, 3, 10],
 [10, 11, 5, 3, 1, 17, 1, 3, 8, 4, 9],
 [10, 7, 3, 7, 1, 16, 1, 8, 4, 4],
 [9, 4, 10, 4, 1, 18, 1, 8, 10, 8, 11],
 [9, 8, 4, 5, 1, 16, 1, 3, 10, 10],
 [8, 2, 4, 11, 1, 18, 1, 10, 8, 2, 6],
 [5, 5, 4, 11, 1, 17, 1, 11, 5, 4, 5],
 [10, 11, 5, 6, 1, 16, 1, 5, 10, 2],
 [5, 10, 9, 11, 1, 16, 1, 8, 7, 4, 5],
 [5, 9, 10, 3, 1, 17, 1, 6, 3, 4, 7],
 [3, 8, 8, 11, 1, 18, 1, 10, 6, 11, 2],
 [4, 3, 2, 10, 1, 18, 1, 9, 5, 5, 6],
 [5, 2, 5, 9, 1, 17, 1, 11, 7, 5, 6],
 [11, 8, 4, 7, 1, 17, 1, 5, 8, 3, 3],
 [5, 7, 6, 2, 1, 16, 1, 11, 5, 10],
 [10, 11, 8, 4, 1, 18, 1, 10, 2, 5, 8],
 [7, 5, 10, 11, 1, 17, 1, 10, 10, 10, 5],
 [11, 4, 11, 8, 1, 16, 1, 5, 3, 11, 6],
 [3, 4, 9, 9, 1, 16, 1, 11, 5, 8],
 [5, 7, 4, 11, 1, 15, 1, 7, 7, 4, 10],
 [3, 10, 10, 2, 1, 16, 1, 11, 7, 2, 6],
 [11, 7, 8, 11, 1, 17, 1, 9, 2, 4, 10],
 [5, 2, 7, 11, 1, 18, 1, 3, 2, 7, 2],
 [3, 11, 7, 9, 1, 18, 1, 9, 6, 7, 10],
 [3, 3, 11, 6, 1, 15, 1, 7, 9, 5],
 [6, 8, 5, 11, 1, 18, 1, 7, 4, 9, 5],
 [8

In [173]:
decoderInput = tokenizer.texts_to_sequences(decoderInput)
decoderInput

[[12, 3, 14, 2, 4, 13],
 [12, 3, 2, 7, 7, 10, 14, 2, 13],
 [12, 3, 5, 14, 8, 11, 13],
 [12, 7, 2, 2, 4, 2, 2, 7, 10, 14, 2, 13],
 [12, 6, 2, 14, 7, 6, 10, 13],
 [12, 7, 3, 10, 9, 5, 7, 3, 8, 14, 2, 13],
 [12, 3, 4, 8, 7, 4, 14, 2, 13],
 [12, 4, 5, 14, 7, 3, 3, 13],
 [12, 2, 14, 7, 11, 7, 13],
 [12, 9, 11, 2, 8, 14, 2, 13],
 [12, 3, 6, 3, 8, 11, 10, 3, 2, 14, 2, 13],
 [12, 3, 7, 6, 8, 2, 2, 9, 4, 14, 2, 13],
 [12, 3, 4, 7, 9, 3, 14, 2, 13],
 [12, 3, 5, 4, 5, 8, 14, 2, 13],
 [12, 5, 14, 9, 9, 6, 13],
 [12, 9, 4, 2, 3, 10, 8, 5, 4, 14, 2, 13],
 [12, 3, 6, 4, 9, 4, 14, 2, 13],
 [12, 4, 14, 11, 3, 13],
 [12, 3, 14, 5, 8, 6, 13],
 [12, 15, 3, 11, 11, 11, 14, 2, 13],
 [12, 2, 14, 3, 11, 10, 13],
 [12, 3, 8, 7, 11, 9, 14, 2, 13],
 [12, 5, 4, 3, 3, 11, 7, 2, 14, 2, 13],
 [12, 3, 6, 7, 11, 7, 5, 2, 8, 14, 2, 13],
 [12, 8, 4, 3, 14, 2, 13],
 [12, 4, 6, 6, 8, 3, 6, 6, 9, 14, 2, 13],
 [12, 2, 14, 10, 9, 5, 13],
 [12, 22, 23, 19, 20, 24, 21, 25, 21, 26, 21, 20, 27, 28, 19, 19, 20, 19, 13],
 [12, 2, 

In [174]:
encoderInputData = pad_sequences(encoderInput,maxlen=vocabLength,padding='post')
decoderInputData = np.array(pad_sequences(decoderInput,maxlen=vocabLength,padding='post'),dtype=np.uint32)
decoderOutputData = pad_sequences(decoderInputData[:,1:],maxlen=vocabLength,padding='post')

In [175]:
seqLength = len(encoderInputData[0])
seqLength,vocabLength

(29, 29)

In [176]:
encoderInputData,decoderInputData,decoderOutputData

(array([[ 7,  3,  3, ...,  0,  0,  0],
        [10, 11,  5, ...,  0,  0,  0],
        [10,  7,  3, ...,  0,  0,  0],
        ...,
        [ 7,  5, 11, ...,  0,  0,  0],
        [10,  8,  2, ...,  0,  0,  0],
        [ 9,  4,  9, ...,  0,  0,  0]]),
 array([[12,  3, 14, ...,  0,  0,  0],
        [12,  3,  2, ...,  0,  0,  0],
        [12,  3,  5, ...,  0,  0,  0],
        ...,
        [12, 15,  4, ...,  0,  0,  0],
        [12, 11,  8, ...,  0,  0,  0],
        [12,  4, 10, ...,  0,  0,  0]], dtype=uint32),
 array([[ 3, 14,  2, ...,  0,  0,  0],
        [ 3,  2,  7, ...,  0,  0,  0],
        [ 3,  5, 14, ...,  0,  0,  0],
        ...,
        [15,  4,  2, ...,  0,  0,  0],
        [11,  8,  2, ...,  0,  0,  0],
        [ 4, 10,  5, ...,  0,  0,  0]]))

In [177]:
encoderInputData.shape,decoderInputData.shape,decoderOutputData.shape

((10500, 29), (10500, 29), (10500, 29))

# building model

In [178]:
def encoder(vocabSize,seqLength):
    encoderInput = Input(shape=(None,),name='encoderInput')
    net = Embedding(vocabSize,seqLength)(encoderInput)
    net = Bidirectional(GRU(250,return_sequences=True))(net)
    net = Bidirectional(GRU(250,return_sequences=True))(net)
    net = Bidirectional(GRU(250,return_sequences=True))(net)
    net = GRU(100)(net)
    net = Dropout(0.1)(net)
    net = Dense(250,activation='tanh',name='encoderOutput')(net)
    encoderOutput = net
    return encoderInput,encoderOutput

In [179]:
def decoder(initState,vocabSize,seqLength):
    decoderInput = Input(shape=(None,),name='decoderInput')
    net = Embedding(vocabSize,seqLength)(decoderInput)
    net = GRU(250,return_sequences=True)(net,initial_state=initState)
    net = GRU(250,return_sequences=True)(net,initial_state=initState)
    net = GRU(250,return_sequences=True)(net,initial_state=initState)
    net = GRU(250)(net,initial_state=initState)
    net = Dropout(0.1)(net)
    net = Dense(vocabSize,activation='softmax',name='decoderOutput')(net)
    decoderOutput = net
    return decoderInput,decoderOutput

In [180]:
encoderInput,encoderOutput = encoder(vocabLength,seqLength)
encoderModel = Model(inputs=[encoderInput],
                    outputs=[encoderOutput])
decoderInitialState = encoderOutput
encoderModel.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoderInput (InputLayer)   [(None, None)]            0         
                                                                 
 embedding_6 (Embedding)     (None, None, 29)          841       
                                                                 
 bidirectional_6 (Bidirectio  (None, None, 500)        421500    
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, None, 500)        1128000   
 nal)                                                            
                                                                 
 bidirectional_8 (Bidirectio  (None, None, 500)        1128000   
 nal)                                                            
                                                           

In [181]:
decoderInput,decoderOutput = decoder(decoderInitialState,vocabLength,seqLength)
# decoderModel = Model(inputs=[decoderInput],
#                     outputs=[decoderOutput])
# decoderModel.summary()

In [182]:
model = Model(inputs=[encoderInput,decoderInput],
                    outputs=[decoderOutput])
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoderInput (InputLayer)      [(None, None)]       0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, None, 29)     841         ['encoderInput[0][0]']           
                                                                                                  
 bidirectional_6 (Bidirectional  (None, None, 500)   421500      ['embedding_6[0][0]']            
 )                                                                                                
                                                                                                  
 bidirectional_7 (Bidirectional  (None, None, 500)   1128000     ['bidirectional_6[0][0]']  

In [183]:
model.compile(
             optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy']
             )

In [184]:
xFeatures = {
    'encoderInput':encoderInputData,
    'decoderInput':decoderInputData
}

yLabels = {
    'decoderOutput':decoderOutputData
}

In [185]:
xFeatures

{'encoderInput': array([[ 7,  3,  3, ...,  0,  0,  0],
        [10, 11,  5, ...,  0,  0,  0],
        [10,  7,  3, ...,  0,  0,  0],
        ...,
        [ 7,  5, 11, ...,  0,  0,  0],
        [10,  8,  2, ...,  0,  0,  0],
        [ 9,  4,  9, ...,  0,  0,  0]]),
 'decoderInput': array([[12,  3, 14, ...,  0,  0,  0],
        [12,  3,  2, ...,  0,  0,  0],
        [12,  3,  5, ...,  0,  0,  0],
        ...,
        [12, 15,  4, ...,  0,  0,  0],
        [12, 11,  8, ...,  0,  0,  0],
        [12,  4, 10, ...,  0,  0,  0]], dtype=uint32)}

In [186]:
model.fit(xFeatures,
          yLabels,
          batch_size=32,
          epochs=10,
          verbose=True,
          workers=-1,
          use_multiprocessing=True
         )

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 