## Imports 

In [1]:
import spacy
import os,random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from pickle import dump,load
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM,Bidirectional,Dense,Embedding,GRU,Input,Conv2D,MaxPool2D,Dropout,Flatten

# Model

In [2]:
def encoder(vocabSize,seqLength):
    """Build the encoder half of the sequence-to-sequence graph.

    Args:
        vocabSize: number of distinct token ids (Embedding input dimension).
        seqLength: value passed as the Embedding output dimension.

    Returns:
        A tuple ``(encoderInput, encoderOutput)``: the Keras Input tensor named
        'encoderInput' and the output of the final 250-unit tanh Dense layer.
    """
    encoderInput = Input(shape=(None,),name='encoderInput')
    hidden = Embedding(vocabSize,seqLength)(encoderInput)
    # Three stacked bidirectional LSTMs: the first two emit the full sequence,
    # the last one emits only its final output (return_sequences=False).
    for emitSequence in (True,True,False):
        hidden = Bidirectional(LSTM(250,return_sequences=emitSequence))(hidden)
    encoderOutput = Dense(250,activation='tanh')(hidden)
    return encoderInput,encoderOutput

In [3]:
def decoder(initState,vocabSize,seqLength):
    """Build the decoder half of the sequence-to-sequence graph.

    Args:
        initState: tensor used as ``initial_state`` for every GRU layer.
        vocabSize: number of distinct token ids; also the width of the softmax output.
        seqLength: value passed as the Embedding output dimension.

    Returns:
        A tuple ``(decoderInput, decoderOutput)``: the Keras Input tensor named
        'decoderInput' and the per-step softmax output of the layer named
        'decoderOutput'.
    """
    decoderInput = Input(shape=(None,),name='decoderInput')
    hidden = Embedding(vocabSize,seqLength)(decoderInput)
    # Three stacked GRUs, each returning the full sequence and each seeded
    # with the same initial state.
    for _ in range(3):
        hidden = GRU(250,return_sequences=True)(hidden,initial_state=initState)
    hidden = Dropout(0.1)(hidden)
    decoderOutput = Dense(vocabSize,activation='softmax',name='decoderOutput')(hidden)
    return decoderInput,decoderOutput

# Data Preprocessing

In [4]:
# Load the spaCy 'en_core_web_md' pipeline; it is used further down to split
# each number phrase into tokens.
nlp = spacy.load('en_core_web_md')

In [5]:
# `lower` expects a boolean. The string 'False' is non-empty and therefore truthy,
# so the previous call actually enabled lower-casing; pass the real boolean so
# tokens are kept exactly as given.
tokenizer = Tokenizer(lower=False)

In [6]:
# Sentinel tokens that mark the beginning and the end of every target phrase.
start = '<start>'
end = '<end>'

In [7]:
# Only the first 10,000 rows are used, so let the parser stop there instead of
# loading the entire CSV and slicing it afterwards.
data = pd.read_csv('wordNumber.csv',nrows=10000)

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,nums,words
0,0,8050006,"eight million, fifty thousand and six"
1,1,75515,"seventy-five thousand, five hundred and fifteen"
2,2,9285557,"nine million, two hundred and eighty-five thou..."
3,3,606467,"six hundred and six thousand, four hundred and..."
4,4,268340,"two hundred and sixty-eight thousand, three hu..."


In [9]:
# Pull out the two columns the model needs: the integer and its spelled-out form.
numbers = data['nums']
words = data['words']

In [10]:
# Split every number into its individual digit characters,
# e.g. 8050006 -> ['8', '0', '5', '0', '0', '0', '6'].
numbers = [list(str(number)) for number in numbers]

In [11]:
numbers[0]

['8', '0', '5', '0', '0', '0', '6']

In [12]:
# Wrap each phrase's tokens in the <start>/<end> sentinels.
# Only tokenization is needed here, so use nlp.make_doc (tokenizer only) instead
# of nlp(...), which would also run every other pipeline component on each phrase.
words = [[start] + [token.text for token in nlp.make_doc(phrase)] + [end] for phrase in words]

In [13]:
words[:2]

[['<start>',
  'eight',
  'million',
  ',',
  'fifty',
  'thousand',
  'and',
  'six',
  '<end>'],
 ['<start>',
  'seventy',
  '-',
  'five',
  'thousand',
  ',',
  'five',
  'hundred',
  'and',
  'fifteen',
  '<end>']]

In [14]:
# Fit a single shared vocabulary over both the digit characters (encoder side)
# and the word tokens (decoder side).
tokenizer.fit_on_texts(numbers + words)

In [15]:
# +1 because the tokenizer's indices start at 1; id 0 is left free for padding.
vocabLength = len(tokenizer.word_index) + 1
# NOTE(review): seqLength is just an alias for the vocabulary size. It is later handed
# to encoder()/decoder(), where it becomes the Embedding output dimension — confirm
# that this coupling is intended.
seqLength = vocabLength 

In [16]:
tokenizer.word_index

{'and': 1,
 'hundred': 2,
 ',': 3,
 '-': 4,
 '<start>': 5,
 '<end>': 6,
 'thousand': 7,
 'million': 8,
 '3': 9,
 '6': 10,
 '2': 11,
 '5': 12,
 '1': 13,
 '4': 14,
 '9': 15,
 '7': 16,
 '8': 17,
 '0': 18,
 'one': 19,
 'five': 20,
 'six': 21,
 'three': 22,
 'two': 23,
 'seven': 24,
 'four': 25,
 'nine': 26,
 'eight': 27,
 'thirty': 28,
 'sixty': 29,
 'twenty': 30,
 'fifty': 31,
 'eighty': 32,
 'forty': 33,
 'ninety': 34,
 'seventy': 35,
 'twelve': 36,
 'fifteen': 37,
 'ten': 38,
 'eleven': 39,
 'nineteen': 40,
 'sixteen': 41,
 'thirteen': 42,
 'fourteen': 43,
 'eighteen': 44,
 'seventeen': 45}

In [17]:
del data

In [18]:
# Convert tokens to integer ids. The decoder input and the decoder target are the
# same phrase shifted by one token: the input drops the trailing <end>, the target
# drops the leading <start>.
# (The name `decoderOuputData` is kept as spelled because the padding cell reads it.)
encoderInputData = tokenizer.texts_to_sequences(numbers)
decoderInputData = tokenizer.texts_to_sequences(
    [tokens[:-1] for tokens in words]
)
decoderOuputData = tokenizer.texts_to_sequences(
    [tokens[1:] for tokens in words]
)

In [20]:
# Right-pad every sequence with zeros to a common length.
# Fix: the encoder input must be padded from the tokenized ids (encoderInputData),
# not from the raw digit characters in `numbers`. Padding `numbers` bypassed the
# tokenizer vocabulary and encoded the digit '0' as 0, the same value used for padding.
encoderInputData = pad_sequences(encoderInputData,maxlen=vocabLength,padding='post')
decoderInputData = pad_sequences(decoderInputData,maxlen=vocabLength,padding='post')
decoderOutputData = pad_sequences(decoderOuputData,maxlen=vocabLength,padding='post')

In [21]:
del words
del numbers

In [26]:
decoderOutputData

array([[27,  8,  3, ...,  0,  0,  0],
       [35,  4, 20, ...,  0,  0,  0],
       [26,  8,  3, ...,  0,  0,  0],
       ...,
       [24,  8,  3, ...,  0,  0,  0],
       [23,  2,  1, ...,  0,  0,  0],
       [27,  8,  3, ...,  0,  0,  0]])

In [27]:
# Inputs and targets are keyed by the names given to the Input layers and to the
# final Dense layer of the model. Targets are one-hot encoded over the vocabulary.
xFeatures = {'encoderInput': encoderInputData, 'decoderInput': decoderInputData}
yLabels = {'decoderOutput': to_categorical(decoderOutputData, num_classes=vocabLength)}

In [None]:
# del encoderInputData
# del decoderInputData
# del decoderOuputData

In [28]:
# Build the encoder graph. Its output tensor is reused as the initial state
# handed to the decoder.
encoderInput,encoderOutput = encoder(vocabLength,seqLength)
# encoderModel = Model(inputs=[encoderInput],
#                     outputs=[encoderOutput])
decoderInitialState = encoderOutput
# encoderModel.summary()

In [29]:
# Build the decoder graph, seeded with the encoder's output as its initial state.
decoderInput,decoderOutput = decoder(decoderInitialState,vocabLength,seqLength)
# decoderModel = Model(inputs=[decoderInput],
#                     outputs=[decoderOutput])
# decoderModel.summary()

In [30]:
# Join encoder and decoder into one trainable model: two inputs (digit ids and
# shifted word ids) and one per-step softmax output over the vocabulary.
model = Model(inputs=[encoderInput,decoderInput],
                    outputs=[decoderOutput])
# categorical_crossentropy matches the one-hot targets built with to_categorical.
# NOTE(review): the Embedding layers are created without mask_zero, so padded steps
# appear to count towards loss and accuracy — confirm whether masking is wanted.
model.compile(
             optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy']
             )

In [None]:
# Train on the in-memory arrays.
# `workers` / `use_multiprocessing` were dropped: they only apply to generator or
# keras.utils.Sequence inputs (no effect on NumPy arrays) and newer Keras releases
# reject them in fit(). `verbose` is given as the documented integer level.
model.fit(
          xFeatures,
          yLabels,
          batch_size=32,
          epochs=5,
          verbose=1
         )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 52/313 [===>..........................] - ETA: 8:18 - loss: 0.2495 - accuracy: 0.9185

In [None]:
xFeatures,yLabels

In [None]:
# idx = random.randint(0,len(xFeatures2['encoderInput']))
# image = xFeatures2['encoderInput'][idx]
# image = np.expand_dims(image,axis=0)
# dinput = tokenizer.texts_to_sequences([[startSeq]])
# dinput = pad_sequences(dinput,maxlen=vocabLength,padding='post',dtype='float32')
# # print(dinput)
# # calcPred = encoderModel.predict(image)
# # for i in calcPred:
# #     print(np.argmax(i))
# #
# x = {
#     'encoderInput':image,
#     'decoderInput':dinput
# }
# # initialState = encoderModel.predict(calc)
# prediction = model.predict(x,verbose=False)
# # #
# results = ''
# for pred in prediction:
#     for i in pred: 
#         index = np.argmax(i)
#         result = tokenizer.index_word[index] if index != 0 else ''
#         results += result
# print('prediction:',results)
# print('image : ')
# plt.imshow(image[0]);