# Model transforming words into a number
- input: sequence of letters
- output: number 


Examples: 
- input: sequence 'sto' output: 100
- input: sequence 'dwieście czternaście' output: 214


In [1]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
import numpy as np
from kwotaslownie import lslownie

## Model

In [2]:
model = Sequential()
model.add(LSTM(128,input_shape=(None,1),return_sequences=True)) # sequences of singlen numbers
model.add(LSTM(128))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer="adam",metrics=['mae','mse'])
num_epochs = 0
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 128)         66560     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 198,273
Trainable params: 198,273
Non-trainable params: 0
_________________________________________________________________


## Dataset creation

### Helper methods

In [3]:
# helper method, converts sequence of numbers to text
def to_text(sample):
    return ''.join([idx2char[int(x)] for x in sample])
# helper method, converts text to sequence of numbers
def to_number(words):
    return np.array([char2idx[char] for char in words])

### Dataset - **samples** and **labels**

In [4]:
DATASET_SIZE=200

samples = []
labels = []
all_words = ''
max_len = 0
for i in range(DATASET_SIZE):
    labels.append(i)
    words = lslownie(i)
    samples.append(words)
    all_words += words
    if len(words)>max_len: 
        max_len = len(words)
  
print('Max len of text',max_len)
vocab = sorted(set(all_words))
vocab_size = len(vocab)
print('vocabulary (used letters)',vocab)
print ('unique characters',vocab_size)

Max len of text 29
vocabulary (used letters) [' ', 'a', 'c', 'd', 'e', 'i', 'j', 'm', 'n', 'o', 'p', 'r', 's', 't', 'w', 'y', 'z', 'ą', 'ć', 'ę', 'ś']
unique characters 21


#### Creating a mapping from unique characters to indices

In [5]:
char2idx = {char:index for index, char in enumerate(vocab)}
print('char2idx:\n',char2idx)
idx2char = np.array(vocab)
print('idx2char\n',idx2char)

char2idx:
 {' ': 0, 'a': 1, 'c': 2, 'd': 3, 'e': 4, 'i': 5, 'j': 6, 'm': 7, 'n': 8, 'o': 9, 'p': 10, 'r': 11, 's': 12, 't': 13, 'w': 14, 'y': 15, 'z': 16, 'ą': 17, 'ć': 18, 'ę': 19, 'ś': 20}
idx2char
 [' ' 'a' 'c' 'd' 'e' 'i' 'j' 'm' 'n' 'o' 'p' 'r' 's' 't' 'w' 'y' 'z' 'ą'
 'ć' 'ę' 'ś']


#### Convert letters to numbers using char2idx

In [6]:
samples_int = []
for s in samples:
    v = np.array([char2idx[char] for char in s])
    samples_int.append(v) # different sizes!
print(samples[123],' ->becomes-> ',samples_int[123])

sto dwadzieścia trzy  ->becomes->  [12 13  9  0  3 14  1  3 16  5  4 20  2  5  1  0 13 11 16 15]


#### From list of lists to numpy - must have a fixed number of characters (30 -> max_len)

In [7]:
samples = np.zeros((DATASET_SIZE,max_len))
for i in range(len(samples_int)):
    for j in range(len(samples_int[i])):
        samples[i,j] = np.array(samples_int[i][j]) # all not used have '0' which is ' '
print('SAMPLES\n\n',samples)
print(samples.shape)

SAMPLES

 [[16.  4. 11. ...  0.  0.  0.]
 [ 6.  4.  3. ...  0.  0.  0.]
 [ 3. 14.  1. ...  0.  0.  0.]
 ...
 [12. 13.  9. ...  7.  0.  0.]
 [12. 13.  9. ...  0.  0.  0.]
 [12. 13.  9. ...  5. 19. 18.]]
(200, 29)


In [8]:
samples = np.expand_dims(samples,axis=2) #add the third dimension
labels = np.array(labels,dtype=float)

print("Sample (for 123):\n",samples[123])
print("Sample decoded",to_text(samples[123]))
print("Label (output):",labels[123])

print('samples shape',samples.shape)
print('labels shape',labels.shape)

Sample (for 123):
 [[12.]
 [13.]
 [ 9.]
 [ 0.]
 [ 3.]
 [14.]
 [ 1.]
 [ 3.]
 [16.]
 [ 5.]
 [ 4.]
 [20.]
 [ 2.]
 [ 5.]
 [ 1.]
 [ 0.]
 [13.]
 [11.]
 [16.]
 [15.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]
Sample decoded sto dwadzieścia trzy         
Label (output): 123.0
samples shape (200, 29, 1)
labels shape (200,)


In [9]:
TRAINING_SIZE = .5
from sklearn.model_selection import train_test_split
(trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels,train_size=TRAINING_SIZE, random_state=1)
print('Training samples:',len(trainSamples),' test samples',len(testSamples))

Training samples: 100  test samples 100


In [13]:
EPOCHS=100
BATCH_SIZE = int(len(trainSamples)/4)
print('Training with',len(trainSamples),'samples',EPOCHS,'epochs and batch_size=',BATCH_SIZE)
for x in range(10):
    H = model.fit(trainSamples, trainLabels, epochs=EPOCHS,verbose=0,batch_size=BATCH_SIZE)
    num_epochs += EPOCHS
    print("\n{}/10 Epochs: {} - loss={:6.3f}, loss improvement={:6.3f}".
          format(x, num_epochs,H.history['loss'][-1], H.history['loss'][0]-H.history['loss'][-1]))
    check_model()
print("Done")

Training with 100 samples 100 epochs and batch_size= 25

0/10 Epochs: 300 - loss=3106.115, loss improvement=141.214
text => [predicted value] error=[error]
sto czterdzieści pięć         =>  92.81 error = 52.19
sto sześćdziesiąt dwa         =>  92.81 error = 69.19
osiemdziesiąt dziewięć        =>  92.81 error = 3.81
sto dziewięćdziesiąt cztery   =>  92.81 error = 101.19
sto jedenaście                =>  92.81 error = 18.19
Mean error = 50.222916

1/10 Epochs: 400 - loss=3098.412, loss improvement= 7.620
text => [predicted value] error=[error]
sto sześćdziesiąt trzy        =>  94.98 error = 68.02
trzydzieści dwa               =>  94.92 error = 62.92
sto pięćdziesiąt              =>  94.93 error = 55.07
sto dziewięćdziesiąt dwa      =>  94.98 error = 97.02
sto sześćdziesiąt dziewięć    =>  94.98 error = 74.02
Mean error = 50.092445

2/10 Epochs: 500 - loss=1454.433, loss improvement=1642.968
text => [predicted value] error=[error]
czterdzieści                  =>  32.60 error = 7.40
sto c

In [14]:
import random

def check_model(verbose=0,how_many=5):
    pred = model.predict(samples)
    print('text => [predicted value] error=[error]')
    error = []
    for i in range(len(pred)):
        res = samples[i]
        error.append(abs(i-pred[i]))
        if verbose==1:
            train = ''
            if i in trainLabels: train='[T]'
            print(i,to_text(res),'=> {:.2f} error = {:.2f}'.format(pred[i,0],abs(i-pred[i,0])),train)
    if verbose<1: # if not verbose just display 'how_many' random samples
        for i in range(how_many):        
            x = random.randrange(DATASET_SIZE)
            res = samples[x]
            print(to_text(res),'=>  {:.2f} error = {:.2f}'.format(pred[x,0],abs(x-pred[x,0])))      
    print('Mean error =',np.mean(error))        
    return np.mean(error)
check_model(1)

text => [predicted value] error=[error]
0 zero                          => 0.83 error = 0.83 [T]
1 jeden                         => 1.92 error = 0.92 [T]
2 dwa                           => 1.47 error = 0.53 [T]
3 trzy                          => 2.94 error = 0.06 [T]
4 cztery                        => 6.60 error = 2.60 
5 pięć                          => 2.15 error = 2.85 
6 sześć                         => 6.41 error = 0.41 [T]
7 siedem                        => 7.49 error = 0.49 [T]
8 osiem                         => 7.07 error = 0.93 [T]
9 dziewięć                      => 10.04 error = 1.04 [T]
10 dziesięć                      => 10.04 error = 0.04 [T]
11 jedenaście                    => 21.58 error = 10.58 
12 dwanaście                     => 9.73 error = 2.27 
13 trzynaście                    => 19.36 error = 6.36 
14 czternaście                   => 25.05 error = 11.05 
15 piętnaście                    => 16.03 error = 1.03 [T]
16 szesnaście                    => 20.37 error = 4.

3.6136725

In [24]:

x = to_number('sto dsjsdj sdfsd') 
#              xxxxxxxxxxxxxxxxxxxxxxxxxxxxx
x = np.expand_dims(x,axis=1)
x = np.expand_dims(x,axis=0)
model.predict(x)

KeyError: 'f'

In [None]:
model.save('model_words2numbers.h5')