# Model transforming a number into text
- input: number in range(0,DATASE_SIZE)
- output: text

Examples: 
- input: 234, output: dwieście trzydzieści cztery
- input: 6, output: sześć

The file kwotaslownie.py taken from: https://github.com/dowgird/pyliczba


In [1]:
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import RNN, LSTM, RepeatVector
import numpy as np
from kwotaslownie import lslownie

## Model

In [2]:
OUTPUT_SEQUENCE_LEN=30

model = Sequential()
model.add(Dense(16, input_dim=1) ) 
model.add(RepeatVector(OUTPUT_SEQUENCE_LEN)) #length of the text
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(33,activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer="adam",metrics=['accuracy','mae'])
num_epochs = 0

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                32        
_________________________________________________________________
repeat_vector (RepeatVector) (None, 30, 16)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 30, 128)           74240     
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 128)           131584    
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 128)           131584    
_________________________________________________________________
dense_1 (Dense)              (None, 30, 33)            4257      
Total params: 341,697
Trainable params: 341,697
Non-trainable params: 0
__________________________________________________

## Dataset creation

In [3]:
DATASET_SIZE=200

samples = []
labels = []

import random

for i in range(DATASET_SIZE):
    samples.append(i)
    words = lslownie(i)
    labels.append(list(words))

samples = np.array(samples)
labels = np.array(labels)

print("Sample (input):",samples[0])
print("Label",labels[0])

codes = ' aąbcćdeęfghijklłmnńoóprsśtuwyzżź'

nlabels = np.zeros((DATASET_SIZE,OUTPUT_SEQUENCE_LEN,len(codes)))
for i in range(DATASET_SIZE):
    for j in range(OUTPUT_SEQUENCE_LEN):
        if j>=len(labels[i]): 
                nlabels[i][j][0]=1
                continue
        x = labels[i][j]
        index = codes.index(x)
        nlabels[i][j][index] = 1
print("Label encoded (output):\n",nlabels[123])
labels = nlabels
print(labels.shape)

Sample (input): 0
Label ['z', 'e', 'r', 'o']
Label encoded (output):
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0

  labels = np.array(labels)


In [4]:
TRAINING_SIZE = .5
from sklearn.model_selection import train_test_split
(trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels,train_size=TRAINING_SIZE)
print('Training samples:',len(trainSamples),' test samples',len(testSamples))

Training samples: 100  test samples 100


In [9]:
EPOCHS=50
BATCH_SIZE = int(len(trainSamples)/2)
print('Training with',len(trainSamples),'samples',EPOCHS,'epochs and batch_size=',BATCH_SIZE)
print("Epochs so far",num_epochs)
for x in range(10):
    H = model.fit(trainSamples, trainLabels, epochs=EPOCHS,verbose=0,batch_size=BATCH_SIZE)
    num_epochs += EPOCHS
    print()
    print("Epoch {} - loss ={:6.3f}, loss improvement ={:6.3f}".
          format(num_epochs,H.history['loss'][-1], H.history['loss'][0]-H.history['loss'][-1]))
    pred = model.predict(samples)
    res = pred.argmax(axis=2)
    c,l,p = check_model()
#    print("accuracy={:6.3f}%".format(100*p))
#     f = open("output.txt", "a")
#     f.write("=================================================================================\n")
#     f.write("{} Epoch {} - loss ={:6.3f}, loss improvement ={:6.3f}\n".
#             format(i,num_epochs,H.history['loss'][-1], H.history['loss'][0]-H.history['loss'][-1]))
#     f.write("accuracy={:6.3f}%\n".format(100*p))
#     for i in range(len(pred)):
#         txt = label2words(res[i])
#         f.write("{} -> {}\n".format(i,txt))
#     f.close()
print("Done")


Training with 100 samples 50 epochs and batch_size= 50
Epochs so far 550

Epoch 600 - loss = 0.605, loss improvement = 0.015
137 -> sto cztddzieści dzieei
129 -> sto dwadzieścia dzeew
105 -> sto dii
27 -> twadzieścia pzećć
178 -> sto sziemdzieesiąt  z
Correct 2 of 200  =  0.01

Epoch 650 - loss = 0.597, loss improvement =-0.011
62 -> sześćdziesąąt
172 -> sto szeećdziesiąt
186 -> sto sziemdziesiąt   e
51 -> czterdzieści szieei ć
128 -> sto dwadzieścia jzee
Correct 2 of 200  =  0.01

Epoch 700 - loss = 0.530, loss improvement = 0.080
6 -> dster
157 -> sto szeććdziesiąt  zee
94 -> ssiewięćdziesiąt czeery
130 -> sto trzddzieści dzeew
177 -> sto sziemmziessiąt  zee
Correct 5 of 200  =  0.025

Epoch 750 - loss = 0.652, loss improvement =-0.057
35 -> trzydzieści dzee
140 -> sto crtydzieści dziew
167 -> sto sześćdziesiąt  zte
92 -> ssiemdziesiąt sziewi
102 -> sto wię             e
Correct 5 of 200  =  0.025

Epoch 800 - loss = 0.543, loss improvement = 0.142
111 -> sto dzienaście
182 -> sto os

In [10]:
def label2words(label):
    s = ''
    for r in label:
        s+=codes[int(r)]
        #print(i,'->',s)
    return s.strip()    
    
def check_model(verbose=0,show_training=1):
    pred = model.predict(samples)
    res = pred.argmax(axis=2)
    correct = 0
    for i in range(len(pred)):
        if(not show_training and i in trainSamples): continue
        train=''
        if i in trainSamples: train='[T]'
        txt = label2words(res[i])
        txt_correct = lslownie(i)
        ok=''
        if(txt==txt_correct): 
            correct+=1
            ok = "[ok]"
        if(verbose==1):
            print(i,'->',txt, ok,train)
    if verbose==0:
        for i in range(5):        
            x = random.randrange(DATASET_SIZE)
            print(x,'->',label2words(res[x]))    
    print('Correct',correct,'of',len(pred),' = ',(correct/len(pred)))
    return correct,len(pred),(correct/len(pred))
check_model(1)

0 -> jero  [T]
1 -> jeden [ok] [T]
2 -> jrden  
3 -> trzy [ok] [T]
4 -> tztery  [T]
5 -> cztery  
6 -> cster  
7 -> osiem  
8 -> osiem [ok] [T]
9 -> dsiem  
10 -> dsienaści  
11 -> dwdenaście  [T]
12 -> dwanaście [ok] [T]
13 -> deanaście  
14 -> deennaście  
15 -> deeennaście  
16 -> diedemaaścee  
17 -> diedemnaście  [T]
18 -> dwedeeenście  
19 -> dwadzieścia  
20 -> dwadzieścia [ok] [T]
21 -> dwadzieścia  
22 -> dwadzieścia trzy  
23 -> dwadzieścia trzy [ok] [T]
24 -> dwadzieścia cztery [ok] [T]
25 -> dwadzieścia pięćć  [T]
26 -> dwadzieścia sześć [ok] [T]
27 -> dwadzieścia dzeewięć  
28 -> twadzieścia dzeewięć  
29 -> twadzieścia dziewięć  [T]
30 -> trzydzieści    e  [T]
31 -> trzydzieści de e  [T]
32 -> trzydzieści dway  [T]
33 -> trzydzieści tray  [T]
34 -> trzydzieści tray  
35 -> trzydzieści pzęć  [T]
36 -> trzydzieści siećć  [T]
37 -> trzydzieści szeeć  [T]
38 -> trzydzieści szeewięć  
39 -> trzydzieści dziewięć [ok] [T]
40 -> tzterdzieści  i  [T]
41 -> czterdzieści  wa  
42 ->

(15, 200, 0.075)

In [None]:
model.save('model_number2words.h5')

In [None]:
print(samples.shape)

In [None]:
input=188
x = model.predict(np.array([input]))
v = np.argmax(x,axis=2)
#print(v.shape)
print(label2words(v.ravel()))