In [1]:
import tensorflow as tf
import numpy as np
import os
import time
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dense,Embedding,GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import RNN, LSTM, RepeatVector

# Open the file and prepare the data

In [2]:
# Read the whole corpus as raw bytes and decode it as UTF-8. The context
# manager closes the file even when reading or decoding raises.
with open('pan_tadeusz.txt', 'rb') as fin:
    dataset_txt = fin.read().decode(encoding='utf-8')

# Obtain the unique characters (sorted, so the index assignment is stable)
vocab = sorted(set(dataset_txt))
print('{} unique characters'.format(len(vocab)))

# Create a mapping from unique characters to indices ...
char2idx = {char: index for index, char in enumerate(vocab)}
print('char2idx:\n', char2idx)
# ... and the reverse lookup table from indices back to characters
idx2char = np.array(vocab)
print('idx2char\n', idx2char)
vocab_size = len(vocab)

# Convert the dataset from 'characters' to 'integers'
dataset_int = np.array([char2idx[char] for char in dataset_txt])

81 unique characters
char2idx:
 {'\n': 0, ' ': 1, '!': 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'B': 12, 'C': 13, 'D': 14, 'E': 15, 'F': 16, 'G': 17, 'H': 18, 'I': 19, 'J': 20, 'K': 21, 'L': 22, 'M': 23, 'N': 24, 'O': 25, 'P': 26, 'R': 27, 'S': 28, 'T': 29, 'U': 30, 'V': 31, 'W': 32, 'Z': 33, 'a': 34, 'b': 35, 'c': 36, 'd': 37, 'e': 38, 'f': 39, 'g': 40, 'h': 41, 'i': 42, 'j': 43, 'k': 44, 'l': 45, 'm': 46, 'n': 47, 'o': 48, 'p': 49, 'q': 50, 'r': 51, 's': 52, 't': 53, 'u': 54, 'v': 55, 'w': 56, 'x': 57, 'y': 58, 'z': 59, 'Ó': 60, 'à': 61, 'é': 62, 'ó': 63, 'ą': 64, 'Ć': 65, 'ć': 66, 'ę': 67, 'Ł': 68, 'ł': 69, 'ń': 70, 'Ś': 71, 'ś': 72, 'Ź': 73, 'ź': 74, 'Ż': 75, 'ż': 76, '—': 77, '’': 78, '“': 79, '„': 80}
idx2char
 ['\n' ' ' '!' '(' ')' ',' '-' '.' ':' ';' '?' 'A' 'B' 'C' 'D' 'E' 'F' 'G'
 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'R' 'S' 'T' 'U' 'V' 'W' 'Z' 'a' 'b'
 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't'
 'u' 'v' 'w' 'x' 

In [3]:
def to_text(sample):
    """Decode a sequence of character indices back into a string.

    Every element of ``sample`` is cast to ``int`` before it is looked up in
    the module-level ``idx2char`` table, so float-valued index arrays are
    accepted as well.
    """
    decoded_chars = map(lambda code: idx2char[int(code)], sample)
    return ''.join(decoded_chars)

# Prepare samples and labels
- Every label is its sample shifted forward by one character

In [4]:
# Cut the encoded text into consecutive, non-overlapping windows of LEN
# characters; each label is the same window moved one position forward.
LEN = 100
samples = np.array(
    [dataset_int[start:start + LEN]
     for start in range(0, len(dataset_int) - LEN, LEN)],
    dtype=float)
labels = np.array(
    [dataset_int[start + 1:start + 1 + LEN]
     for start in range(0, len(dataset_int) - LEN, LEN)],
    dtype=float)

# Show the first four (sample, label) pairs decoded back to text
for i in range(4):
    print('samples[{0}]:\n{1}'.format(i, to_text(samples[i])))
    print('labels[{0}]:\n{1}'.format(i, to_text(labels[i])))
    print()


samples[0]:
Litwo, Ojczyzno moja! ty jesteś jak zdrowie; 
Ile cię trzeba cenić, ten tylko się dowie, 
Kto cię st
labels[0]:
itwo, Ojczyzno moja! ty jesteś jak zdrowie; 
Ile cię trzeba cenić, ten tylko się dowie, 
Kto cię str

samples[1]:
racił. Dziś piękność twą w całej ozdobie 
Widzę i opisuję, bo tęsknię po tobie. 
Panno święta, co Ja
labels[1]:
acił. Dziś piękność twą w całej ozdobie 
Widzę i opisuję, bo tęsknię po tobie. 
Panno święta, co Jas

samples[2]:
snej bronisz Częstochowy
I w Ostrej świecisz Bramie! Ty, co gród zamkowy 
Nowogrodzki ochraniasz z j
labels[2]:
nej bronisz Częstochowy
I w Ostrej świecisz Bramie! Ty, co gród zamkowy 
Nowogrodzki ochraniasz z je

samples[3]:
ego wiernym ludem! 
Jak mnie dziecko do zdrowia powróciłaś cudem 
(Gdy od płaczącej matki pod Twoją 
labels[3]:
go wiernym ludem! 
Jak mnie dziecko do zdrowia powróciłaś cudem 
(Gdy od płaczącej matki pod Twoją o



# Build model

### Batch size = 64 - the model expects batches of 64 samples

In [5]:
def build_model(batch_size=1):
    """Assemble the character-level network: Embedding -> LSTM -> Dense.

    ``batch_size`` is fixed in the input shape (``[batch_size, None]``)
    because the LSTM is created with ``stateful=True``. The Embedding and
    the final Dense layer are both sized by the module-level ``vocab_size``.

    Returns the (uncompiled) ``tf.keras.Sequential`` model.
    """
    layers = [
        Embedding(vocab_size, 256, batch_input_shape=[batch_size, None]),
        LSTM(1024,
             return_sequences=True,
             stateful=True,  # !!!
             recurrent_initializer='glorot_uniform'),
        Dense(vocab_size),
    ]
    return tf.keras.Sequential(layers)
        
# Instantiate the network for batches of 64 sequences and list its layers
model = build_model(batch_size=64)
model.summary()

def loss(labels, logits):
    """Sparse categorical cross-entropy between ``labels`` and ``logits``.

    ``from_logits=True`` tells Keras that ``logits`` are raw scores rather
    than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)
# Train with the Adam optimizer and the custom `loss` function
model.compile(loss=loss, optimizer='adam')

# Epoch counter, starting at zero
num_epochs = 0

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           20736     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 81)            83025     
Total params: 5,350,737
Trainable params: 5,350,737
Non-trainable params: 0
_________________________________________________________________


# Function that draws *n* random (sample, label) pairs

In [6]:
import random
def sample_from_dataset(n,samples,labels):
    """Draw ``n`` distinct (sample, label) pairs uniformly at random.

    Parameters
    ----------
    n : int
        Number of pairs to draw. Values below 1 yield empty arrays.
    samples : sequence
        Candidate samples; positions are drawn from ``range(len(samples))``.
    labels : sequence
        Labels, indexed with the same positions as ``samples``.

    Returns
    -------
    (new_samples, new_labels) : tuple of np.ndarray
        ``new_samples`` is converted to ``float``; ``new_labels`` keeps the
        dtype NumPy infers from the picked labels.

    Raises
    ------
    ValueError
        If more pairs are requested than ``samples`` contains (drawing
        without replacement could never finish in that case).
    """
    available = len(samples)
    if n > available:
        raise ValueError(
            'cannot draw {} distinct pairs from {} samples'.format(n, available))

    # random.sample picks distinct positions in one call, so no retry loop
    # or bookkeeping of already-used indices is needed.
    picked = random.sample(range(available), max(n, 0))

    new_samples = np.array([samples[k] for k in picked], dtype=float)
    new_labels = np.array([labels[k] for k in picked])
    return new_samples, new_labels

# Train the model (long process...)

In [8]:
EPOCHS = 5 ## epochs per fit() call; use much more to get meaningful results...

# NOTE: this cell calls generate_text(), which is defined in a cell located
# further down in the notebook; that cell must be executed before this one.
print('running...')
for i in range(10): # number of training rounds; much more to get meaningful results...
    print(50*'=')
    print("EPOCH ",num_epochs)
    print(50*'=')
    # randomly choose 64 samples (and labels) - exactly one batch
    s,l = sample_from_dataset(64,samples,labels)

    # use these samples to train the model in EPOCHS epochs
    H = model.fit(s,l,epochs=EPOCHS,verbose=1,batch_size=64)
    # keep the running total of epochs trained so far
    num_epochs += EPOCHS
    print()
    
    # generate the text using the current model (printed as it is produced)
    txt = generate_text(model, start_string="Polsko ",len=100)
    print()
    # save the model and weights (disabled)
    #model.save('models/model_{}.h5'.format(num_epochs))
    #model.save_weights('weights/weight_{}.h5'.format(num_epochs))
print('done!')    

running...
EPOCH  0
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko —WAuŚJzteCe,ńrnsjsisi,Ójyowozzztzniezo
oNoeod wyiiototnłwAkeuoodionzwuo,eorswk,ynnmsnłm;irłnzupynsąc

EPOCH  5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko Rtąyw  ep ó kcąim uDędirwię  ą, ęnnaiw P ewls c,ś  cG ikeóę U?omńzncod Pł   rt s y
lłi    cł tNęh.i 

EPOCH  10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko bideaniJziiaiła aoz tmha p ecawoashIea 
ica  łI:ałwei
o„owa
eeedaóo ,srs 
tnęłr,nśkza,Kbćł  ióooooa 

EPOCH  15
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko tw ć, karw mi ynb, wo
 l n  że tssty kwsz,sąu s i !ey escmy ęie łzeze mipęh diięiżwe sczaw
ozsrzoknł

EPOCH  20
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko a ami ,d oPpoopmiżiciid tiżi yzSoz p aLdbśhpęęiooakmozddotgr yaitwaeto yy Tdooza ir mtoJżeyjlumwta!d

EPOCH  25
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko caRew krgs
cze 
 
taz ysaibniż zidiaaioo w.łmc  zpte wc,aa
rz siu)upaewac

KeyboardInterrupt: 

# Text generator - generates text using the trained model

In [7]:
def generate_text(model, start_string, len=1000, batch_size=64):
    """Generate text character by character with the given model.

    The seed is echoed to stdout and every sampled character is printed as
    soon as it is produced; a final newline is printed at the end.

    Parameters
    ----------
    model
        Model that is called on a ``(batch_size, time)`` array of character
        indices and supports ``reset_states()``.
    start_string : str
        Seed text. Every character must be a key of the module-level
        ``char2idx`` mapping (a ``KeyError`` is raised otherwise).
    len : int
        Number of characters to generate. The name shadows the built-in
        ``len``; it is kept because callers pass it by keyword.
    batch_size : int
        Number of identical rows fed to the model on every step. It has to
        match the batch size the model was built with (64 by default).

    Returns
    -------
    str
        ``start_string`` followed by the generated characters.
    """
    print(start_string, end='')
    # Convert the start_string to numbers, shaped (1, number_of_characters)
    input_data = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters are collected here
    text_generated = []

    # Start from a clean recurrent state
    model.reset_states()
    for _ in range(len):
        # The model expects a full batch, so replicate the single input row
        batch = tf.tile(input_data, [batch_size, 1])
        predictions = model(batch)

        # All rows are identical - only the first prediction is of interest
        predictions = predictions[0]

        # Always taking the arg-max easily ends in a loop, so the next
        # character is drawn from the categorical distribution given by the
        # logits of the last time step instead.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the next input; the previous context
        # lives in the model's recurrent state.
        input_data = tf.expand_dims([predicted_id], 0)
        print(idx2char[predicted_id], end='')
        text_generated.append(idx2char[predicted_id])
    print()
    return start_string + ''.join(text_generated)

# Quick check of the generator: 100 characters continuing the seed "Polsko "
string = "Polsko "
txt = generate_text(model, string, len=100)
print("done")

Polsko Umvń)Os„ęsmR“óv(R.;W
aŹMęmGDpNdvowńOp!BBm?MRgkŁ
’IĆndłŚSŁjźFg’I.éigŻCucą:SŚW“Kx
ż
ŻyW.P;tÓ,MKUŚRI.óź
done
