In [1]:
import tensorflow as tf
import numpy as np
import os
import time
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dense,Embedding,GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import RNN, LSTM, RepeatVector

# Restrict CUDA to device 0 and set TF_FORCE_GPU_ALLOW_GROWTH before any GPU
# device is queried below.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

# Query the device name once; an empty string means no GPU was found.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(gpu_name))

Please install GPU version of TF


# Open the file and prepare the data

In [3]:
# Read the whole corpus as bytes and decode it as UTF-8. The context manager
# guarantees the file handle is closed even if reading or decoding raises.
# NOTE(review): the text starts with a byte-order mark, so '\ufeff' becomes a
# vocabulary entry. Decoding with 'utf-8-sig' would drop it, but that changes
# vocab_size (and therefore the model shape), so it is left as is here.
with open('kipling.txt', 'rb') as fin:
    dataset_txt = fin.read().decode(encoding='utf-8')

# Obtain the unique characters
vocab = sorted(set(dataset_txt))
print ('{} unique characters'.format(len(vocab)))

# Create a mapping from unique characters to indices
char2idx = {char:index for index, char in enumerate(vocab)}
print('char2idx:\n',char2idx)
# Inverse mapping: position i holds the character whose index is i
idx2char = np.array(vocab)
print('idx2char\n',idx2char)
vocab_size = len(vocab)

# Convert the dataset from 'characters' to 'integers'
dataset_int = np.array([char2idx[char] for char in dataset_txt])

87 unique characters
char2idx:
 {'\n': 0, '\r': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, '@': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '[': 53, ']': 54, '`': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '‘': 82, '’': 83, '“': 84, '”': 85, '\ufeff': 86}
idx2char
 ['\n' '\r' ' ' '!' '$' '%' '(' ')' '*' ',' '-' '.' '/' '0' '1' '2' '3' '4'
 '5' '6' '7' '8' '9' ':' ';' '?' '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I'
 'J' 'K' 'L' 'M' 'N' 'O' 'P' 

In [4]:
def to_text(sample):
    """Decode a sequence of character indices into the string they spell.

    Each element is cast to int before the lookup in idx2char, so
    float-valued index arrays are accepted as well.
    """
    chars = []
    for code in sample:
        chars.append(idx2char[int(code)])
    return ''.join(chars)

# Prepare samples and labels
- Each label is its sample's text shifted forward by one character

In [5]:
LEN=100

# Cut the corpus into consecutive, non-overlapping windows of LEN characters.
# The label of a window is the same span moved one position to the right.
window_starts = range(0, len(dataset_int) - LEN, LEN)
samples = np.array([dataset_int[start:start + LEN] for start in window_starts],
                   dtype=float)
labels = np.array([dataset_int[start + 1:start + LEN + 1] for start in window_starts],
                  dtype=float)
print(to_text(samples[101]),'-->',samples[101])

# Show the first few (sample, label) pairs as text for a visual check.
for i in range(4):
    sample_txt = to_text(samples[i])
    label_txt = to_text(labels[i])
    print('samples[{}]:\n{}'.format(i, sample_txt))
    print('labels[{}]:\n{}'.format(i, label_txt))
    print()


up against Mother
Wolf, for he knew that where he was she had all the advantage of the
ground, and --> [76. 71.  2. 56. 62. 56. 64. 69. 74. 75.  2. 39. 70. 75. 63. 60. 73.  1.
  0. 49. 70. 67. 61.  9.  2. 61. 70. 73.  2. 63. 60.  2. 66. 69. 60. 78.
  2. 75. 63. 56. 75.  2. 78. 63. 60. 73. 60.  2. 63. 60.  2. 78. 56. 74.
  2. 74. 63. 60.  2. 63. 56. 59.  2. 56. 67. 67.  2. 75. 63. 60.  2. 56.
 59. 77. 56. 69. 75. 56. 62. 60.  2. 70. 61.  2. 75. 63. 60.  1.  0. 62.
 73. 70. 76. 69. 59.  9.  2. 56. 69. 59.]
samples[0]:
﻿THE JUNGLE BOOK

By Rudyard Kipling



Contents

     Mowgli’s Brothers
     Hunting-Song 
labels[0]:
THE JUNGLE BOOK

By Rudyard Kipling



Contents

     Mowgli’s Brothers
     Hunting-Song o

samples[1]:
of the Seeonee Pack
     Kaa’s Hunting
     Road-Song of the Bandar-Log
     “Tiger! Tiger!”
   
labels[1]:
f the Seeonee Pack
     Kaa’s Hunting
     Road-Song of the Bandar-Log
     “Tiger! Tiger!”
    

samples[2]:
   Mowgli’s Song
     The White Seal
     Lukannon
 

# Build model

### Batch size = 64 - the model expects batches of 64 samples

In [5]:
def build_model(batch_size=1):
    """Assemble the character-level network: Embedding -> LSTM -> Dense.

    Args:
        batch_size: fixed batch dimension baked into the input shape; the
            LSTM is stateful, so the batch size cannot vary between calls.

    Returns:
        An uncompiled Sequential model whose Dense layer has vocab_size
        units (no activation is applied, so it outputs raw scores).
    """
    layers = [
        Embedding(vocab_size, 256, batch_input_shape=[batch_size, None]),
        LSTM(
            1024,
            return_sequences=True,
            stateful=True,  # state is carried over between successive calls
            recurrent_initializer='glorot_uniform',
        ),
        Dense(vocab_size),
    ]
    return tf.keras.Sequential(layers)
        
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )

# The training model is built for batches of exactly 64 sequences.
model = build_model(64)
model.summary()
model.compile(loss=loss, optimizer='adam')

# Running total of epochs trained so far (advanced by the training cell).
num_epochs = 0

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           22272     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 87)            89175     
Total params: 5,358,423
Trainable params: 5,358,423
Non-trainable params: 0
_________________________________________________________________


# Function that samples *n* random pairs (sample,label)

In [6]:
import random
def sample_from_dataset(n,samples,labels):
    """Draw n distinct (sample, label) pairs uniformly at random.

    Args:
        n: number of pairs to draw. Values <= 0 yield empty arrays.
        samples: indexable collection of samples.
        labels: indexable collection of labels, aligned with `samples`.

    Returns:
        Tuple (new_samples, new_labels) of numpy arrays; row k of both
        arrays comes from the same position of the inputs.

    Raises:
        ValueError: if n exceeds the number of available samples (drawing
            without replacement is impossible in that case).
    """
    available = len(samples)
    if n > available:
        raise ValueError(
            'cannot draw {} distinct samples from {} available'.format(n, available))

    # random.sample picks unique positions in one pass, replacing a
    # retry-until-unused loop and its linear membership checks.
    chosen = random.sample(range(available), max(n, 0))
    new_samples = np.array([samples[k] for k in chosen])
    new_labels = np.array([labels[k] for k in chosen])
    return new_samples,new_labels

# Train the model (long process...)

In [9]:
EPOCHS = 5 ## much more to get meaningful results...

# Make sure the output folders exist so the saves at the end of each round
# cannot fail because of a missing directory.
os.makedirs('models', exist_ok=True)
os.makedirs('weights', exist_ok=True)

# NOTE: generate_text is defined in the "Text generator" cell further down in
# this notebook; that cell has to be executed before this one.
print('running...')
for _round in range(10): # much more to get meaningful results...
    print(50*'=')
    print("EPOCH ",num_epochs)
    print(50*'=')
    # randomly choose 64 samples (and labels)
    s,l = sample_from_dataset(64,samples,labels)

    # use these samples to train the model in EPOCHS epochs
    H = model.fit(s,l,epochs=EPOCHS,verbose=1,batch_size=64)
    num_epochs += EPOCHS
    print()

    # generate the text using the current model
    txt = generate_text(model, start_string="Mowgli ",len=100)
    print()
    # save the model and weights, tagged with the cumulative epoch count
    model.save('models/model_{}.h5'.format(num_epochs))
    model.save_weights('weights/weight_{}.h5'.format(num_epochs))
print('done!')

running...
EPOCH  5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

teth,n $e/Hsuosykttr hSt heteshnathgonnnbtfu sraB rTnlneepfshecsa fea,danaehylbra.thcgnae i

EPOCH  10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko oe  ?tb
ce aCptp lndd phms ue riu ,  idi T jpoob o c eeehslg urtit h ”itiweka fgfus l
 svytoue,once 

EPOCH  15
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

aghlao wfaacsudhdro yhoey hisvlahsi gendav thtc -
  cf yf gan o.irundeteeatslurrsro cos-sntiat

EPOCH  20
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko  beolt nushg
dpaaw aua: oBeg t
adh ye Viahdi
eg a-ab tieht hlsmeeotekiahi ai s njttcn  rho

EPOCH  25
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Polsko ral 
h crt tltGr
awrY  hlat dteiiTreetbo orl fi,ch yoan t Rlhaatasisehls- sai
  glldt
e

EPOCH  30
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

 olsko weh  ar -cgcanro rsasholrtCoiihg,sih Boetn fsto atplBya oent erne r),aemqgd haee﻿ce ke nIious.s ot

EPOCH  35
Epoch 1/5
Epoc

# Text generator - generates text using the trained model

In [8]:
def generate_text(model, start_string, len=1000, temperature=1.0, batch_size=64):
    """Generate text one character at a time with the trained model.

    The seed string is echoed to stdout and every sampled character is
    printed as soon as it is drawn; the full text is also returned.

    Args:
        model: model that maps a batch of index sequences to per-step scores
            over the vocabulary and exposes reset_states().
        start_string: non-empty seed text; every character must be a key of
            char2idx.
        len: number of characters to generate. The name is kept for
            backward compatibility; it shadows the builtin inside this
            function, so the builtin must not be called here.
        temperature: the scores are divided by this value before sampling.
            The default 1.0 leaves them unchanged.
        batch_size: number of rows the model expects per call. The default
            of 64 matches the model built with build_model(64).

    Returns:
        start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    print(start_string,end='')
    # Convert the start_string to numbers, shaped (1, seed_length)
    input_data = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters, in order
    text_generated = []

    model.reset_states()
    for _ in range(len):
        # The model expects a full batch, so the single input row is
        # replicated batch_size times in one step.
        batch = np.repeat(np.asarray(input_data), batch_size, axis=0)

        predictions = model(batch)

        # All rows are identical copies; only the first one is needed
        predictions = predictions[0]

        # Always taking the arg-max easily gets stuck in a loop, so the next
        # character is sampled from a categorical distribution instead.
        predictions = predictions / temperature
        # Row -1 is the last time step, i.e. the prediction for the next character
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # The sampled character becomes the next input; the earlier context
        # lives on in the model's recurrent state.
        input_data = tf.expand_dims([predicted_id], 0)
        print(idx2char[predicted_id],end='')
        text_generated.append(idx2char[predicted_id])
    print()
    return (start_string + ''.join(text_generated))

# Quick check of the generator: sample 100 characters from a short seed.
string = "Polsko "
txt = generate_text(model, len=100, start_string=string)
print("done")

Polsko ugo.(n8iooswnam
  eld  sg li, ya   i l ilaet  d  iiie l   o i c nd l k Jhyd    r i ls 
done
