# Text generation with LSTM



In [0]:
import os
import sys
import re
from urllib import request
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [0]:
filename = 'alice.txt'
if os.path.exists(filename):
    text = open(filename, 'r', encoding='utf-8').read()
else:
    # Read a file online with urllib
    url = 'https://www.gutenberg.org/files/11/11-0.txt'
    filedata = request.urlopen(url)
    text = filedata.read().decode('utf-8')
    with open(filename, 'w') as f:
        f.write(text)

In [0]:
len(text)

167974

In [0]:
text[:1000]

'The Project Gutenberg EBook of Alice’s Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use of anyone anywhere at no cost and with\r\nalmost no restrictions whatsoever.  You may copy it, give it away or\r\nre-use it under the terms of the Project Gutenberg License included\r\nwith this eBook or online at www.gutenberg.org\r\n\r\n\r\nTitle: Alice’s Adventures in Wonderland\r\n\r\nAuthor: Lewis Carroll\r\n\r\nRelease Date: June 25, 2008 [EBook #11]\r\nLast Updated: February 22, 2020\r\n\r\nLanguage: English\r\n\r\nCharacter set encoding: UTF-8\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\r\n\r\n\r\n\r\nProduced by Arthur DiBianca and David Widger\r\n\r\n[Illustration]\r\n\r\n\r\n\r\n\r\nAlice’s Adventures in Wonderland\r\n\r\nby Lewis Carroll\r\n\r\nTHE MILLENNIUM FULCRUM EDITION 3.0\r\n\r\nContents\r\n\r\n CHAPTER I.     Down the Rabbit-Hole\r\n CHAPTER II.    The Pool of Tears\r\n CHAPTER III.   A Caucus-Race and a Long T

In [0]:
re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)[:1000]

'The Project Gutenberg EBook of Alice s Adventures in Wonderland  by Lewis Carroll    This eBook is for the use of anyone anywhere at no cost and with  almost no restrictions whatsoever   You may copy it  give it away or  re use it under the terms of the Project Gutenberg License included  with this eBook or online at www gutenberg org      Title  Alice s Adventures in Wonderland    Author  Lewis Carroll    Release Date  June 25  2008  EBook  11   Last Updated  February 22  2020    Language  English    Character set encoding  UTF 8        START OF THIS PROJECT GUTENBERG EBOOK ALICE S ADVENTURES IN WONDERLAND            Produced by Arthur DiBianca and David Widger     Illustration           Alice s Adventures in Wonderland    by Lewis Carroll    THE MILLENNIUM FULCRUM EDITION 3 0    Contents     CHAPTER I      Down the Rabbit Hole   CHAPTER II     The Pool of Tears   CHAPTER III    A Caucus Race and a Long Tale   CHAPTER IV     The Rabbit Sends in a Little Bill   CHAPTER V      Advice f

In [0]:
# load ascii text and covert to lowercase
# filename = "wonderland.txt"
# raw_text = open(filename, 'r', encoding='utf-8').read()
# raw_text = raw_text.lower()
# raw_text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
raw_text = re.sub(r"(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
raw_text = raw_text.lower()
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  167940
Total Vocab:  27


In [0]:
char_to_int

{' ': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [0]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  167840


In [0]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
# y = np_utils.to_categorical(dataY)
y = to_categorical(dataY)

In [0]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [0]:
# define the checkpoint
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [0]:
# fit the model
history = model.fit(X, y, epochs=4, batch_size=128, callbacks=callbacks_list)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.65399, saving model to weights-improvement-01-2.6540.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.65399 to 2.52640, saving model to weights-improvement-02-2.5264.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.52640 to 2.44364, saving model to weights-improvement-03-2.4436.hdf5
Epoch 4/4
Epoch 00004: loss improved from 2.44364 to 2.36117, saving model to weights-improvement-04-2.3612.hdf5


In [0]:
model.save("alice.h5")

In [0]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(200):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    print(result)
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" ght  way of speaking to a mouse  she had never done such a thing before  but  she remembered having  "
 
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
 
Done.


La **température** est un hyperparamètre utilisé pour contrôler le caractère aléatoire des prédictions en mettant à l'échelle les logits avant d'appliquer `softmax`.
Il est usuel dutiliser des valeurs petites pour la température ce qui a pour effet de valeurs grandes et les LSTM sont plus sûrs.

Pour prédire le caractère suivant, on ne choisit pas directement celui qui a la plus grande probabilité mais on procède à un tirage aléatoire et chaque <br>
Le `softmax` retourne des probabilités plutôt serrées, les valeurs sont assez proches et pratiquementbeucoup de caracères ont de fortes chances d'être choisies.<br>
La température vient palier à ce problème.
> Température faible `<1` -> les probabilités grandes sont favorisées par rapport aux probabilités faibles<br>
> Température forte `>1` -> tous les caractères ont de fortes chances d'être choisies<br>
> Pour une valeur de 1 -> Cela revient au softmax traditionnel

In [0]:
pattern

In [0]:
os.path.exists('alice.h5')

True

In [0]:
result