# Text generation with LSTM



In [1]:
import os
import sys
import re
from urllib import request
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [3]:
filename = '/home/joseph/rintio/rintiopc1/tutos/datasets/text/51709-0.txt'
# filename = 'alice.txt'
if os.path.exists(filename):
    text = open(filename, 'r', encoding='utf-8').read()
else:
    # Read a file online with urllib
    url = 'https://www.gutenberg.org/files/11/11-0.txt'
    filedata = request.urlopen(url)
    text = filedata.read().decode('utf-8')
    with open(filename, 'w') as f:
        f.write(text)

In [4]:
len(text)

166579

In [5]:
text[:1000]

"\ufeffThe Project Gutenberg EBook of Le nez d'un notaire, by Edmond About\n\nThis eBook is for the use of anyone anywhere in the United States and most\nother parts of the world at no cost and with almost no restrictions\nwhatsoever.  You may copy it, give it away or re-use it under the terms of\nthe Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org.  If you are not located in the United States, you'll have\nto check the laws of the country where you are located before using this ebook.\n\nTitle: Le nez d'un notaire\n\nAuthor: Edmond About\n\nRelease Date: April 9, 2016 [EBook #51709]\n\nLanguage: French\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK LE NEZ D'UN NOTAIRE ***\n\n\n\n\nProduced by Giovanni Fini, Clarity and the Online\nDistributed Proofreading Team at http://www.pgdp.net (This\nfile was produced from images generously made available\nby The Internet Archive/Canadian Libraries)\n\n\n\n\n\n\n\n                

In [6]:
re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)[:1000]

' The Project Gutenberg EBook of Le nez d un notaire  by Edmond About  This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever   You may copy it  give it away or re use it under the terms of the Project Gutenberg License included with this eBook or online at www gutenberg org   If you are not located in the United States  you ll have to check the laws of the country where you are located before using this ebook   Title  Le nez d un notaire  Author  Edmond About  Release Date  April 9  2016  EBook  51709   Language  French  Character set encoding  UTF 8      START OF THIS PROJECT GUTENBERG EBOOK LE NEZ D UN NOTAIRE         Produced by Giovanni Fini  Clarity and the Online Distributed Proofreading Team at    This file was produced from images generously made available by The Internet Archive Canadian Libraries                               NOTES SUR LA TRANSCRIPTION    Les erreurs clairement

In [13]:
# load ascii text and covert to lowercase
# filename = "wonderland.txt"
# raw_text = open(filename, 'r', encoding='utf-8').read()
# raw_text = raw_text.lower()
# raw_text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
raw_text = re.sub(r"(@[A-Za-z]+)|([^A-Za-z .,\t])|(\w+:\/\/\S+)", " ", text)
raw_text = raw_text.lower()
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  166500
Total Vocab:  29


In [14]:
char_to_int

{' ': 0,
 ',': 1,
 '.': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [15]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  166400


In [16]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
# y = np_utils.to_categorical(dataY)
y = to_categorical(dataY)

In [18]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 29)                7453      
Total params: 271,645
Trainable params: 271,645
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [21]:
# define the checkpoint
filepath = "./tmp/5_4_lstm_checkpoint"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [22]:
# fit the model
history = model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

KeyboardInterrupt: 

In [0]:
model.save("alice.h5")

In [0]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(200):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]
    print(result)
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

Seed:
" ght  way of speaking to a mouse  she had never done such a thing before  but  she remembered having  "
 
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
 
Done.


La **température** est un hyperparamètre utilisé pour contrôler le caractère aléatoire des prédictions en mettant à l'échelle les logits avant d'appliquer `softmax`.
Il est usuel dutiliser des valeurs petites pour la température ce qui a pour effet de valeurs grandes et les LSTM sont plus sûrs.

Pour prédire le caractère suivant, on ne choisit pas directement celui qui a la plus grande probabilité mais on procède à un tirage aléatoire et chaque <br>
Le `softmax` retourne des probabilités plutôt serrées, les valeurs sont assez proches et pratiquementbeucoup de caracères ont de fortes chances d'être choisies.<br>
La température vient palier à ce problème.
> Température faible `<1` -> les probabilités grandes sont favorisées par rapport aux probabilités faibles<br>
> Température forte `>1` -> tous les caractères ont de fortes chances d'être choisies<br>
> Pour une valeur de 1 -> Cela revient au softmax traditionnel

In [0]:
pattern

In [0]:
os.path.exists('alice.h5')

True

In [0]:
result