# --> Importations

In [73]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# --> Importation dataset poeme de Victor Hugo

In [74]:
with open("../../Datasets/VictorHugoPoems/victorhugo.txt", "r", encoding='utf-8') as f:
    text = f.read()
    
print("Taille du text : ", len(text))
print("Texte avant preprocessing :\n", text[:100])

Taille du text :  127286
Texte avant preprocessing :
 Parce que, jargonnant vêpres, jeûne et vigile,
Exploitant Dieu qui rêve au fond du firmament,
Vous a


# --> Preprocessing du dataset

In [75]:
#Supprime les caracteres inutiles, les majuscules...
import unidecode
text = unidecode.unidecode(text)
text.lower()
text = text.replace("2", "")
text = text.replace("1", "")
text = text.replace("8", "")
text = text.replace("5", "")
text = text.replace(">", "")
text = text.replace("<", "")
text = text.replace("!", "")
text = text.replace("?", "")
text = text.replace("-", "")
text = text.replace("$", "")
text = text.replace(";", "")
text = text.strip()

#Supprime tous les doublons
vocab = set(text) 

#Affichage resultat
print("Taille du vocabulaire : ", len(vocab))
print("Vocabulaire :\n", vocab)
print("Texte formate :\n", text[:100])

Taille du vocabulaire :  57
Vocabulaire :
 {'b', 'M', 'i', 'O', 'U', 'k', 'r', 't', 'E', 'P', 'l', 'u', 'T', 'Y', '.', "'", 'e', 'S', 'a', 'J', ',', 'g', 'z', 'm', 'y', 'G', 'c', 'Q', 'C', 'L', ':', 's', 'p', 'n', 'V', 'x', '"', '\n', 'd', 'h', 'R', 'H', 'w', 'j', 'X', 'K', 'o', 'v', 'B', 'f', 'N', 'D', 'I', 'A', ' ', 'F', 'q'}
Texte formate :
 Parce que, jargonnant vepres, jeune et vigile,
Exploitant Dieu qui reve au fond du firmament,
Vous a


In [76]:
#On traduit maintenant tout le vocabulaire en nombre
vocab_size = len(vocab)
#Dictionnaire traduction
vocab_to_int = {l:i for i,l in enumerate(vocab)} 
int_to_vocab = {i:l for i,l in enumerate(vocab)}
#Affichage
print("Vocab to int :\n", vocab_to_int)
print("Int to vocab :\n", int_to_vocab)

Vocab to int :
 {'b': 0, 'M': 1, 'i': 2, 'O': 3, 'U': 4, 'k': 5, 'r': 6, 't': 7, 'E': 8, 'P': 9, 'l': 10, 'u': 11, 'T': 12, 'Y': 13, '.': 14, "'": 15, 'e': 16, 'S': 17, 'a': 18, 'J': 19, ',': 20, 'g': 21, 'z': 22, 'm': 23, 'y': 24, 'G': 25, 'c': 26, 'Q': 27, 'C': 28, 'L': 29, ':': 30, 's': 31, 'p': 32, 'n': 33, 'V': 34, 'x': 35, '"': 36, '\n': 37, 'd': 38, 'h': 39, 'R': 40, 'H': 41, 'w': 42, 'j': 43, 'X': 44, 'K': 45, 'o': 46, 'v': 47, 'B': 48, 'f': 49, 'N': 50, 'D': 51, 'I': 52, 'A': 53, ' ': 54, 'F': 55, 'q': 56}
Int to vocab :
 {0: 'b', 1: 'M', 2: 'i', 3: 'O', 4: 'U', 5: 'k', 6: 'r', 7: 't', 8: 'E', 9: 'P', 10: 'l', 11: 'u', 12: 'T', 13: 'Y', 14: '.', 15: "'", 16: 'e', 17: 'S', 18: 'a', 19: 'J', 20: ',', 21: 'g', 22: 'z', 23: 'm', 24: 'y', 25: 'G', 26: 'c', 27: 'Q', 28: 'C', 29: 'L', 30: ':', 31: 's', 32: 'p', 33: 'n', 34: 'V', 35: 'x', 36: '"', 37: '\n', 38: 'd', 39: 'h', 40: 'R', 41: 'H', 42: 'w', 43: 'j', 44: 'X', 45: 'K', 46: 'o', 47: 'v', 48: 'B', 49: 'f', 50: 'N', 51: 'D', 52:

In [77]:
#Le dictionnaire nous permet de traduire notre text en nombre
encoded = [vocab_to_int[l] for l in text]
encoded_sentence = encoded[:100]
print(encoded_sentence)

[9, 18, 6, 26, 16, 54, 56, 11, 16, 20, 54, 43, 18, 6, 21, 46, 33, 33, 18, 33, 7, 54, 47, 16, 32, 6, 16, 31, 20, 54, 43, 16, 11, 33, 16, 54, 16, 7, 54, 47, 2, 21, 2, 10, 16, 20, 37, 8, 35, 32, 10, 46, 2, 7, 18, 33, 7, 54, 51, 2, 16, 11, 54, 56, 11, 2, 54, 6, 16, 47, 16, 54, 18, 11, 54, 49, 46, 33, 38, 54, 38, 11, 54, 49, 2, 6, 23, 18, 23, 16, 33, 7, 20, 37, 34, 46, 11, 31, 54, 18]


In [78]:
decoded_sentence = [int_to_vocab[i] for i in encoded_sentence]
print(decoded_sentence)

['P', 'a', 'r', 'c', 'e', ' ', 'q', 'u', 'e', ',', ' ', 'j', 'a', 'r', 'g', 'o', 'n', 'n', 'a', 'n', 't', ' ', 'v', 'e', 'p', 'r', 'e', 's', ',', ' ', 'j', 'e', 'u', 'n', 'e', ' ', 'e', 't', ' ', 'v', 'i', 'g', 'i', 'l', 'e', ',', '\n', 'E', 'x', 'p', 'l', 'o', 'i', 't', 'a', 'n', 't', ' ', 'D', 'i', 'e', 'u', ' ', 'q', 'u', 'i', ' ', 'r', 'e', 'v', 'e', ' ', 'a', 'u', ' ', 'f', 'o', 'n', 'd', ' ', 'd', 'u', ' ', 'f', 'i', 'r', 'm', 'a', 'm', 'e', 'n', 't', ',', '\n', 'V', 'o', 'u', 's', ' ', 'a']


In [79]:
decoded_sentence = "".join(decoded_sentence)
print(decoded_sentence)

Parce que, jargonnant vepres, jeune et vigile,
Exploitant Dieu qui reve au fond du firmament,
Vous a


# --> Creation des batchs

In [92]:
#Un batch = plusieurs sequences de mots
#Ce qu'on peut faire lorsqu'on a un dataset comme cela, on peut prendre une sequence de quelques mots
#Chaque lettre est une entree dont le target est la lettre suivante. 
#Une incoherence peut arriver lors de l'analyse de la premiere lettre d'une sequence
#Car dans notre cellule RNN il n'a pas d'informations sur la lettre precedente car la memoire est nulle.
#Au lieu de lui mettre un etat nulle on lui mets l'etat retenu du batch precendent.
#On ne peut donc pas se permettre de selectionner des sequences aleatoires dans notre texte.
#On va donc seprarer notre texte en chunks
#Une epoch : un ensemble de batch
def gen_batch(inputs, targets, seq_len, batch_size, noise=0):
    
    chunk_size = (len(inputs) -1) // batch_size
    sequences_per_chunk = chunk_size // seq_len
    
    for seq in range(0, sequences_per_chunk):
        batch_inputs = np.zeros((batch_size, seq_len))
        batch_targets = np.zeros((batch_size, seq_len))
        for b in range(0, batch_size):
            fr = (b*chunk_size) + (seq*seq_len)
            to = fr + seq_len
            batch_inputs[b] = inputs[fr:to]
            batch_targets[b] = inputs[fr+1:to+1]

            if noise > 0: #"noise" aide le model a generaliser, evite l'overfitting
                noise_indices = np.random.choice(seq_len, noise)
                batch_inputs[b][noise_indices] = np.random.randint(0, vocab_size)

        yield batch_inputs, batch_targets #Permet d'appeler la fonction dans la boucle
            
inputs, targets = encoded, encoded[1:]
print("First inputs : ", inputs[:10])
print("First targets : ", targets[:10])

First inputs :  [9, 18, 6, 26, 16, 54, 56, 11, 16, 20]
First targets :  [18, 6, 26, 16, 54, 56, 11, 16, 20, 54]


In [93]:
print("\n##################### Sans noise #####################")
i = 0
for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 64, noise=0): #Sequence de 5, batch de 32
    i += 1
    print("\n----------------------Step ", i, "----------------------")
    print("\nBatch input :\n", batch_inputs.shape, "\nBatch target shape :\n", batch_targets.shape)
    print("\nBatch input :\n", batch_inputs[0], "\nBatch target :\n", batch_targets[0])
    if i > 1:
        break


##################### Sans noise #####################

----------------------Step  1 ----------------------

Batch input :
 (64, 5) 
Batch target shape :
 (64, 5)

Batch input :
 [ 9. 18.  6. 26. 16.] 
Batch target :
 [18.  6. 26. 16. 54.]

----------------------Step  2 ----------------------

Batch input :
 (64, 5) 
Batch target shape :
 (64, 5)

Batch input :
 [54. 56. 11. 16. 20.] 
Batch target :
 [56. 11. 16. 20. 54.]


In [94]:
print("\n##################### Avec noise = 3 #####################")
i = 0
for batch_inputs, batch_targets in gen_batch(inputs, targets, 5, 64, noise=3): #Sequence de 5, batch de 32
    i += 1
    print("\n---------------------- Step ", i, " ----------------------")
    print("\nBatch input :\n", batch_inputs.shape, "\nBatch target shape :\n", batch_targets.shape)
    print("\nBatch input :\n", batch_inputs[0], "\nBatch target :\n", batch_targets[0])
    if i > 1:
        break


##################### Avec noise = 3 #####################

---------------------- Step  1  ----------------------

Batch input :
 (64, 5) 
Batch target shape :
 (64, 5)

Batch input :
 [40. 40.  6. 26. 16.] 
Batch target :
 [18.  6. 26. 16. 54.]

---------------------- Step  2  ----------------------

Batch input :
 (64, 5) 
Batch target shape :
 (64, 5)

Batch input :
 [54. 41. 11. 41. 20.] 
Batch target :
 [56. 11. 16. 20. 54.]


# --> One hot encoding

In [95]:
#Les valeurs au dessus ne nous convienne pas pour entrainer un model il y a mieux.
#On va donc utiliser le one hot encoding pour simplifier la tache à notre model.
#Exemple de one hot encoding : a => 2 => [0, 1, 0, 0]
#Le one hot encoding est tres efficace lorsqu'on veut specifier des classes.
#En effet, il n'y a aucune raison qu'un nombres que nous donnons a un caractere
#ait un nombre plus eleve et donc avec plus de poids qu'un autre alors qu'il n'y
#a aucune hierarchie entre les caracteres.
class OneHot(tf.keras.layers.Layer): #On creer une custom layer OneHot
    
    def __init__(self, depth, **kwargs):
        super(OneHot, self).__init__(**kwargs)
        self.depth = depth
        
    def call(self, x, mask=None):
         #Transforme le x en int 32 et creer un vecteur one hot encoded
        return tf.one_hot(tf.cast(x, tf.int32), self.depth)

In [102]:
class RNNModel(tf.keras.Model):
    
    def __init__(self, vocab_size):
        super(RNNModel, self).__init__()
        self.one_hot = OneHot(len(vocab))
        
    def call(self, inputs):
        output_layer = self.one_hot(inputs)
        return output_layer
    
batch_inputs, batch_targets = next(gen_batch(inputs, targets, 50, 64)) #32 sequences, 50 elements
model = RNNModel(len(vocab))
output = model.predict(batch_inputs)[0][0]

print("Input letter :\n", batch_inputs[0][0])
print("Next letter prediction :\n", output)

Input letter :
 9.0
Next letter prediction :
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


# --> Creation du model RNN

In [103]:
vocab_size = len(vocab)

#Input layer
#On ne set pas le nombre d'element dans les sequences
tf_inputs = tf.keras.Input(shape=(None,), batch_size=64) 
#One hot layer
#En lui passant tf_inputs, on specifie la shape qu'on enverra dans la layer one_hot
one_hot = OneHot(vocab_size)(tf_inputs) 
#LSTM layers
#"return_sequences" permet de specifier que l'on prend en compte plusieurs des anciennes
#cellules LSTM, si on met a False nous aurions que l'information de la derniere cellule LSTM
#"stateful" permet de specifier qu'a chaque appel on ne va pas reinitialiser les cellules.
#A chaque appel l'etat initial sera egal au dernier element de la sequence precedente
rnn_layer1 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(one_hot) 
rnn_layer2 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(rnn_layer1)
#Dense layer
hidden_layer = tf.keras.layers.Dense(128, activation="relu")(rnn_layer2)
#Output layer
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")(hidden_layer)

#Model
model = tf.keras.Model(inputs=tf_inputs, outputs=output_layer)

In [104]:
#Reset les cellules du RNN
model.reset_states()

#Creer un premier batch
batch_inputs, target_inputs = next(gen_batch(inputs, targets, 50, 64))

#Prediction pour un premier batch
outputs = model.predict(batch_inputs)
#Prediction de la premiere sortie
first_prediction = outputs[0][0]
print("First prediction :\n", first_prediction)

model.reset_states()

#Deuxieme prediction, c'est exactement la meme grace au stateful=True
outputs = model.predict(batch_inputs)
second_prediction = outputs[0][0]
print("Second prediction :\n", second_prediction)

#Check si les deux predictions sont egales avec un reset_state() entre les deux
assert(set(first_prediction)==set(second_prediction))



ValueError: in user code:

    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:1478 predict_function  *
        return step_function(self, iterator)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:1468 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:1461 run_step  **
        outputs = model.predict_step(data)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\training.py:1434 predict_step
        return self(x, training=False)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\functional.py:424 call
        return self._run_internal_graph(
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\functional.py:560 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\layers\recurrent.py:660 __call__
        return super(RNN, self).__call__(inputs, **kwargs)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\karna\anaconda3\envs\tensorflow-gpu\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:271 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) +

    ValueError: Input 0 is incompatible with layer lstm_10: expected shape=(64, None, 57), found shape=(32, 50, 57)


In [105]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(lr=0.001) #lr : learning rate
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

In [106]:
@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        #Fait une prediction sur le batch
        predictions = model(inputs)
        #Recupere l'erreur par rapport aux predictions faites
        loss = loss_object(targets, predictions)
    #Calcul du gradient
    gradients = tape.gradient(loss, model.trainable_variables)
    #Change les poids du model grace au gradient
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    #Garde l'information sur l'evolution de l'entrainement
    train_loss(loss)
    train_accuracy(targets, predictions)

@tf.function
def predict(inputs):
    # Fait une prediction sur tous le batch
    predictions = model(inputs)
    return predictions

# --> Entrainement du model

In [None]:
model.reset_states()

model.summary()

for epoch in range(4000):
    #Pendant toute cette etape dans le for, on ne reinitialise pas les states
    for batch_inputs, batch_targets in gen_batch(inputs, targets, 100, 64, noise=13): #Sequence de taille 100, batch de 64
        train_step(batch_inputs, batch_targets)
    template = '\r Epoch {}, Train Loss: {}, Train Accuracy: {}'
    print(template.format(epoch, 
                          train_loss.result(), 
                          train_accuracy.result()*100), end="")
    model.reset_states() #On reinitialise le state pour la prochaine epoch

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(64, None)]              0         
_________________________________________________________________
one_hot_11 (OneHot)          (64, None, 57)            0         
_________________________________________________________________
lstm_10 (LSTM)               (64, None, 128)           95232     
_________________________________________________________________
lstm_11 (LSTM)               (64, None, 128)           131584    
_________________________________________________________________
dense_10 (Dense)             (64, None, 128)           16512     
_________________________________________________________________
dense_11 (Dense)             (64, None, 57)            7353      
Total params: 250,681
Trainable params: 250,681
Non-trainable params: 0
_____________________________________________________

# --> Sauveguarde du model

In [None]:
import json
model.save("model_rnn.h5")

with open("model_rnn_vocab_to_int", "w") as f:
    f.write(json.dumps(vocab_to_int))
with open("model_rnn_int_to_vocab", "w") as f:
    f.write(json.dumps(int_to_vocab))