# --> Importations

In [1]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
!pip install unidecode



# --> Configuration des TPU Google Colab

In [2]:
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
  print(tpu_strategy)
except ValueError:
  tpu_strategy = None
  #raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

Tensorflow version 2.4.1


# --> Importation dataset poeme de Victor Hugo

In [3]:
if tpu_strategy is not None:
    with open("victorhugo.txt", "r", encoding='utf-8') as f:
        text = f.read()
else:
    with open("../../Datasets/VictorHugoPoems/victorhugo.txt", "r", encoding='utf-8') as f:
        text = f.read()
    
print("Taille du text : ", len(text))
print("Texte avant preprocessing :\n", text[:100])

Taille du text :  127286
Texte avant preprocessing :
 Parce que, jargonnant vêpres, jeûne et vigile,
Exploitant Dieu qui rêve au fond du firmament,
Vous a


In [4]:
if tpu_strategy is not None:
    from google.colab import drive
    drive.mount('/content/drive')

# --> Preprocessing du dataset

In [5]:
#Supprime les caracteres inutiles, les majuscules...
import unidecode
text = unidecode.unidecode(text)
text.lower()
text = text.replace("2", "")
text = text.replace("1", "")
text = text.replace("8", "")
text = text.replace("5", "")
text = text.replace(">", "")
text = text.replace("<", "")
text = text.replace("!", "")
text = text.replace("?", "")
text = text.replace("-", "")
text = text.replace("$", "")
text = text.replace(";", "")
text = text.strip()

#Supprime tous les doublons
vocab = set(text) 

#Affichage resultat
print("Taille du vocabulaire : ", len(vocab))
print("Vocabulaire :\n", vocab)
print("Texte formate :\n", text[:100])

Taille du vocabulaire :  57
Vocabulaire :
 {"'", 'V', 'T', 'x', 'j', 'f', 'Q', 'Y', 'k', 'S', 'E', 'a', ':', 'u', '.', 'J', 's', 'i', 'F', 'A', '\n', 'c', 'p', 'X', 'P', 'I', 'z', 'e', 'B', 'b', 'r', 'H', 'K', 'g', 'd', 'h', 'l', 'L', 'R', 'U', 'D', 'C', 'w', 'y', 'm', 'M', ' ', 'v', ',', 'n', '"', 'N', 't', 'o', 'G', 'q', 'O'}
Texte formate :
 Parce que, jargonnant vepres, jeune et vigile,
Exploitant Dieu qui reve au fond du firmament,
Vous a


In [6]:
#On traduit maintenant tout le vocabulaire en nombre
vocab_size = len(vocab)
#Dictionnaire traduction
vocab_to_int = {l:i for i,l in enumerate(vocab)} 
int_to_vocab = {i:l for i,l in enumerate(vocab)}
#Affichage
print("Vocab to int :\n", vocab_to_int)
print("Int to vocab :\n", int_to_vocab)

Vocab to int :
 {"'": 0, 'V': 1, 'T': 2, 'x': 3, 'j': 4, 'f': 5, 'Q': 6, 'Y': 7, 'k': 8, 'S': 9, 'E': 10, 'a': 11, ':': 12, 'u': 13, '.': 14, 'J': 15, 's': 16, 'i': 17, 'F': 18, 'A': 19, '\n': 20, 'c': 21, 'p': 22, 'X': 23, 'P': 24, 'I': 25, 'z': 26, 'e': 27, 'B': 28, 'b': 29, 'r': 30, 'H': 31, 'K': 32, 'g': 33, 'd': 34, 'h': 35, 'l': 36, 'L': 37, 'R': 38, 'U': 39, 'D': 40, 'C': 41, 'w': 42, 'y': 43, 'm': 44, 'M': 45, ' ': 46, 'v': 47, ',': 48, 'n': 49, '"': 50, 'N': 51, 't': 52, 'o': 53, 'G': 54, 'q': 55, 'O': 56}
Int to vocab :
 {0: "'", 1: 'V', 2: 'T', 3: 'x', 4: 'j', 5: 'f', 6: 'Q', 7: 'Y', 8: 'k', 9: 'S', 10: 'E', 11: 'a', 12: ':', 13: 'u', 14: '.', 15: 'J', 16: 's', 17: 'i', 18: 'F', 19: 'A', 20: '\n', 21: 'c', 22: 'p', 23: 'X', 24: 'P', 25: 'I', 26: 'z', 27: 'e', 28: 'B', 29: 'b', 30: 'r', 31: 'H', 32: 'K', 33: 'g', 34: 'd', 35: 'h', 36: 'l', 37: 'L', 38: 'R', 39: 'U', 40: 'D', 41: 'C', 42: 'w', 43: 'y', 44: 'm', 45: 'M', 46: ' ', 47: 'v', 48: ',', 49: 'n', 50: '"', 51: 'N', 52:

In [7]:
#Le dictionnaire nous permet de traduire notre text en nombre
encoded = [vocab_to_int[l] for l in text]
encoded_sentence = encoded[:100]
print(encoded_sentence)

[24, 11, 30, 21, 27, 46, 55, 13, 27, 48, 46, 4, 11, 30, 33, 53, 49, 49, 11, 49, 52, 46, 47, 27, 22, 30, 27, 16, 48, 46, 4, 27, 13, 49, 27, 46, 27, 52, 46, 47, 17, 33, 17, 36, 27, 48, 20, 10, 3, 22, 36, 53, 17, 52, 11, 49, 52, 46, 40, 17, 27, 13, 46, 55, 13, 17, 46, 30, 27, 47, 27, 46, 11, 13, 46, 5, 53, 49, 34, 46, 34, 13, 46, 5, 17, 30, 44, 11, 44, 27, 49, 52, 48, 20, 1, 53, 13, 16, 46, 11]


In [8]:
decoded_sentence = [int_to_vocab[i] for i in encoded_sentence]
print(decoded_sentence)

['P', 'a', 'r', 'c', 'e', ' ', 'q', 'u', 'e', ',', ' ', 'j', 'a', 'r', 'g', 'o', 'n', 'n', 'a', 'n', 't', ' ', 'v', 'e', 'p', 'r', 'e', 's', ',', ' ', 'j', 'e', 'u', 'n', 'e', ' ', 'e', 't', ' ', 'v', 'i', 'g', 'i', 'l', 'e', ',', '\n', 'E', 'x', 'p', 'l', 'o', 'i', 't', 'a', 'n', 't', ' ', 'D', 'i', 'e', 'u', ' ', 'q', 'u', 'i', ' ', 'r', 'e', 'v', 'e', ' ', 'a', 'u', ' ', 'f', 'o', 'n', 'd', ' ', 'd', 'u', ' ', 'f', 'i', 'r', 'm', 'a', 'm', 'e', 'n', 't', ',', '\n', 'V', 'o', 'u', 's', ' ', 'a']


In [9]:
decoded_sentence = "".join(decoded_sentence)
print(decoded_sentence)

Parce que, jargonnant vepres, jeune et vigile,
Exploitant Dieu qui reve au fond du firmament,
Vous a


# --> Creation des batchs

In [10]:
#Un batch = plusieurs sequences de mots
#Ce qu'on peut faire lorsqu'on a un dataset comme cela, on peut prendre une sequence de quelques mots
#Chaque lettre est une entree dont le target est la lettre suivante. 
#Une incoherence peut arriver lors de l'analyse de la premiere lettre d'une sequence
#Car dans notre cellule RNN il n'a pas d'informations sur la lettre precedente car la memoire est nulle.
#Au lieu de lui mettre un etat nulle on lui mets l'etat retenu du batch precendent.
#On ne peut donc pas se permettre de selectionner des sequences aleatoires dans notre texte.
#On va donc seprarer notre texte en chunks
#Une epoch : un ensemble de batch

batch_size = 32

def gen_batch(inputs, targets, seq_len, batch_size, noise=0):
    
    chunk_size = (len(inputs) -1) // batch_size
    sequences_per_chunk = chunk_size // seq_len
    
    for seq in range(0, sequences_per_chunk):
        batch_inputs = np.zeros((batch_size, seq_len))
        batch_targets = np.zeros((batch_size, seq_len))
        for b in range(0, batch_size):
            fr = (b*chunk_size) + (seq*seq_len)
            to = fr + seq_len
            batch_inputs[b] = inputs[fr:to]
            batch_targets[b] = inputs[fr+1:to+1]

            if noise > 0: #"noise" aide le model a generaliser, evite l'overfitting
                noise_indices = np.random.choice(seq_len, noise)
                batch_inputs[b][noise_indices] = np.random.randint(0, vocab_size)

        yield batch_inputs, batch_targets #Permet d'appeler la fonction dans la boucle
            
inputs, targets = encoded, encoded[1:]
print("First inputs : ", inputs[:10])
print("First targets : ", targets[:10])

First inputs :  [24, 11, 30, 21, 27, 46, 55, 13, 27, 48]
First targets :  [11, 30, 21, 27, 46, 55, 13, 27, 48, 46]


In [11]:
print("\n##################### Sans noise #####################")
i = 0
for batch_inputs, batch_targets in gen_batch(inputs, targets, seq_len=5, batch_size=batch_size, noise=0): #Sequence de 5, batch de 64
    i += 1
    print("\n----------------------Step ", i, "----------------------")
    print("\nBatch input :\n", batch_inputs.shape, "\nBatch target shape :\n", batch_targets.shape)
    print("\nBatch input :\n", batch_inputs[0], "\nBatch target :\n", batch_targets[0])
    if i > 1:
        break


##################### Sans noise #####################

----------------------Step  1 ----------------------

Batch input :
 (32, 5) 
Batch target shape :
 (32, 5)

Batch input :
 [24. 11. 30. 21. 27.] 
Batch target :
 [11. 30. 21. 27. 46.]

----------------------Step  2 ----------------------

Batch input :
 (32, 5) 
Batch target shape :
 (32, 5)

Batch input :
 [46. 55. 13. 27. 48.] 
Batch target :
 [55. 13. 27. 48. 46.]


In [12]:
print("\n##################### Avec noise = 3 #####################")
i = 0
for batch_inputs, batch_targets in gen_batch(inputs, targets, seq_len=5, batch_size=batch_size, noise=3): #Sequence de 5, batch de 64
    i += 1
    print("\n---------------------- Step ", i, " ----------------------")
    print("\nBatch input :\n", batch_inputs.shape, "\nBatch target shape :\n", batch_targets.shape)
    print("\nBatch input :\n", batch_inputs[0], "\nBatch target :\n", batch_targets[0])
    if i > 1:
        break


##################### Avec noise = 3 #####################

---------------------- Step  1  ----------------------

Batch input :
 (32, 5) 
Batch target shape :
 (32, 5)

Batch input :
 [24. 42. 30. 42. 27.] 
Batch target :
 [11. 30. 21. 27. 46.]

---------------------- Step  2  ----------------------

Batch input :
 (32, 5) 
Batch target shape :
 (32, 5)

Batch input :
 [30. 55. 13. 30. 48.] 
Batch target :
 [55. 13. 27. 48. 46.]


# --> One hot encoding

In [13]:
#Les valeurs au dessus ne nous convienne pas pour entrainer un model il y a mieux.
#On va donc utiliser le one hot encoding pour simplifier la tache à notre model.
#Exemple de one hot encoding : a => 2 => [0, 1, 0, 0]
#Le one hot encoding est tres efficace lorsqu'on veut specifier des classes.
#En effet, il n'y a aucune raison qu'un nombres que nous donnons a un caractere
#ait un nombre plus eleve et donc avec plus de poids qu'un autre alors qu'il n'y
#a aucune hierarchie entre les caracteres.
class OneHot(tf.keras.layers.Layer): #On creer une custom layer OneHot
    
    def __init__(self, depth, **kwargs):
        super(OneHot, self).__init__(**kwargs)
        self.depth = depth
        
    def call(self, x, mask=None):
         #Transforme le x en int 32 et creer un vecteur one hot encoded
        return tf.one_hot(tf.cast(x, tf.int32), self.depth)

In [14]:
class RNNModel(tf.keras.Model):
    
    def __init__(self, vocab_size):
        super(RNNModel, self).__init__()
        self.one_hot = OneHot(len(vocab))
        
    def call(self, inputs):
        output_layer = self.one_hot(inputs)
        return output_layer
    
batch_inputs, batch_targets = next(gen_batch(inputs, targets, seq_len=50, batch_size=batch_size)) #64 sequences, 50 elements
model = RNNModel(len(vocab))
output = model.predict(batch_inputs)[0][0]

print("Input letter :\n", batch_inputs[0][0])
print("Next letter prediction :\n", output)

Input letter :
 24.0
Next letter prediction :
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


# --> Creation du model RNN

In [15]:
vocab_size = len(vocab)

def create_model():
  #Input layer
  #On ne set pas le nombre d'element dans les sequences
  tf_inputs = tf.keras.Input(shape=(None,), batch_size=batch_size)
  #One hot layer
  #En lui passant tf_inputs, on specifie la shape qu'on enverra dans la layer one_hot
  one_hot = OneHot(vocab_size)(tf_inputs) 
  print(one_hot)
  #LSTM layers
  #"return_sequences" permet de specifier que l'on prend en compte plusieurs des anciennes
  #cellules LSTM, si on met a False nous aurions que l'information de la derniere cellule LSTM
  #"stateful" permet de specifier qu'a chaque appel on ne va pas reinitialiser les cellules.
  #A chaque appel l'etat initial sera egal au dernier element de la sequence precedente
  if tpu_strategy is None:
    rnn_layer1 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(one_hot) 
    rnn_layer2 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=True)(rnn_layer1)
  else:
    rnn_layer1 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=False)(one_hot)    #Google Colab TPU
    rnn_layer2 = tf.keras.layers.LSTM(128, return_sequences=True, stateful=False)(rnn_layer1) #Google Colab TPU
  #Dense layer
  hidden_layer = tf.keras.layers.Dense(128, activation="relu")(rnn_layer2)
  #Output layer
  output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")(hidden_layer)
  model = tf.keras.Model(inputs=tf_inputs, outputs=output_layer)
  return model

if tpu_strategy is None: 
  model = create_model() #Local model
else:
  with tpu_strategy.scope(): 
    model = create_model() #TPU Google Colab Model

KerasTensor(type_spec=TensorSpec(shape=(32, None, 57), dtype=tf.float32, name=None), name='one_hot_1/one_hot:0', description="created by layer 'one_hot_1'")


In [16]:
#Reset les cellules du RNN
model.reset_states()

#Creer un premier batch
batch_inputs, target_inputs = next(gen_batch(inputs, targets, seq_len=50, batch_size=batch_size))

#Prediction pour un premier batch
print("Batch input shape : ", batch_inputs.shape)
outputs = model.predict(batch_inputs)
#Prediction de la premiere sortie
first_prediction = outputs[0][0]
print("First prediction :\n", first_prediction)

model.reset_states()

#Deuxieme prediction, c'est exactement la meme grace au stateful=True
outputs = model.predict(batch_inputs)
second_prediction = outputs[0][0]
print("Second prediction :\n", second_prediction)

#Check si les deux predictions sont egales avec un reset_state() entre les deux
assert(set(first_prediction)==set(second_prediction))

Batch input shape :  (32, 50)
First prediction :
 [0.0175288  0.01752265 0.01757541 0.01756948 0.01753305 0.01763293
 0.01752326 0.01750845 0.01749237 0.01763991 0.01756651 0.01748828
 0.01750534 0.01744902 0.01753117 0.01748647 0.017477   0.01747479
 0.01754663 0.01755825 0.01758766 0.01756385 0.01752383 0.01756142
 0.01756185 0.01748588 0.0174942  0.01756116 0.01754531 0.01750287
 0.01755021 0.01748655 0.01758426 0.01754696 0.01755339 0.01763955
 0.01755495 0.01752679 0.01754701 0.01753615 0.01752554 0.01758143
 0.01757491 0.01756363 0.01750428 0.01764829 0.017561   0.0175184
 0.01757929 0.01761205 0.01753645 0.01752489 0.01753761 0.01758648
 0.01750899 0.01756787 0.0175453 ]
Second prediction :
 [0.0175288  0.01752265 0.01757541 0.01756948 0.01753305 0.01763293
 0.01752326 0.01750845 0.01749237 0.01763991 0.01756651 0.01748828
 0.01750534 0.01744902 0.01753117 0.01748647 0.017477   0.01747479
 0.01754663 0.01755825 0.01758766 0.01756385 0.01752383 0.01756142
 0.01756185 0.01748588 0

In [17]:
if tpu_strategy is None:
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
  optimizer = tf.keras.optimizers.Adam(lr=0.001) #lr : learning rate
  train_loss = tf.keras.metrics.Mean(name="train_loss")
  train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
else:
  with tpu_strategy.scope():
    loss_object = tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    optimizer = tf.keras.optimizers.Adam(lr=0.001) #lr : learning rate
    train_loss = tf.keras.metrics.Mean(name="train_loss")
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

In [18]:
@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        #Fait une prediction sur le batch
        predictions = model(inputs)
        #Recupere l'erreur par rapport aux predictions faites
        if tpu_strategy is None:
          loss = loss_object(targets, predictions)
        else:
          loss = tf.reduce_sum(loss_object(targets, predictions))
    #Calcul du gradient
    gradients = tape.gradient(loss, model.trainable_variables)
    #Change les poids du model grace au gradient
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    #Garde l'information sur l'evolution de l'entrainement
    train_loss(loss)
    train_accuracy(targets, predictions)

@tf.function
def predict(inputs):
    # Fait une prediction sur tous le batch
    predictions = model(inputs)
    return predictions

# --> Entrainement du model

In [19]:
model.reset_states()

model.summary()

if tpu_strategy is None:
  for epoch in range(10000):
      #Pendant toute cette etape dans le for, on ne reinitialise pas les states
      for batch_inputs, batch_targets in gen_batch(inputs, targets, seq_len=100, batch_size=batch_size, noise=0): #Sequence de taille 100, batch de 64
          train_step(batch_inputs, batch_targets)
      template = '\r Epoch {}, Train Loss: {}, Train Accuracy: {}'
      print(template.format(epoch, 
                            train_loss.result(), 
                            train_accuracy.result()*100), end="")
      model.reset_states() #On reinitialise le state pour la prochaine epoch
else:
  with tpu_strategy.scope():
    for epoch in range(10000):
        #Pendant toute cette etape dans le for, on ne reinitialise pas les states
        for batch_inputs, batch_targets in gen_batch(inputs, targets, seq_len=100, batch_size=batch_size, noise=0): #Sequence de taille 100, batch de 64
            train_step(batch_inputs, batch_targets)
        template = '\r Epoch {}, Train Loss: {}, Train Accuracy: {}'
        print(template.format(epoch, 
                              train_loss.result(), 
                              train_accuracy.result()*100), end="")
        model.reset_states() #On reinitialise le state pour la prochaine epoch

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(32, None)]              0         
_________________________________________________________________
one_hot_1 (OneHot)           (32, None, 57)            0         
_________________________________________________________________
lstm (LSTM)                  (32, None, 128)           95232     
_________________________________________________________________
lstm_1 (LSTM)                (32, None, 128)           131584    
_________________________________________________________________
dense (Dense)                (32, None, 128)           16512     
_________________________________________________________________
dense_1 (Dense)              (32, None, 57)            7353      
Total params: 250,681
Trainable params: 250,681
Non-trainable params: 0
_______________________________________________________

KeyboardInterrupt: 

# --> Sauveguarde du model

In [None]:
import random

model.reset_states()

size_poetries = 300

poetries = np.zeros((batch_size, size_poetries, 1))
sequences = np.zeros((batch_size, 100))
for b in range(batch_size):
    rd = np.random.randint(0, len(inputs) - 100)
    sequences[b] = inputs[rd:rd+100]

for i in range(size_poetries+1):
    if i > 0:
        poetries[:,i-1,:] = sequences
    softmax = predict(sequences)
    # Set the next sequences
    sequences = np.zeros((batch_size, 1))
    for b in range(batch_size):
        argsort = np.argsort(softmax[b][0])
        argsort = argsort[::-1]
        # Select one of the strongest 4 proposals
        sequences[b] = argsort[0]

for b in range(batch_size):
    sentence = "".join([int_to_vocab[i[0]] for i in poetries[b]])
    print(sentence)
    print("\n=====================\n")