In [1]:
import re

import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.layers import LSTM,Dense,Embedding,Dropout,GRU

### 1. Load Shakespeare book

This is a large corpus of text. For realistic text generation, it is highly recommended to train on at least one source of a million characters or more.

The text is well structured. It is in the form:

$Character_X$: $Speech_X$



In [2]:
# Read the whole corpus into memory as one string.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [3]:
text[:100]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mi"

### 2. Preprocessing

- Remove big spaces
- Character encoding: assign a number to each character
  - Create a first lookup that maps a numerical index to a character
  - Create a second lookup that maps a character to a numerical index

#### 2.1 Remove big spaces

In [4]:
# Whitespace normalization is currently disabled, so `text` keeps its original
# line breaks and indentation. Uncomment the next line to collapse every run of
# whitespace into a single space:
# text = re.sub(r"\s+"," ", text)

# Preview the first 100 characters of the corpus.
text[:100]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mi"

#### 2.2 Create the mapping dicts

In [5]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
vocab_size = len(vocab)

print(vocab_size, vocab[:6])

# Forward lookup (character -> integer id): the id is the position in `vocab`.
char_to_index = dict(zip(vocab, range(vocab_size)))
# Reverse lookup (integer id -> character), stored as a NumPy array so that a
# whole array of ids can be decoded at once with array indexing.
index_to_char = np.array(vocab)

84 ['\n', ' ', '!', '"', '&', "'"]


#### 2.3 Encode the text

In [6]:
# Integer-encode the whole corpus: one vocabulary id per character.
encoded_text = np.array(list(map(char_to_index.__getitem__, text)))

# Show the same slice of the corpus as raw characters and as ids.
start, end = 52, 148
preview = slice(start, end)
print(f"Original text:\n{text[preview]}\n")
print(f"Encoded text: \n{encoded_text[preview]}\n")


Original text:
desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time

Encoded text: 
[59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63 56 75  1
 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60  1 68 64
 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1 56 74  1
 75 63 60  1 73 64 71 60 73  1 74 63 70 76 67 59  1 57 80  1 75 64 68 60]



## 3. Create batches

1. Find a proper length for the sequence
  - Too short a sequence: we will not have enough information
  - Too long a sequence: training will take too long and risks overfitting

3.1 Length of the sequence

In [7]:
# Number of characters in each input sequence.
# A data-driven alternative (around 3 comma-separated clauses) would be:
#   int(np.mean([len(sentence) for sentence in text.split(',')])) * 3
seq_len = 120
# Each chunk uses seq_len + 1 characters: the input and the target are the same
# chunk shifted by one character, so one extra character is needed per chunk.
total_num_seq = len(text) // (seq_len + 1)

# Display both values as the cell output.
seq_len, total_num_seq

(120, 45005)

3.2 Creating sequences

In [8]:
# batch_size: number of (input, target) sequence pairs per training batch.
# buffer_size: number of elements held in the shuffle buffer. Shuffling draws
# from this buffer rather than from the whole dataset, which keeps memory use
# bounded when dealing with a big dataset.
batch_size, buffer_size = 128, 10000

def create_seq_targets(seq):
  """Split one chunk into an (input, target) pair offset by one position.

  The input is the chunk without its last element and the target is the chunk
  without its first element, so target[i] is the element following input[i].

  Example:
    seq:        Hello my name is Celia
    input_txt:  Hello my name is Celi
    target_txt: ello my name is Celia

  Args:
    seq: any sliceable sequence (string, list, tensor, ...).

  Returns:
    A tuple (input_txt, target_txt).
  """
  return seq[:-1], seq[1:]

def display_xy(dataset):
  """Print the first (input, target) element of `dataset`, raw and decoded.

  For each of the two tensors this prints the tensor itself, its Python type,
  and the text obtained by mapping its ids through `index_to_char`.

  NOTE(review): the decoding joins the looked-up characters directly, so this
  presumably expects elements that are 1-D id sequences (i.e. the dataset
  before batching) - confirm before calling it on batched data.

  Args:
    dataset: a dataset whose elements are (input ids, target ids) pairs.
  """
  for X, y in dataset.take(1):
    # Same three lines for both tensors; only the label and the amount of
    # trailing blank space differ.
    for label, tensor, trailer in (("Input_X", X, "\n\n"), ("Output_y", y, "\n")):
      decoded = ''.join(index_to_char[tensor.numpy()])
      print(f"{label}:\n{tensor}\n")
      print(f"type({label}):{type(tensor)}\n")
      print(f"{label}:\n{decoded}{trailer}")


In [9]:
# Creating training sequences

# Turn the encoded text vector into a stream of character indices.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

print(f"type(char_dataset): {type(char_dataset)}")

# Peek at the first four character ids of the stream.
for item in char_dataset.take(4):
  print(item.numpy())

# Group the stream into chunks of seq_len + 1 characters.
# drop_remainder=True drops the last chunk when it has fewer elements than
# requested; the default behavior is to keep the smaller chunk.
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

type(char_dataset): <class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
0
1
1
1


3.3 Creating input seq and target output

In [10]:
# Turn every (seq_len + 1)-character chunk into an (input, target) pair in
# which the target is the input shifted by one character.
dataset = sequences.map(create_seq_targets)

# Uncomment to inspect one decoded (input, target) pair:
#display_xy(dataset)

3.4 Create the batches

In [11]:
# Build the training batches directly from `sequences` instead of from the
# previous value of `dataset`. This keeps the cell idempotent: re-running it
# yields the same (batch_size, seq_len) batches rather than batching data that
# has already been batched.
dataset = (
  sequences
  .map(create_seq_targets)
  .shuffle(buffer_size)
  .batch(batch_size, drop_remainder=True)
)

# Each element of the dataset is a tuple:
#   element 1: input  (batch_size, seq_length)
#   element 2: target (batch_size, seq_length)
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

## 4. Creating the model

Use sparse categorical crossentropy when the targets are integer class indices (as here, where each target is a single character id). Use categorical crossentropy when the targets are one-hot vectors or soft probabilities (like [0.5, 0.3, 0.2]). For mutually exclusive classes both compute the same loss; they only differ in how the labels are encoded.

In [12]:
# Number of distinct characters in the corpus (this is a character-level model).
vocab_size = len(vocab)

# Dimension of the character embedding vectors (kept smaller than vocab_size).
embed_dim = 64

# Number of units in the recurrent (GRU) layer.
# NOTE(review): 1026 looks like a typo for 1024, but the weights file loaded
# later in the notebook was presumably trained with this value - confirm before
# changing it.
rnn_neurons = 1026

# Number of training epochs.
epochs = 5

vocab_size

84

#### PARAMETERS OF THE EMBEDDING LAYER ---

**input_dim** = the vocab size. Here it is the number of unique characters in the vocab.

**output_dim** = the number of dimensions we wish to embed into. Each character will be represented by a vector of this many dimensions.

**input_length** = length of the input sequences (for documents, the length of the longest one, which can be stored in `max_len`). It is left unspecified here so that the model accepts sequences of any length.

In our case:

`dataset = <BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>`

In [13]:
def create_model(vocab_size, embed_dim, batch_size, rnn_units=None):
  """Build and compile the character-level GRU language model.

  Architecture: Embedding -> stateful GRU (returns the full sequence) -> Dense
  producing one unnormalized score (logit) per vocabulary entry at every step.

  Args:
    vocab_size: number of distinct characters; used as both the embedding input
      size and the size of the output layer.
    embed_dim: dimension of the character embedding vectors.
    batch_size: fixed batch size of the model. The GRU is stateful, so the
      batch dimension is part of the input shape.
    rnn_units: number of GRU units. When None (the default), the notebook-level
      `rnn_neurons` value is used, which is what the function always did before
      this parameter existed.

  Returns:
    The compiled Sequential model.
  """
  if rnn_units is None:
    rnn_units = rnn_neurons

  model = Sequential()
  # The sequence length is left as None so any length can be fed in.
  model.add(Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]))
  model.add(GRU(rnn_units,
                return_sequences=True,
                stateful=True,
                recurrent_initializer='glorot_uniform'))
  # Final dense prediction layer: no activation, so the outputs are logits.
  model.add(Dense(vocab_size))
  # A loss object is used instead of a lambda so that the compiled model can be
  # saved and reloaded; from_logits=True matches the activation-free output.
  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
  return model

In [14]:
# Build the training model. Its batch dimension is fixed to `batch_size`
# because create_model bakes the batch size into the input shape.
model = create_model(vocab_size, embed_dim, batch_size)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


## Fit the model

In [15]:
for input_example_batch, target_example_batch in dataset.take(1):

  # Run the (still untrained) model on one shuffled batch.
  example_batch_predictions = model(input_example_batch)
  predictions_shape = example_batch_predictions.shape

  # Show the shape of the whole batch of predictions.
  print(predictions_shape, " <=== (batch_size, sequence_length, vocab_size)")
  # Shape for a single sequence: one logit per character at every time step.
  print(predictions_shape[1:])

(128, 120, 84)  <=== (batch_size, sequence_length, vocab_size)
(120, 84)


In [16]:
# Draw one character id per time step for the first sequence of the batch,
# treating the model outputs as unnormalized log-probabilities (logits).
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.random.categorical(first_sequence_logits, num_samples=1)

# sampled_indices has one column per drawn sample (here a single one); flatten
# it to a 1-D array of ids, then decode the ids into characters.
sampled_ids = tf.reshape(sampled_indices, [-1]).numpy()
alea_char = index_to_char[sampled_ids]

In [17]:
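# Training is disabled here: the cells below load weights from 'shakespeare_gen.h5' instead.
# Uncomment to train the model from scratch:
# model.fit(dataset, epochs=epochs)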

In [18]:
# Save the model 
# model.save("shakespeare_gen.h5")

In [19]:
# Rebuild the same architecture with batch_size=1 so that text can be generated
# one sequence at a time (the training model is fixed to batches of `batch_size`).

model2 = create_model(vocab_size, embed_dim, batch_size=1) 
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_1 (Dense)              (1, None, 84)             86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Load the trained weights from disk into the batch-size-1 model.
# NOTE(review): the training and saving cells are commented out, so this file
# must already exist in the working directory for a fresh "Run All" to succeed.
model2.load_weights('shakespeare_gen.h5')
# Declare the input shape: a batch of 1 with a variable sequence length.
model2.build(tf.TensorShape([1, None]))

model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_1 (Dense)              (1, None, 84)             86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [21]:
def generate_text(model,
                  start_text,
                  generate_size=500,
                  temperature=1.0):
  """Generate text one character at a time, continuing `start_text`.

  The seed is fed to the model once; afterwards each sampled character is fed
  back as the next input, relying on the model's recurrent state to carry the
  context.

  Args:
    model: the trained model built for a batch size of 1. Its recurrent state
      is reset before generation starts.
    start_text: non-empty seed string. Every character must be a key of
      `char_to_index` (a KeyError is raised otherwise).
    generate_size: number of characters to generate after the seed.
    temperature: strictly positive divisor applied to the logits before
      sampling. Values below 1 sharpen the distribution (more predictable
      text); values above 1 flatten it (more surprising text).

  Returns:
    `start_text` followed by the generated characters, as a single string.

  Raises:
    ValueError: if `start_text` is empty or `temperature` is not positive.
  """
  # Fail early with a clear message instead of an obscure tensor error later.
  if not start_text:
    raise ValueError("start_text must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be strictly positive")

  # Encode the seed and add a leading batch dimension of size 1.
  # (Named `input_ids` so the builtin `input` is not shadowed.)
  input_ids = tf.expand_dims([char_to_index[s] for s in start_text], 0)

  # Start from a clean recurrent state so earlier calls do not leak in.
  model.reset_states()

  text_generated = []
  for _ in range(generate_size):
    # Drop the batch dimension, leaving one row of logits per time step.
    predictions = tf.squeeze(model(input_ids), 0)
    predictions = predictions / temperature

    # Sample for every time step and keep the draw for the last one.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # The sampled character becomes the only input of the next step.
    input_ids = tf.expand_dims([predicted_id], 0)
    text_generated.append(index_to_char[predicted_id])

  return f"{start_text}{''.join(text_generated)}"


In [22]:
# Generate 100 characters continuing the seed 'JULIET' (default temperature).
generate_text(model2, 'JULIET', generate_size=100)

"JULIETTIMER. There's old Nor God-here purpose; and what try\n    For this exceptance gave away myself, I\n  "