## Shrek script generator

In [1]:
import keras
from keras import layers

import numpy as np
import random
import io

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Analysis and data preparation


In [3]:
!pip install pymupdf
import fitz  # PyMuPDF

def read_pdf(file_path):
    document = fitz.open(file_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text

# Ścieżka do pliku PDF
file_path = '/content/drive/MyDrive/WUM/shrek_generator/shrek-script.pdf'
text = read_pdf(file_path)

start_index = text.find("Once upon a time there was a lovely")
end_index = text.rfind("THE END")

text = text[start_index:end_index]
print(f'Długość tekstu po przycięciu to {len(text)}')

unique_characters = sorted(set(text))

number_of_unique_characters = len(unique_characters)

(len(text), number_of_unique_characters, unique_characters[:10])

Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from pymupdf)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.6 pymupdf-1.24.7
Długość tekstu po przycięciu to 71747


(71747, 78, ['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-'])

In [4]:
char_to_idx = dict((c, i) for i, c in enumerate(unique_characters))
idx_to_char = dict((i, c) for i, c in enumerate(unique_characters))

In [5]:
maxlen = 100
step = 8
input_sequences= []
output_chars = []
for i in range(0, len(text) - maxlen, step):
    input_sequences.append(text[i:i+maxlen] )
    output_chars.append( text[i+maxlen])
print("Number of sequences:", len(input_sequences))

Number of sequences: 8956


In [6]:
print(input_sequences[10], output_chars[10])
print("--------------")
print(input_sequences[11], output_chars[11])
print("--------------")
print(input_sequences[20], output_chars[20])
print(input_sequences[-1])

her of a fearful 
sort which could only be broken by 
love's first kiss. She was locked 
away in a c a
--------------
 fearful 
sort which could only be broken by 
love's first kiss. She was locked 
away in a castle gu a
--------------
 locked 
away in a castle guarded by a 
terrible fire-breathing dragon. 
Many brave knights had atte m
(as he's done singing and 
we fade to black) Oh, 
that's funny. Oh. Oh. I 
can't
breathe. I can't br


**Creating one-hot encoded vectors.**

In [7]:
x = np.zeros((len(input_sequences), maxlen, len(unique_characters)))
y = np.zeros((len(input_sequences), len(unique_characters)))
for i, sentence in enumerate(input_sequences):
    for j, char in enumerate(sentence):
      x[i, j, char_to_idx[char]] = 1
    y[i, char_to_idx[output_chars[i]]] = 1

In [8]:
x

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

## Building LSTM model

In [13]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

model = Sequential([
    LSTM(256, return_sequences=True, input_shape=(maxlen, len(unique_characters))),
    Dropout(0.2),
    LSTM(256, return_sequences=False),
    Dropout(0.2),
    Dense(len(unique_characters), activation='softmax')
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 100, 256)          343040    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_3 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 78)                20046     
                                                                 
Total params: 888398 (3.39 MB)
Trainable params: 888398 (3.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Model training.**

In [14]:
from keras.optimizers import Adam

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [15]:
import random
epochs = 25
batch_size = 32
generated_len=200


for epoch in range(epochs):
    model.fit(x, y, batch_size=batch_size, epochs=1)

    print("Epoch: %d" % epoch)

    generated = ""
    random_id = random.randint(0, len(text)-maxlen)
    sentence = text[random_id: random_id + maxlen]# choose randomly starting index and find part of a text of length "max_len"
    print('-------START :"' + sentence + '"')


    for i in range(generated_len):
            x_pred = np.zeros((1, maxlen, len(unique_characters)))
            for j, char in enumerate(sentence):
              x_pred[0, j, char_to_idx[char]] = 1

            preds = model.predict(x_pred, verbose=0)[0]

            next_index = np.argmax(preds) #find the index of the next character
            next_char = idx_to_char[next_index] # change index to character
            sentence = sentence[1:] + next_char #remove first character and add last
            generated += next_char

    print("Generate: ", generated)


Epoch: 0
-------START :"e 
way)
The arrow flies toward Donkey who jumps into Shrek's arms to 
get out of the way. The arrow "
Generate:                                                                                                                                                                                                          
Epoch: 1
-------START :"hat'll take longer. We can 
keep going.
FIONA
But there's robbers in the woods.
DONKEY
Whoa! Time ou"
Generate:   the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Epoch: 2
-------START :"
those fairy tale creatures!
FARQUAAD
Indeed. All right, ogre. I'll make 
you a deal. Go on this que"
Generate:   the 
I I the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe the 
I Nhe t