In [None]:
import os
import glob
import random
import shutil
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import models, layers
from tqdm import tqdm

In [None]:
# Where the text files are going to live.
# "all" holds every selected file; "train" and "valid" hold the two splits.
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# Use at most 30 files.
file_number = 30

# Gather the corpus if it has not been gathered yet.
if not os.path.exists(dataset_path):

    # Create all the folders.
    for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
        if not os.path.exists(path):
            os.mkdir(path)

    # Clone the repo.
    !git clone https://github.com/jblazzy/LOTR.git

    # Find all the files.
    paths_all = glob.glob("LOTR/*.txt")
    print(sorted(paths_all))

    # Do not use all.
    paths_all = paths_all[:file_number]

    # Split 80/20.
    split_index = int(len(paths_all) * 0.8)
    paths_train = paths_all[:split_index]
    paths_valid = paths_all[split_index:]

    # Copy files.
    def copy(paths, destination):
        for path in paths:
          book = ((path.split("/")[1]).split("-")[0])
          chap = ((path.split("/")[-1]))
          shutil.copy2(path, destination)
          os.rename(os.path.join(destination, chap), os.path.join(destination, book + chap))
    copy(paths_all, dataset_path_all)
    copy(paths_train, dataset_path_train)
    copy(paths_valid, dataset_path_valid)

    # Delete repo.
    !rm -rf LOTR

    # Done.
    print("Corpus downloaded.")



Cloning into 'LOTR'...
remote: Enumerating objects: 437, done.[K
remote: Total 437 (delta 0), reused 0 (delta 0), pack-reused 437[K
Receiving objects: 100% (437/437), 38.23 MiB | 13.08 MiB/s, done.
Resolving deltas: 100% (249/249), done.
['LOTR/fellowship.txt', 'LOTR/hobbit.txt', 'LOTR/negative-words.txt', 'LOTR/positive-words.txt', 'LOTR/return.txt', 'LOTR/silmarillion.txt', 'LOTR/twotowers.txt']
Corpus downloaded.


In [None]:
# Sanity check: list the dataset root and each of its three sub-folders,
# separated by blank lines, then show the current working directory.
!ls dataset
print("")
!ls dataset/all
print("")
!ls dataset/train
print("")
!ls dataset/valid
!pwd

all  train  valid

fellowship.txtfellowship.txt  positivepositive-words.txt	twotowers.txttwotowers.txt
hobbit.txthobbit.txt	      return.txtreturn.txt
negativenegative-words.txt    silmarillion.txtsilmarillion.txt

fellowship.txt	hobbit.txt  return.txt	silmarillion.txt

twotowers.txt
/content


# Prepare Datasets

In [None]:
batch_size = 32
seed = 42

def create_dataset(path):
  """Build a batched, unlabeled text dataset from the files under `path`.

  Batch size and seed come from the module-level `batch_size` and `seed`.
  """
  loader_options = dict(labels=None, batch_size=batch_size, seed=seed)
  return preprocessing.text_dataset_from_directory(path, **loader_options)

# One dataset per folder: the full corpus, the training split, the validation split.
dataset_original_all = create_dataset(dataset_path_all)
dataset_original_train = create_dataset(dataset_path_train)
dataset_original_valid = create_dataset(dataset_path_valid)


Found 7 files belonging to 1 classes.
Found 4 files belonging to 1 classes.
Found 1 files belonging to 1 classes.


In [None]:
# Peek at the corpus: for each batch, show the first 200 bytes and the total
# size of up to four documents.
for text_batch in dataset_original_all:
  for raw_text in text_batch[:4]:
    raw_bytes = raw_text.numpy()
    print(raw_bytes[:200], "...")
    print(len(raw_bytes), "bytes")
    print("")

b'Chapter 11. A Knife in the Dark \n\n\nAs they prepared for sleep in the inn at Bree, darkness lay on \nBuckland; a mist strayed in the dells and along the river -bank. The house at \nCrickhollow stood sile' ...
51113 bytes

b'Chapter 10\n\n\nTHE VOICE OF SARUMAN \n\n\nThey passed through the ruined tunnel and stood upon a \nheap of stones, gazing at the dark rock of Orthanc, and its \nmany windows, a menace still in the desolation' ...
51916 bytes

b'\nOf the Return of the Noldor \n\nIt has been told that Feanor and his sons came first of \nthe Exiles to Middle-earth, and landed in the waste of \nLammoth, the Great Echo, upon the outer shores of the \nF' ...
30877 bytes

b'\n\nOVER HILL AND UNDER HILL \n\n\nThere were many paths that led up into those mountains, and many passes over them. But most of the \npaths were cheats and deceptions and led nowhere or to bad ends; and m' ...
22601 bytes

b'\n\n\nNow all roads were running together to the East to meet the coming of \nwar and the

In [None]:
vocab_size = 21000

# Word-level tokenizer: lower-cases, strips punctuation, splits on whitespace
# and maps each word to an integer id, keeping at most `vocab_size` tokens.
encoder = layers.TextVectorization(
    max_tokens = vocab_size,
    standardize = "lower_and_strip_punctuation",
    split = "whitespace",
    output_mode = "int"
)

# Learn the vocabulary from the complete corpus.
encoder.adapt(dataset_original_all)

vocab = encoder.get_vocabulary()

print(f"Vocab Size: {len(vocab)}")
# Show only the head of the vocabulary; printing every entry would flood the
# cell output with thousands of tokens.
print(f"Vocab sample: {vocab[:20]}")


Vocab Size: 21000


In [None]:
sequence_length = 64

def create_dataset_for_autoregression(dataset):
  """Turn batches of raw documents into (context window, next token) pairs.

  Every document is encoded with the module-level `encoder` and left-padded
  with `sequence_length` padding tokens, then a window of `sequence_length`
  token ids is slid over it one token at a time. The token that follows each
  window is its target.

  Args:
    dataset: iterable of batches of raw text documents.

  Returns:
    A `tf.data.Dataset` of `(x, y)` pairs, where `x` holds `sequence_length`
    token ids and `y` is the id of the token that follows them.
  """
  # Loop-invariant, so looked up once instead of once per document.
  padding_token_id = vocab.index("")
  padding = [padding_token_id] * sequence_length

  x_inputs = []
  y_outputs = []
  for samples in dataset:  # One batch of raw documents.
    samples = encoder(samples).numpy()
    for sample in tqdm(samples):  # One encoded document.
      # The encoder pads every document in a batch to the length of the
      # longest one. Drop those padding ids, otherwise shorter documents
      # contribute windows whose target is the padding token.
      tokens = list(sample[sample != padding_token_id])
      sample = padding + tokens

      # Map to input/output pairs.
      for start_index in range(len(sample) - sequence_length):
        x_inputs.append(sample[start_index:start_index + sequence_length])
        y_outputs.append(sample[start_index + sequence_length])

  return tf.data.Dataset.from_tensor_slices((x_inputs, y_outputs))

dataset_train = create_dataset_for_autoregression(dataset_original_train)
dataset_valid = create_dataset_for_autoregression(dataset_original_valid)

100%|██████████| 32/32 [00:01<00:00, 25.23it/s]
100%|██████████| 32/32 [00:02<00:00, 11.56it/s]
100%|██████████| 15/15 [00:00<00:00, 58.39it/s]
100%|██████████| 20/20 [00:01<00:00, 19.43it/s]


In [None]:
def decode(indices, vocabulary=None):
    """Turn token ids back into a space-separated string, skipping empty tokens.

    Args:
        indices: iterable of integer token ids.
        vocabulary: optional sequence mapping id -> token. Defaults to the
            module-level `vocab`.

    Returns:
        The tokens joined by single spaces; ids that map to "" are left out.
    """
    if vocabulary is None:
        vocabulary = vocab
    tokens = (vocabulary[index] for index in indices)
    # Compare by value: `is not ""` tests object identity, which only works by
    # accident of string interning and triggers a SyntaxWarning.
    return " ".join(token for token in tokens if token != "")

# Show the first 20 training pairs, both as raw ids and decoded.
# The loop variables are not called `input`/`output`: at notebook top level
# they become globals, and `input` would replace the builtin of that name.
for input_ids, target_id in dataset_train.take(20):
    print("input: ", " ".join([str(x) for x in input_ids.numpy()]))
    print("output:", target_id.numpy())
    print("input decoded: ", decode(input_ids))
    print("output decoded:", decode([target_id]))
    print("")

  return " ".join([vocab[index] for index in indices if vocab[index] is not ""])


input:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
output: 4
input decoded:  
output decoded: of

input:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4
output: 2
input decoded:  of
output decoded: the

input:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 2
output: 1534
input decoded:  of the
output decoded: sindar

input:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 2 1534
output: 36
input decoded:  of the sindar
output decoded: now

input:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 2 1534 36
output: 20
input decoded:  of the sindar now
output decoded: as

input:  0 0 0 0 0 0 0 0 0 0 0

In [None]:
import matplotlib.pyplot as plt

def render_history(history):
    """Plot training vs. validation curves, one figure each for loss and accuracy.

    Args:
        history: object whose `history` dict holds the series "loss",
            "val_loss", "accuracy" and "val_accuracy".
    """
    panels = (
        ("Training loss vs. validation loss", "loss"),
        ("Training accuracy vs. validation accuracy", "accuracy"),
    )
    for title, metric in panels:
        plt.title(title)
        # Training series first, then its validation counterpart.
        for series in (metric, "val_" + metric):
            plt.plot(history.history[series], label=series)
        plt.legend()
        plt.show()
        plt.close()

# LSTM

In [None]:

embedding_size = 128

# Next-word model: word embeddings, two stacked LSTMs with dropout in between,
# and a softmax over the whole vocabulary.
model = models.Sequential([
    layers.Embedding(vocab_size, embedding_size, input_length=sequence_length),
    layers.Dropout(0.3),
    layers.LSTM(512, return_sequences=True),
    layers.Dropout(0.4),
    layers.LSTM(1024),
    layers.Dropout(0.5),
    layers.Dense(vocab_size, activation="softmax"),
])
model.summary()

# Targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train on shuffled batches of 128; validate on batches of 512.
history = model.fit(
    dataset_train.shuffle(10000).batch(128),
    validation_data=dataset_valid.batch(512),
    epochs=3,
)

render_history(history)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 128)           2688000   
                                                                 
 dropout (Dropout)           (None, 64, 128)           0         
                                                                 
 lstm (LSTM)                 (None, 64, 512)           1312768   
                                                                 
 dropout_1 (Dropout)         (None, 64, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 1024)              6295552   
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 21000)             2

KeyboardInterrupt: ignored

In [None]:
import numpy as np

def generate(model, seed_text, generated_sequence_length, temperature):
    """Generate text with `model`, starting from `seed_text`, and print it.

    Args:
        model: trained next-token model; `model.predict` is called with one
            window of `sequence_length` token ids and must return one
            probability vector per input row.
        seed_text: text to start from, encoded with the module-level `encoder`.
        generated_sequence_length: total number of tokens in the result, seed
            tokens included. If the seed already has that many tokens, nothing
            is generated.
        temperature: forwarded to `get_index_from_prediction`.

    Returns:
        None. The decoded text is printed, followed by an empty line.
    """
    seed_tokens = list(encoder(seed_text).numpy())
    generated_sequence = list(seed_tokens)

    # The model context is exactly `sequence_length` tokens. Keep only the
    # most recent tokens of a longer seed, and left-pad a shorter one with the
    # padding id 0.
    input_sequence = seed_tokens[-sequence_length:]
    input_sequence = [0] * (sequence_length - len(input_sequence)) + input_sequence

    # Generate the sequence by repeatedly predicting.
    while len(generated_sequence) < generated_sequence_length:
        # verbose=0: no progress bar for every single generated token.
        prediction = model.predict(np.expand_dims(input_sequence, axis=0), verbose=0)
        predicted_index = get_index_from_prediction(prediction[0], temperature)
        generated_sequence.append(predicted_index)
        # Slide the window: drop the oldest token, append the newest.
        input_sequence = input_sequence[1:]
        input_sequence.append(predicted_index)

    # Convert the generated sequence to a string.
    text = decode(generated_sequence)
    print(text)
    print("")


def get_index_from_prediction(prediction, temperature=0.0):
    """Pick a token index from a probability vector.

    Args:
        prediction: 1-D sequence of probabilities, one per candidate index.
        temperature: 0.0 selects the most likely index. Any other value
            rescales the distribution as p ** (1 / temperature), renormalizes
            it and draws one random sample from the result.

    Returns:
        The selected index as a NumPy integer.
    """

    # Zero temperature - use the argmax.
    if temperature == 0.0:
        return np.argmax(prediction)

    # Non-zero temperature - sample from the temperature-scaled distribution.
    prediction = np.asarray(prediction).astype('float64')
    # log(0) = -inf is expected for zero-probability entries; they end up with
    # probability 0 again after exp(), so the warning is just noise.
    with np.errstate(divide="ignore"):
        scaled = np.log(prediction) / temperature
    # Shift so the largest value is 0 before exponentiating. Without this, a
    # low temperature makes every exp() underflow to 0 and the normalization
    # becomes 0 / 0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_prediction = np.exp(scaled)
    prediction = exp_prediction / np.sum(exp_prediction)
    probabilities = np.random.multinomial(1, prediction, 1)
    return np.argmax(probabilities)


# Try the freshly trained model: 16 tokens in total (the seed words count
# towards that length) at temperature 0.2.
generate(model, "Gandalf the wizard", 16, temperature=0.2)

gandalf the wizard and the ring of the world and the ring of the world and



In [None]:
# Save the trained model to ./models/model.
model.save("./models/model")

In [None]:
from tensorflow.keras.models import load_model

# Load the saved model back from ./models/model under a separate name.
generation_model = load_model("./models/model")

In [None]:
# Generate with the reloaded model from a longer seed; 20 is the total token
# count including the seed, at temperature 0.4.
# NOTE(review): the recorded output of this cell is a NameError, which suggests
# it was run in a session where `generate` (or one of the names it uses) was
# not defined - confirm by rerunning the notebook top to bottom.
generate(generation_model, "Gandalf met with frodo and sam on their way to Mordor", 20, temperature=0.4)

NameError: ignored