<a href="https://colab.research.google.com/github/lillaszulyovszky/data-science-retreat-lectures/blob/main/Language_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import glob
import random
import shutil
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras import models, layers
from tqdm import tqdm


In [2]:
# Download the dataset

In [3]:
!rm -rf dataset

# where the text files are going to live
dataset_path = "dataset"
dataset_path_all = os.path.join(dataset_path, "all")
dataset_path_train = os.path.join(dataset_path, "train")
dataset_path_valid = os.path.join(dataset_path, "valid")

# just use 20 files
file_number = 20

# gather the corpus if it has not been gathered yet
if not os.path.exists(dataset_path):

  # create all folders
  for path in [dataset_path, dataset_path_all, dataset_path_train, dataset_path_valid]:
    if not os.path.exists(path):
      os.mkdir(path)

  # clone the repo
  !git clone https://github.com/vilmibm/lovecraftcorpus

  # find all the files
  paths_all = glob.glob(os.path.join("lovecraftcorpus/*.txt"))

  # do not use all
  paths_all = paths_all[:file_number]

  # split 80/20
  split_index = int(len(paths_all) * 0.8)
  paths_train = paths_all[:split_index]
  paths_valid = paths_all[split_index:]

   # comment files
  def lilla_copy(paths, destination):
    for path in paths:
      shutil.copy2(path, destination)
  lilla_copy(paths_all, dataset_path_all)
  lilla_copy(paths_train, dataset_path_train)
  lilla_copy(paths_valid, dataset_path_valid)

  # delete repo
  !rm -rf lovecraftcorpus

  # done
  print("Corpus downloaded.")


Cloning into 'lovecraftcorpus'...
remote: Enumerating objects: 70, done.[K
remote: Total 70 (delta 0), reused 0 (delta 0), pack-reused 70[K
Unpacking objects: 100% (70/70), done.
Corpus downloaded.


In [4]:
batch_size = 32
seed = 42

def create_dataset(dataset_path):
  dataset = preprocessing.text_dataset_from_directory(
      dataset_path,
      labels=None,
      batch_size=batch_size,
      seed=seed
  )
  return dataset



dataset_original_all = create_dataset(dataset_path_all)
dataset_original_train = create_dataset(dataset_path_train)
dataset_original_valid = create_dataset(dataset_path_valid)


Found 20 files belonging to 1 classes.
Found 16 files belonging to 1 classes.
Found 4 files belonging to 1 classes.


In [5]:
dataset_original_all

<BatchDataset shapes: (None,), types: tf.string>

In [6]:
#dataset_original_all = pd.DataFrame(dataset_original_all)

In [7]:
#dataset_original_all

In [8]:
for batch in dataset_original_all:
  for sample in batch[:4]:
    sample = sample.numpy()
    print(sample[:200])

b'THE CRAWLING CHAOS\n\nOf the pleasures and pains of opium much has been written. The ecstasies and horrors of De Quincey and the paradis artificiels of Baudelaire are preserved and interpreted with an a'
b'THE THING ON THE DOORSTEP\n\nI\n\nIt is true that I have sent six bullets through the head of my best friend, and yet I hope to show by this statement that I am not his murderer. At first I shall be calle'
b'HERBERT WEST: REANIMATOR\n\nTo be dead, to be truly dead, must be glorious. There are far worse things awaiting man than death.--Count Dracula\n\nContents\n\nPart I: From the Dark\nPart II: The Plague-Daemon'
b'HE\n\nI saw him on a sleepless night when I was walking desperately to save my soul and my vision. My coming to New York had been a mistake; for whereas I had looked for poignant wonder and inspiration '


In [9]:
# create the vectorizer


In [10]:
vocabulary_size = 10000
encoder = layers.TextVectorization(
    max_tokens=vocabulary_size,
    standardize=None,
    split="whitespace",
    ngrams=5,
    output_mode="int"
)
# learning
encoder.adapt(dataset_original_all)

vocabulary = encoder.get_vocabulary()
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Vocabulary: {vocabulary}")


Vocabulary size: 10000


In [11]:
# create the dataset for autoregression


In [12]:
sequence_length = 32

def create_dataset_for_autoregression(dataset):
  x_inputs = []
  y_outputs = []
  for batch in dataset:
    batch = encoder(batch).numpy()
    for sample in tqdm(batch):
        # pad at the beginning
        padding_token_id = 0
        padding = [padding_token_id] * sequence_length
        sample = padding + list(sample)

        # map all to input output pairs
        for start_index in range(0, len(sample) - sequence_length):
          x = sample[start_index:start_index + sequence_length]
          y = sample[start_index + sequence_length]
          if y == 0:
            break
          x_inputs += [x]
          y_outputs += [y]

  return tf.data.Dataset.from_tensor_slices((x_inputs, y_outputs))

dataset_train = create_dataset_for_autoregression(dataset_original_train)
dataset_valid = create_dataset_for_autoregression(dataset_original_valid)

100%|██████████| 16/16 [00:01<00:00,  9.92it/s]
100%|██████████| 4/4 [00:00<00:00,  5.04it/s]


In [13]:
dataset_train

<TensorSliceDataset shapes: ((32,), ()), types: (tf.int32, tf.int32)>

In [14]:
dataset_valid

<TensorSliceDataset shapes: ((32,), ()), types: (tf.int32, tf.int32)>

In [15]:
encoder.get_vocabulary()[1928]

'man who'

In [16]:
for input, output in dataset_train.take(10):
  print("Input:" " ".join([str(x) for x in input.numpy()]))
  print("Output:", output.numpy())
  print("")

0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0
Output: 1262

0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 1262
Output: 1

0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 1262Input: 1
Output: 1

0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 0Input: 

In [None]:
embedding_size = 128

model = models.Sequential()

#don't need textvectorizer anymore
model.add(layers.Embedding(vocabulary_size, embedding_size, input_length=sequence_length))
model.add(layers.LSTM(512, return_sequences=True))
model.add(layers.Dropout(0.4))
model.add(layers.LSTM(1024))#should be bigger than the first LSTM
model.add(layers.Dropout(0.5)) #grows from layer to layer
model.add(layers.Dense(vocabulary_size, activation="softmax")) # classifier that picks one word from 

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    dataset_train.shuffle(10000).batch(512), # batch size for speeding up things and rendering?
    validation_data=dataset_valid.batch(512),
    epochs=5
)


render_history(history)

model.summary()

Epoch 1/5
Epoch 2/5

In [None]:
import matplotlib.pyplot as plt

def render_history(history):
  plt.title("Losses")
  plt.plot(history.history["loss"], label="loss")
  plt.plot(history.history["val_loss"], label="val_loss")
  plt.legend()
  plt.show()
  plt.close()

  plt.title("Accuracies")
  plt.plot(history.history["accuracy"], label="accurary")
  plt.plot(history.history["val_accuracy"], label="val_accuracy")
  plt.legend()
  plt.show()
  plt.close()
  

In [None]:
import numpy as np

def decode(indices):
    return " ".join([vocabulary[index] for index in indices if vocabulary[index] is not ""])

def generate(model, seed_text, generated_sequence_length, temperature):

    input_sequence = encoder(seed_text).numpy()

    generated_sequence = list(input_sequence[::])

    # Pad.
    padding = [0] * (sequence_length - len(input_sequence))
    input_sequence = padding + list(input_sequence)

    # Generate the sequence by repeatedly predicting.
    while len(generated_sequence) < generated_sequence_length:
        prediction = model.predict(np.expand_dims(input_sequence, axis=0))
        predicted_index = get_index_from_prediction(prediction[0], temperature)
        generated_sequence.append(predicted_index)
        input_sequence = input_sequence[1:]
        input_sequence.append(predicted_index)

    # Convert the generated sequence to a string.
    text = decode(generated_sequence)
    print(text)
    print("")

        
def get_index_from_prediction(prediction, temperature=0.0):
    """ Gets an index from a prediction. """

    # Zero temperature - use the argmax.
    if temperature == 0.0:
        return np.argmax(prediction)

    # Non-zero temperature - do some random magic.
    else:
        prediction = np.asarray(prediction).astype('float64')
        prediction = np.log(prediction) / temperature
        exp_prediction= np.exp(prediction)
        prediction = exp_prediction / np.sum(exp_prediction)
        probabilities = np.random.multinomial(1, prediction, 1)
        return np.argmax(probabilities)
  

generate(model, "we are all doomed", 100, temperature=1.0)