# Poetry Notebook

In this notebook we implement a GPT-style model that generates text based on the works of Edgar Allan Poe.

In [12]:
# Installing dependencies
!pip install tiktoken
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Downloading dataset from the GitHub
!wget https://raw.githubusercontent.com/kocenko/Poetry-Synthesis/dev/data/poe_data.txt

--2023-05-16 06:10:37--  https://raw.githubusercontent.com/kocenko/Poetry-Synthesis/dev/data/poe_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1930488 (1.8M) [text/plain]
Saving to: ‘poe_data.txt’


2023-05-16 06:10:37 (260 MB/s) - ‘poe_data.txt’ saved [1930488/1930488]



In [4]:
# Reading the file and displaying the number of characters
### (Option) We can use different data to train it on

# The whole corpus is loaded into memory as a single UTF-8 decoded string.
with open("poe_data.txt", encoding="utf-8") as f:
  text = f.read()

print(f"Number of characters in the set: {len(text)}")

Number of characters in the set: 1905067


In [24]:
# Setting up the tokenizer
### (Option 1) We can use a different tokenizer, like SentencePiece
### (Option 2) We can build our own tokenizer, using the Hugging Face library

import tiktoken

# Load the encoding registered under the name "cl100k_base".
enc = tiktoken.get_encoding("cl100k_base")

# Sanity check: encoding and then decoding the first 10 characters of the
# corpus must give back exactly the same text.
assert enc.decode(enc.encode(text[:10])) == text[:10], (
    "There is some problem with tiktoken tokenizer"
)

In [26]:
# Tokenizing the whole corpus and storing the token ids as an int32 tensor
import tensorflow as tf
import numpy as np

data = tf.constant(
    value=enc.encode(text),
    dtype=tf.int32,
)

In [28]:
# Train / validation split
### (Option) Different split, test data?

# The first `split_ratio` share of the tokens is used for training,
# the remaining tail for validation.
split_ratio = 0.85
split_idx = int(len(data) * split_ratio)
train_data, val_data = data[:split_idx], data[split_idx:]

In [42]:
# Preparing the data loader
### (Option) What if the context should predict not the next token
###          but the one after it? (bigger offset)

def get_batch(split: str, batch_size: int, context_length: int, offset: int):
  """Sample a random batch of input windows and their shifted target windows.

  Args:
    split: "train" samples from `train_data`; any other value samples from
      `val_data`.
    batch_size: Number of windows in the batch.
    context_length: Number of tokens in every window.
    offset: Number of positions by which each target window is shifted to the
      right of its input window. Must be non-negative.

  Returns:
    A tuple `(X, y)`. `X` stacks `batch_size` windows of `context_length`
    tokens; `y[i]` is the window that starts `offset` tokens after `X[i]`.

  Raises:
    ValueError: If `offset` is negative, or if the selected split is too short
      to hold one input window together with its shifted target window.
  """
  if offset < 0:
    raise ValueError("offset must be non-negative, got {}".format(offset))

  # Use a distinct local name so the notebook-level `data` tensor is not shadowed.
  source = train_data if split == "train" else val_data

  # Exclusive upper bound for the window start. The target window ends at
  # start + offset + context_length, which must not exceed len(source);
  # otherwise the target slices come out shorter than context_length and
  # tf.stack fails. For offset == 1 this equals len(source) - context_length.
  max_start = len(source) - context_length - offset + 1
  if max_start <= 0:
    raise ValueError(
        "Split of length {} is too short for context_length={} and offset={}".format(
            len(source), context_length, offset))

  shifts = tf.random.uniform((batch_size, ), 0, max_start, tf.int32)
  X = tf.stack([source[shift: shift + context_length] for shift in shifts])
  y = tf.stack([source[shift + offset: shift + offset + context_length] for shift in shifts])
  return X, y


# Small configuration used to draw one example batch from the training split.
batch_size = 4      # windows per batch
context_length = 8  # tokens per window
offset = 1          # target window starts one token after the input window
X, y = get_batch(
  split="train",
  batch_size=batch_size,
  context_length=context_length,
  offset=offset,
)