Library imports

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import regex as re
import numpy as np
import pandas as pd
import os

Class Definitions

In [12]:
class FeedForward(nn.Module):
    """Position-wise MLP used as the second sub-layer of each ``Block``.

    Widens the features from ``n_hidden`` to ``2 * n_hidden``, applies ReLU,
    projects back to ``n_hidden`` and finishes with dropout.  ``n_hidden`` and
    ``dropout`` are read from the notebook globals when the module is built.
    """

    def __init__(self):
        super().__init__()
        widened = 2 * n_hidden
        expand = nn.Linear(n_hidden, widened)
        squash = nn.Linear(widened, n_hidden)
        # Kept as one nn.Sequential so parameter names stay net.0.* / net.2.*
        self.net = nn.Sequential(expand, nn.ReLU(), squash, nn.Dropout(dropout))

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension must be ``n_hidden``."""
        return self.net(x)

In [13]:
class Head(nn.Module):
    """One causal self-attention head.

    Projects the input to queries, keys and values of width ``head_size``,
    computes scaled dot-product attention restricted to current and earlier
    positions, and returns the attention-weighted values.  ``n_hidden``,
    ``head_size``, ``block_size`` and ``dropout`` are notebook globals.
    """

    def __init__(self):
        super().__init__()
        in_width, out_width = n_hidden, head_size
        self.q = nn.Linear(in_width, out_width)
        self.k = nn.Linear(in_width, out_width)
        self.v = nn.Linear(in_width, out_width)

        # Lower-triangular ones: row t marks the positions step t may attend to.
        # Registered as a buffer so it moves with the module but is not trained.
        causal = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a (B, T, head_size) tensor."""
        _, steps, _ = x.shape

        queries, keys, values = self.q(x), self.k(x), self.v(x)

        # Scale by 1/sqrt(head_size) so the softmax input does not grow with width
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale

        # Future positions get -inf and therefore zero weight after the softmax
        hidden_future = self.tril[:steps, :steps] == 0
        weights = F.softmax(scores.masked_fill(hidden_future, float("-inf")), dim = -1)

        return self.dropout(weights) @ values

In [14]:
class MultiHeadAttention(nn.Module):
    """Runs ``n_heads`` independent ``Head`` modules and mixes their outputs.

    The per-head results are concatenated along the feature axis, projected
    back to ``n_hidden`` and passed through dropout.
    """

    def __init__(self):
        super().__init__()
        self.heads = nn.ModuleList(Head() for _ in range(n_heads))
        concat_width = head_size * n_heads
        self.proj = nn.Linear(concat_width, n_hidden)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of every head's output for ``x``."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim = -1)
        return self.dropout(self.proj(merged))

In [15]:
class Block(nn.Module):
    """Pre-norm transformer block: self-attention then feed-forward.

    Each sub-layer receives a layer-normalised input and its output is added
    back onto the residual stream.
    """

    def __init__(self):
        super().__init__()
        self.sa = MultiHeadAttention()
        self.ffwd = FeedForward()
        self.ln1, self.ln2 = nn.LayerNorm(n_hidden), nn.LayerNorm(n_hidden)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers with residual connections."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

In [16]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over the tokenizer's vocabulary.

    Token embeddings and learned position embeddings are summed, passed
    through ``n_layers`` ``Block`` modules, layer-normalised and projected to
    vocabulary logits.  Sizes come from the notebook globals ``n_hidden``,
    ``block_size``, ``n_layers`` and ``tokenizer.vocab_size`` at construction
    time.
    """

    def __init__(self):
        super(GPTLanguageModel, self).__init__()
        self.tokens = nn.Embedding(tokenizer.vocab_size, n_hidden)
        self.positions = nn.Embedding(block_size, n_hidden)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_hidden)
        self.lm_head = nn.Linear(n_hidden, tokenizer.vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

    def forward(self, idx, targets = None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids with shape (B, T); T must not
                exceed the size of the position-embedding table.
            targets: optional tensor with B * T token ids (same layout as ``idx``).

        Returns:
            ``(logits, loss)``.  Without targets, ``logits`` has shape
            (B, T, vocab) and ``loss`` is ``None``.  With targets, ``logits``
            is flattened to (B * T, vocab) and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        # Build the positions on the same device as the input rather than on a
        # global, so the model works wherever its inputs live.
        positions = torch.arange(T, device = idx.device)
        x = self.tokens(idx) + self.positions(positions)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens = None, end_token = None):
        """Sample tokens autoregressively until the end token is drawn.

        Written for a single sequence (B == 1): the stop test compares the one
        sampled id with ``end_token``.

        Args:
            idx: integer tensor of shape (1, T) holding the prompt token ids.
            max_new_tokens: optional cap on the number of sampled tokens.
                ``None`` (the default) keeps sampling until ``end_token`` is
                drawn, which may never happen for a poorly trained model.
            end_token: id that stops generation.  Defaults to the notebook
                tokenizer's ``"</Poem>"`` special token.

        Returns:
            The prompt followed by the sampled tokens; the end token itself is
            not appended.

        The module is switched to eval mode while sampling and its previous
        training/eval mode is restored afterwards, even if sampling raises.
        """
        if end_token is None:
            end_token = tokenizer.special_tokens["</Poem>"]

        # One row of the position table per usable context slot
        context_len = self.positions.num_embeddings

        was_training = self.training
        self.eval()
        try:
            # Sampling needs no gradients; skipping them avoids building a graph
            with torch.no_grad():
                produced = 0
                while max_new_tokens is None or produced < max_new_tokens:
                    idx_cond = idx[:, -context_len:]
                    logits, _ = self(idx_cond)
                    logits = logits[:, -1, :]
                    probs = F.softmax(logits, dim = -1)
                    idx_next = torch.multinomial(probs, num_samples = 1)

                    if idx_next == end_token:
                        break

                    idx = torch.cat((idx, idx_next), dim = 1)
                    produced += 1
        finally:
            self.train(was_training)

        return idx

In [17]:
class RegexTokenizer():
    """Byte-level BPE tokenizer that pre-splits text with a regex pattern.

    The vocabulary starts with the 256 single-byte tokens.  ``train`` learns
    pair merges inside each regex chunk (merges never cross chunk
    boundaries); ``register_special_tokens`` appends extra tokens after the
    current vocabulary.  ``encode`` never emits special tokens itself; callers
    insert their ids explicitly.
    """

    # Split pattern used when ``pattern`` is None.  It relies on \p{...} classes,
    # so it needs the third-party ``regex`` module that the notebook imports as ``re``.
    DEFAULT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

    def __init__(self, pattern = None):
        """Create an untrained tokenizer.

        Args:
            pattern: regex used to split text into chunks before byte-pair
                merging.  ``None`` selects ``DEFAULT_PATTERN``.
        """
        self.merges = {}                                         # (id, id) -> merged id
        self.vocab = {idx: bytes([idx]) for idx in range(256)}   # id -> bytes
        self.vocab_size = 256
        # Fall back to a usable pattern instead of failing inside re.compile(None)
        self.pattern = self.DEFAULT_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {}                                 # str -> id
        self.inverse_special_tokens = {}                         # id -> str

    def register_special_tokens(self, special_tokens):
        """Append special tokens to the vocabulary.

        Args:
            special_tokens: iterable of token strings.  Each new string gets
                the next free id.  Strings that are already registered keep
                their id, so calling this again with the same list is a no-op.
        """
        for token in special_tokens:
            if token in self.special_tokens:
                # Already registered: re-running the cell must not mint a second id
                continue
            self.special_tokens[token] = self.vocab_size
            self.vocab[self.vocab_size] = token.encode("utf-8")
            self.vocab_size += 1

        self.inverse_special_tokens = {v: k for k, v in self.special_tokens.items()}

    def get_stats(self, ids, counts = None):
        """Count adjacent id pairs in ``ids``.

        Args:
            ids: sequence of token ids.
            counts: optional dict to update in place, letting one dict
                accumulate counts across many chunks.

        Returns:
            The dict mapping ``(left, right)`` pairs to occurrence counts.
        """
        counts = {} if counts is None else counts
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with each left-to-right occurrence of ``pair`` replaced by ``idx``."""
        newids = []
        i = 0

        while i < len(ids):
            if i < len(ids) - 1 and pair[0] == ids[i] and pair[1] == ids[i + 1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1

        return newids

    def train(self, text, vocab_size, verbose = False):
        """Learn merges from ``text`` until the vocabulary reaches ``vocab_size``.

        Args:
            text: training corpus.
            vocab_size: target total vocabulary size, counting the 256 byte
                tokens and anything already in the vocabulary.
            verbose: print each merge as it is learned.

        Training stops early when no adjacent pair is left to merge (for
        example when the text is too short to support the requested size).
        """
        chunks = self.compiled_pattern.findall(text)

        ids = [list(chunk.encode("utf-8")) for chunk in chunks]

        while self.vocab_size < vocab_size:

            stats = {}
            for chunk in ids:
                self.get_stats(chunk, stats)

            if not stats:
                # Every chunk is a single token; max() on an empty dict would raise
                break

            pair = max(stats, key = stats.get)

            ids = [self.merge(chunk, pair, self.vocab_size) for chunk in ids]

            self.merges[pair] = self.vocab_size
            self.vocab[self.vocab_size] = self.vocab[pair[0]] + self.vocab[pair[1]]

            if verbose:
                print(f"merging {self.vocab[pair[0]], self.vocab[pair[1]]} -> {self.vocab_size}")

            self.vocab_size += 1

    def decode(self, ids):
        """Turn token ids back into text; invalid UTF-8 is replaced, not raised."""
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        text = text_bytes.decode("utf-8", errors = "replace")
        return text

    def _encode_chunk(self, chunk_bytes):
        """Encode one chunk's bytes by applying learned merges, earliest-learned first."""
        ids = list(chunk_bytes)

        while len(ids) >= 2:
            stats = self.get_stats(ids)
            # The pair with the lowest merge id was learned first; unknown pairs sort last
            pair = min(stats, key = lambda p: self.merges.get(p, float("inf")))

            if pair not in self.merges:
                break
            ids = self.merge(ids, pair, self.merges[pair])

        return ids

    def encode(self, text):
        """Encode ``text`` into token ids; special tokens are treated as ordinary text."""
        chunks = self.compiled_pattern.findall(text)

        ids = []
        for chunk in chunks:
            chunk_encoded = self._encode_chunk(chunk.encode("utf-8"))
            ids.extend(chunk_encoded)

        return ids

Setting up data and tokenizer

In [18]:
# Load the poems table from the Kaggle input directory; later cells read its Title and Poem columns
data = pd.read_csv("/kaggle/input/poetry-foundation-poems/PoetryFoundationData.csv")

In [19]:
# Concatenate every title followed by its poem into one corpus string.
# str.join builds the result in a single pass instead of re-growing the string on every row.
text = "".join(data.Title[i] + data.Poem[i] for i in range(len(data)))

In [None]:
# Learn the BPE merges on a slice from the middle of the corpus (characters between
# the 48% and 52% marks) rather than on the full text.
corpus_chars = len(text)
tokenizer_text = text[48 * corpus_chars // 100 : 52 * corpus_chars // 100]
split_pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
tokenizer = RegexTokenizer(split_pattern)
tokenizer.train(tokenizer_text, vocab_size = 1000, verbose = True)

In [21]:
# RUN ONCE per freshly trained tokenizer: appends the three structural markers
# (title start, title/poem separator, poem end) after the learned vocabulary
tokenizer.register_special_tokens(["<Title>", "</Title><Poem>", "</Poem>"])

<Title>
</Title><Poem>
</Poem>


In [28]:
# Sanity check: total vocabulary size and the bytes stored for token id 1002
tokenizer.vocab_size, tokenizer.vocab[1002]

(1003, b'</Poem>')

In [34]:
# visualization of encoding/decoding
ids = [1000]
ids.extend(tokenizer.encode("What is that melody!"))
ids.append(1001)
decoded = []
for i in range(len(ids)):
    ids[i] = tokenizer.vocab[ids[i]].decode("utf-8")
"/".join(ids)

'<Title>/What/ is/ that/ me/l/od/y/!/</Title><Poem>'

Model hyperparameters

In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
n_hidden = 1024  # embedding / residual-stream width used by every layer
block_size = 128  # context length: size of the causal mask and of the position-embedding table
n_layers = 6  # number of Block modules stacked in the model
n_heads = 8  # attention heads per block
head_size = 128  # width of each head's q/k/v projection; n_heads * head_size equals n_hidden here
dropout = 0.1  # dropout probability shared by the attention and feed-forward layers
learning_rate = 3e-4  # base AdamW learning rate (train() currently divides it by 3)
total_steps = 5000  # optimisation steps per train() call
batch_size = 64  # sequences sampled per get_batch() call

device

device(type='cuda')

Helper functions

In [35]:
def build_dataset():
    """Tokenise every poem into one flat id stream and split it 80/20.

    Each row of ``data`` contributes
    ``<Title> title-tokens </Title><Poem> poem-tokens </Poem>``.  The split is
    made on the token stream, so the boundary can fall inside a poem.

    Returns:
        ``(train_text, test_text)`` as lists of token ids.
    """
    tokens = []
    for row in range(len(data)):
        tokens += [
            tokenizer.special_tokens["<Title>"],
            *tokenizer.encode(data.Title[row]),
            tokenizer.special_tokens["</Title><Poem>"],
            *tokenizer.encode(data.Poem[row]),
            tokenizer.special_tokens["</Poem>"],
        ]

    # First 80% of the stream trains the model, the remainder is held out
    cut = 8 * len(tokens) // 10
    return tokens[:cut], tokens[cut:]

In [36]:
def get_batch(split):
    """Sample a random batch of input/target windows from one split.

    Args:
        split: ``"train"`` or ``"test"``; any other value raises ``KeyError``.

    Returns:
        ``(X, Y)``: two int64 tensors of shape (batch_size, block_size) on
        ``device``.  ``Y`` is ``X`` shifted one token to the right, i.e. the
        next-token targets.
    """
    split_text = {
        "train": train_text,
        "test": test_text,
    }[split]

    # The window length follows block_size instead of a hard-coded 128, so the
    # batches keep matching the model's context size if that setting changes.
    starts = torch.randint(0, len(split_text) - block_size - 1, (batch_size,)).tolist()

    X = [split_text[start: start + block_size] for start in starts]
    Y = [split_text[start + 1: start + block_size + 1] for start in starts]

    return torch.tensor(X, dtype = torch.long).to(device), torch.tensor(Y, dtype = torch.long).to(device)

In [194]:
def train():
    """Run ``total_steps`` AdamW updates on random training batches.

    A new optimizer is created on every call.  The loss of the current batch
    is printed every 10th step.
    """
    model.train()

    # take off / 3 for fresh models, i am only doing this for finer training/tuning
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate / 3)

    for step in range(1, total_steps + 1):
        inputs, targets = get_batch("train")
        _, loss = model(inputs, targets)

        optimizer.zero_grad(set_to_none = True)
        loss.backward()
        optimizer.step()

        if step % 10 != 0:
            continue
        print("Step[{}/{}], Loss: {:.4f}".format(step, total_steps, loss.item()))

In [191]:
def generatePoem(context):
    """Generate a poem for the title ``context`` and print it.

    The title is wrapped in the same whitespace and special tokens as the
    training data, the model samples a continuation, and the decoded text is
    printed with every special token removed.

    Args:
        context: the poem title as plain text.
    """
    model.eval()
    # all poems start with all this extra formatting so this is needed in order for the title section to be perceived by the model as being in distribution and not an anomaly
    title = "\r\r\n                    " + context + "\r\r\n                "
    model_context = [tokenizer.special_tokens["<Title>"],]
    model_context.extend(tokenizer.encode(title))
    model_context.append(tokenizer.special_tokens["</Title><Poem>"])
    context_encoded = torch.tensor(model_context).view(1, -1).to(device)
    response = model.generate(context_encoded)[0].cpu().numpy()

    # Remove every special token with one boolean mask.  Deleting elements while
    # stepping an index forward skips the element after each deletion, so
    # back-to-back special tokens would otherwise survive.
    special_ids = list(tokenizer.special_tokens.values())
    response = response[~np.isin(response, special_ids)]

    print(tokenizer.decode(response.tolist()))

In [39]:
# Show the id assigned to each special token
tokenizer.special_tokens

{'<Title>': 1000, '</Title><Poem>': 1001, '</Poem>': 1002}

In [40]:
# Tokenise the whole corpus and split the token stream 80/20 into train and test lists
train_text, test_text = build_dataset()

Initializing and training model

In [69]:
# Build the model from the hyperparameters above, move it to the selected device,
# and report its parameter count in millions
model = GPTLanguageModel().to(device)
print(sum(p.numel() for p in model.parameters())/1e6, "million parameters")

52.587499 million parameters


In [195]:
# Run total_steps optimisation steps; the loss is printed every 10 steps
train()

Step[10/5000], Loss: 3.0445
Step[20/5000], Loss: 3.0853
Step[30/5000], Loss: 3.0711
Step[40/5000], Loss: 2.9821
Step[50/5000], Loss: 3.0194
Step[60/5000], Loss: 3.0725
Step[70/5000], Loss: 3.0739
Step[80/5000], Loss: 3.0743
Step[90/5000], Loss: 3.0958
Step[100/5000], Loss: 3.0325
Step[110/5000], Loss: 2.9998
Step[120/5000], Loss: 3.0752
Step[130/5000], Loss: 2.9962
Step[140/5000], Loss: 3.0899
Step[150/5000], Loss: 3.0040
Step[160/5000], Loss: 3.0007
Step[170/5000], Loss: 3.0323
Step[180/5000], Loss: 2.9382
Step[190/5000], Loss: 3.0079
Step[200/5000], Loss: 2.9813
Step[210/5000], Loss: 2.9960
Step[220/5000], Loss: 3.0758
Step[230/5000], Loss: 2.9997
Step[240/5000], Loss: 3.1051
Step[250/5000], Loss: 3.0757
Step[260/5000], Loss: 3.0972
Step[270/5000], Loss: 3.0883
Step[280/5000], Loss: 3.0379
Step[290/5000], Loss: 3.0271
Step[300/5000], Loss: 3.0428
Step[310/5000], Loss: 3.0532
Step[320/5000], Loss: 2.9830
Step[330/5000], Loss: 3.0072
Step[340/5000], Loss: 3.0631
Step[350/5000], Loss: 2

Testing model on sample prompts

In [233]:
# Sample one poem for this title; generation draws tokens with torch.multinomial, so each run differs
generatePoem("How To Download With No Disk Space")


                    How To Download With No Disk Space
                
Take tell themselves—but police themselves
With military course, and the course
It would not exit them.
 
Not a good man, not a good man,
Each one with a career like a bee
Above my feet, links his left leg of legs.
You aspire to what’s enough
One by one. Each given a bouquet
And say goodbye.
 
The way they got them must by, both made them from
The forgotten except for me.
Then the kid didn’t exactly, ‘You,’&
On the Rockyak, as the Isle of Obient Blood Market.
When it took to the square station
They climbed the restaurant. When it had established:   
‘What will we have to want?’ He got
‘Spoken! If the Past is dear.’
‘Though it was I wouldn’t have to say, ‘I can’t.
You would know which to watch and
The Aristotle, if rather you   
Symphonies whether they,
Feel up one cabinet or fish or to please or to blame
A way: ‘Pop or Gratural poet!’)
I have no capbacks, no trees,
With lines, grass, or blood, and then read
The Bl

In [201]:
# First of two samples for the same title, to show how much the sampled poems vary
generatePoem("The Boy Who Lived, Come To Die!") # take 1


                    The Boy Who Lived, Come To Die!
                
“So we escaped.”
So oyster orphans shooted
and who came to climb black
and a crush of rabid pods, mother
from one poodle to the easel
Pulsing a dream in the next b.dad drew
more wedding, then yelled Rhetoric, my grandfather’s back
and when he sat down on Mesopotamis decided
I got tired in this bag
as he told his mother knew caught
so he said he. But just enough.
Still, he started, his body laughed
and said he waited for his children.
I announced him'd get them.
Dad said he was a human son,
he was a class, a woman, a small,
a prisoner and a wave clucking
in the front of a naked man,
giving a brand new tarp,
and
talk to Smith in his table—
no: lift up my legs
a safe leaf and fall.
Space was a big mouse.
They found the world
and the rest were alive
beside me. The boy
wrang to engirt boar.
I have seen those who
were at the three farmer’s table
in the back of a car
and, so as to California, Whitman, on the Street Margin,


In [193]:
# Second sample for the same title as take 1
generatePoem("The Boy Who Lived, Come To Die!") # take 2


                    The Boy Who Lived, Come To Die!
                



1
—Chapter 26



She was bloody, the lover never called,
And all of these the broken houses as she lived, in the garden,
She mutted, with wounded left, evicted slits, as she walked through

Whenever to walk the barrel or marvelous little multiply

I went tofore with the idea. The Fumes clang like to Billity

shiven pecks Billow, and Joseph said: 'Harvesters, quickens

New monads on Joe's face; this she be thrived

As everyday belongs for her to celebrate with any other,
Even Louisia she cut from Fortono Samo is holy

(I called of Samo’s Vietnamono Physic)

My location in the way Labora writes and writhes writings.•

O saint, friendly homer ritual?

Begun

 

Haitella •

I was writing down the early

• •

 

What's not sure if the whole •

Yours • mine •

when watching

your kids

you are stoping,

I hope

if we’re yet getting

because you left to be

a hygriphic melody

every day and sun

you marry •

 

dinoxy me

The material below can be ignored; it is kept only because it gives another example of poem generation.

<h1>Generation Record:</h1>
- model with no special tokens, loss: 3.5671, dropout 0.2, 2k training steps, 3e-4 constant learning rate, context = "How many more times are needed?":
<br/>"And at the st fades alongs,   
Most the trees read God’s venom.   
He blazes its dread as to be read to’s grove?   
Gainting a new dust’s corpselve hot feet until he is seen   
On the goon place were darked across The Clay,
Nor blow through a Hw settled field.
Himmering cries his sweat.
With news were slopes into the scream,   
With each furnace in sand in the news and candle!
Sept to the best of the air which in Sandring   
Her night declines in the Languis of Columbus,   
As the rock is a light of collectors
And by the Trymina out at the Winden,
Gream the State, by a cottage   
Her linger and the sight of itself like the onyard—   
Present will read for the tray—   
The distance inward thoughtres,“The diaster, white peace-echnight
Change with one shall glorify in the preserve?
Oh, I will never know and that I had a long,   
Cless you “Your man’s name!’ converse! Pone’s nose his   
And I Come to schoolm hell ate in these humans,
Nothing but this struther shouldn’t home,
Bad may go on to wish those shoes;   
No a snarch-set row. He is like a quiet rattle   
Sorrowing always feel fit the waves.
I nought them horse. He’s still wet.
Out of us teach history to
We are the weather's snow. The father says,   
Even another moving, it is to follow that.   
The deal can beeside them to be complain.   
You must move that that we should see him look   
My stand will keep me around   
The waves and weep,   
Their graves are of our Death   
In their show and does billbows me ambiable,   
And we have those dominion of the Names   
Even snows, hair visited his side beam,   
The imagine wife of our Field. And he them, his specklife   
Her suddenness is the Armisto in the God’s Subau, let his mother's same chose   
Death and her and his own song—my priest?   
Ne hope? He must consider him death, we will   
Over that each graceful on the Life of his cannot see. Be always didn’t helphe undiminently so.
Decrets them as coming. Had he did she taught us?
He said with him and he wants sank, he went on with all,
See his shrivabor’s nodding blood, but he could have no mein.   
He’d like him told us what it’s repeat,   
You never was soon again. See what they should   
To ever go with him, therefore the sum9thwhile I knew,
When I shall not be saved from the cloyest man was strength,   
Are his capture and he was clear;
Which were my power he had said,   
What learned her from ‘But I’m knew
The Barret Earth Obs’: “Here, a queen!” Dad,
“Who creeps the highway where vere of evening!”   
Paradise speckle, and with a body to believe of
Lady to a self, to sleep, working but there,   
Too good, o"

In [97]:
# Kept from an earlier run: the output below matches the poem quoted in the generation record above
generatePoem("How many more times are needed?")

How many more times are needed?   
And at the st fades alongs,   
Most the trees read God’s venom.   
He blazes its dread as to be read to’s grove?   
Gainting a new dust’s corpselve hot feet until he is seen   
On the goon place were darked across The Clay,
Nor blow through a Hw settled field.
Himmering cries his sweat.
With news were slopes into the scream,   
With each furnace in sand in the news and candle!
Sept to the best of the air which in Sandring   
Her night declines in the Languis of Columbus,   
As the rock is a light of collectors
And by the Trymina out at the Winden,
Gream the State, by a cottage   
Her linger and the sight of itself like the onyard—   
Present will read for the tray—   
The distance inward thoughtres,“The diaster, white peace-echnight
Change with one shall glorify in the preserve?
Oh, I will never know and that I had a long,   
Cless you “Your man’s name!’ converse! Pone’s nose his   
And I Come to schoolm hell ate in these humans,
Nothing but this stru