In [None]:
import requests
import time
import re
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from google.colab import files
from google.colab import drive
# import tiktoken

In [None]:
# for colab
# The training loops further down use `iter` as their loop variable, which
# shadows the builtin in the notebook namespace.  Remove that shadow if it is
# present.  A bare `del iter` raises NameError on a fresh kernel (nothing has
# shadowed the builtin yet), which would break Restart & Run All.
globals().pop('iter', None)
uploaded = files.upload()
if not uploaded:
    # An empty result would otherwise surface as a bare StopIteration below.
    raise ValueError("no file was uploaded")
# Only the first uploaded entry is used; its key is later passed to open().
filename = next(iter(uploaded))
print(filename, "uploaded")

Saving all_cleaned_lyrics.txt to all_cleaned_lyrics (2).txt
all_cleaned_lyrics (2).txt uploaded


In [None]:
# Load the entire uploaded file into memory as one UTF-8 decoded string.
with open(filename, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Character-level vocabulary: each distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"'(),-.0126:;?ABCDEFGHIJKLMNOPQRSTUVWYabcdefghijklmnopqrstuvwxyzеㅤ
69


In [None]:
# Lookup tables between a character and its integer id (its index in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)

In [None]:
# Encode the whole corpus once and keep it as a tensor of int64 token ids.
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(data.shape, data.dtype)

torch.Size([166076]) torch.int64


In [None]:
# Sequential split: the first 90% of the token stream is used for training,
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# --- Hyperparameters ---------------------------------------------------------
# Data / batching
batch_size = 64            # sequences sampled per batch
block_size = 256           # tokens per sequence (context length)

# Optimisation schedule
max_iters = 1000 // 2      # steps run by each training loop
eval_interval = 100 // 2   # steps between calls to estimate_loss()
eval_iters = 200           # batches averaged per split inside estimate_loss()
learning_rate = 1e-4

# Model size
n_embd = 384               # embedding width
n_head = 6                 # attention heads per block
n_layer = 6                # number of transformer blocks
dropout = 0.2

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
torch.manual_seed(33733984)
def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects `train_data`, 'val' selects `val_data`.  Any
            other value still falls back to `val_data` (the historical
            behaviour, kept so existing callers do not break) but now emits a
            warning, because that fallback silently hides mistakes such as
            passing a file name instead of a split name.

    Returns:
        A tuple (x, y) of tensors on `device`, each of shape
        (batch_size, block_size).  `y` is `x` shifted one position to the
        right, i.e. the next-token targets.

    Raises:
        ValueError: if the selected split has no room for a block_size-long
            input plus its one-step-shifted target.
    """
    import warnings

    if split == 'train':
        data = train_data
    else:
        if split != 'val':
            warnings.warn(
                f"get_batch: unknown split {split!r}; falling back to the "
                "validation data. Use 'train' or 'val'.",
                stacklevel=2,
            )
        data = val_data

    # Number of valid start offsets: every start i must leave room for
    # data[i+1 : i+block_size+1].
    n_starts = len(data) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split has {len(data)} tokens, need more than block_size={block_size}"
        )
    ix = torch.randint(n_starts, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of `model` on the train and validation splits.

    For each split, `eval_iters` batches are drawn with `get_batch` and their
    losses averaged.  The model is put in eval mode for the measurement and is
    always switched back to train mode afterwards, including when evaluation
    is interrupted or raises, so a subsequent training step never runs with
    the model left in eval mode.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore train mode even on KeyboardInterrupt / errors.
        model.train()
    return out

In [None]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones: position t may look at positions <= t.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over a 3-D input and return a tensor whose last dim is head_size."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # Scaled dot-product scores; the scale is head_size ** -0.5.
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale
        # Hide future positions so they receive zero weight after the softmax.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))
        values = self.value(x)
        return weights @ values


In [None]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size))
        self.heads = nn.ModuleList(head_list)
        # Maps the concatenated head outputs back to the embedding width.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to x, join on the last dim, then project + dropout."""
        per_head = [head(x) for head in self.heads]
        joined = torch.cat(per_head, dim=-1)
        projected = self.proj(joined)
        return self.dropout(projected)

In [None]:
class FeedFoward(nn.Module):
    """Position-wise MLP: widen to 4x, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the MLP; the output has the same last dim as the input."""
        out = self.net(x)
        return out

In [None]:
class Block(nn.Module):
    """Transformer block: pre-norm self-attention then pre-norm MLP, each residual."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is divided evenly between the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        after_attention = x + self.sa(self.ln1(x))
        after_mlp = after_attention + self.ffwd(self.ln2(after_attention))
        return after_mlp

In [None]:
class BigramLanguageModel(nn.Module):
    """Character-level language model built from a stack of transformer blocks.

    Token and position embeddings are summed, passed through `n_layer`
    `Block`s and a final LayerNorm, then projected to `vocab_size` logits.
    (The class name is kept for compatibility with the rest of the notebook;
    the network is not a plain bigram model.)
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Re-initialise Linear/Embedding weights as N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss).  Without targets, logits has shape
            (B, T, vocab_size) and loss is None.  With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        # Build the position indices on the same device as the input rather
        # than on the global `device`, so the model works wherever it lives.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the module's previous train/eval mode is restored before
        returning, so calling this between training steps is safe.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled continuation.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions.
                idx_cond = idx[:, -block_size:]
                logits, loss = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

# Build the network and move it to the selected device.  `Module.to` returns
# the module itself, so `m` is simply a second name for `model`.
model = BigramLanguageModel().to(device)
m = model

In [None]:
# AdamW over all model parameters, using the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# PRE-TRAINING
# NOTE: the loop variable keeps the name `iter` (shadowing the builtin) because
# later cells in this notebook clear that name before they call the builtin.
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        # Report straight away so the printed step is the step that was
        # measured, and so the final evaluation is reported as well.
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 1: train loss 1.2291, val loss 1.6845
step 101: train loss 1.1220, val loss 1.6563
step 201: train loss 1.0213, val loss 1.6559
step 301: train loss 0.9301, val loss 1.6707
step 401: train loss 0.8464, val loss 1.7045


In [None]:
# Persist only the weights (the state dict), not the whole module object.
torch.save(obj=model.state_dict(), f='poems_pretrained.pth')

In [None]:
# Sample 500 tokens, starting from a single token with id 0 (the first entry
# of the sorted vocabulary).
context = torch.zeros((1,1), dtype=torch.long, device=device)
# Sampling is inference: switch dropout off and skip gradient tracking, then
# put the model back in whatever mode it was in so later training cells are
# unaffected.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        generated = model.generate(context, max_new_tokens = 500)
finally:
    model.train(was_training)
print(decode(generated[0].tolist()))


I fone, I'm in my inot
I could to can't feelt oflek my Know thate
All words goot my with though
I don't feeling again toup up 'caut oon the seed
Oh traise black the sunide
And I'm sital way
Through a a moshed her lands never riself
One I holed my life
Yese how thought like years to milng cointo the free
Each that of my sas you caush
My han just just fin my with stales
Singy we the was the way
One the had a parline
In leaved, her name
Let imade fit the phade
Allmst time fear of my biry that your 


In [None]:
# The training loop leaves a global named `iter` behind that shadows the
# builtin.  Remove it if present; unlike a bare `del iter`, this does not
# raise NameError when the name has not been shadowed in this session.
globals().pop('iter', None)
uploaded2 = files.upload()
if not uploaded2:
    # An empty result would otherwise surface as a bare StopIteration below.
    raise ValueError("no file was uploaded")
# Only the first uploaded entry is used.
filename2 = next(iter(uploaded2))
print(filename2, "uploaded")

Saving all_cleaned_lyrics.txt to all_cleaned_lyrics (3).txt
all_cleaned_lyrics (3).txt uploaded


In [None]:
# Name (a string) of the file uploaded in the previous cell.
all_cleaned_lyrics = filename2
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only runtime.
model.load_state_dict(torch.load('poems_pretrained.pth', map_location=device))  # load pretrained weights

<All keys matched successfully>

In [None]:
# Shape sanity check on one training batch.
# get_batch() takes a split name ('train' or 'val').  Passing the uploaded
# file name, as this cell used to, matches neither and silently falls through
# to the validation split.
xb, yb = get_batch('train')
print(xb.shape, yb.shape)

torch.Size([64, 256]) torch.Size([64, 256])


In [None]:
# FINE-TUNING
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        #print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    if iter % 100 == 1:
        print(f"step {iter}: loss {losses}")

    # sample a batch of data
    xb, yb = get_batch(all_cleaned_lyrics)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


KeyboardInterrupt: 

In [None]:
# Sample 500 tokens, starting from a single token with id 0 (the first entry
# of the sorted vocabulary).
context = torch.zeros((1,1), dtype=torch.long, device=device)
# Sampling is inference: switch dropout off and skip gradient tracking, then
# restore the model's previous train/eval mode.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        generated = model.generate(context, max_new_tokens = 500)
finally:
    model.train(was_training)
print(decode(generated[0].tolist()))


Wake my done,
I was to be the wrong
To fully time and the world in the sun demon come
We weight bring shrill n some the night
I would only make me while wrong
Oh, but I'll take the deign
If you've do is over
I had a not the one you say you are
How I ain't know who you who I am To can show me you moved
I sleep how, loved how, but I'm reborn
Driving her, I am coming through
Deep in the night fire bright

I came 'til shad I'll never be run
I don't have you still your will have is over
I nnocen't li
