In [1]:
# Sanity check: report the installed PyTorch build and whether a CUDA device is visible.
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


PyTorch version: 2.5.1
CUDA available: True


In [1]:
!pip install torch



In [2]:
!pip install numpy




In [13]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolo

In [14]:
!pip install tqdm



In [2]:
import torch
import torch.nn as nn
import numpy as np

In [3]:
# Hyperparameters for the model configuration named "124M".
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token embedding / columns of the output head
    context_length=1024,   # rows of the positional embedding and side of the causal mask
    emb_dim=768,           # embedding width
    n_heads=12,            # attention heads per transformer block
    n_layers=12,           # number of stacked transformer blocks
    drop_rate=0.1,         # dropout probability
    qkv_bias=False,        # whether the query/key/value projections carry a bias
)

In [3]:
# Hyperparameters for the model configuration named "1_5B".
GPT_CONFIG_1_5B = dict(
    vocab_size=50257,      # rows of the token embedding / columns of the output head
    context_length=1024,   # rows of the positional embedding and side of the causal mask
    emb_dim=1600,          # embedding width
    n_heads=25,            # attention heads per transformer block (1600 / 25 = 64 per head)
    n_layers=48,           # number of stacked transformer blocks
    drop_rate=0.1,         # dropout probability
    qkv_bias=False,        # whether the query/key/value projections carry a bias
)


In [4]:
class Layernorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        # Learnable gain (initialised to ones) and offset (initialised to zeros).
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        # Added to the variance so the square root never divides by zero.
        self.eps = 1e-5

    def forward(self, x):
        """Normalize ``x`` along its last axis, then apply the learned scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased variance (divide by N, not N - 1).
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalized + self.shift
        
    

In [5]:
class GELU(nn.Module):
    """GELU activation using the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`` element-wise."""
        inner = x + 0.044715 * torch.pow(x, 3)
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        gate = 1 + torch.tanh(coeff * inner)
        return 0.5 * x * gate

In [6]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        # Linear layers are created in the same order as they appear in the Sequential
        # (indices 0 and 2), with the activation at index 1.
        expand = nn.Linear(width, hidden)
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.layers(x)

In [7]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible by ``num_heads``.
        dropout: Dropout probability applied to the attention weights.
        context_length: Side length of the pre-built causal mask.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, dropout, context_length, num_heads, qkv_bias = False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.w_query = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.w_key = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions a token must not attend to.
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal = 1))

    def forward(self, x):
        """Return the attention output for ``x`` of shape ``(batch, num_tokens, d_in)``.

        The result has shape ``(batch, num_tokens, d_out)``.
        """
        b, num_tokens, _ = x.shape
        head_shape = (b, num_tokens, self.num_heads, self.head_dim)

        # Each projection is reshaped from ITS OWN output. The previous code reshaped
        # `keys` into both `queries` and `values`, discarding those two projections.
        # Resulting layout: (batch, heads, tokens, head_dim).
        keys = self.w_key(x).view(head_shape).transpose(1, 2)
        queries = self.w_query(x).view(head_shape).transpose(1, 2)
        values = self.w_value(x).view(head_shape).transpose(1, 2)

        attn_score = queries @ keys.transpose(2, 3)
        causal_mask = self.mask.bool()[:num_tokens, :num_tokens]
        # In-place fill avoids a second (batch, heads, tokens, tokens) allocation.
        attn_score.masked_fill_(causal_mask, -torch.inf)

        # Scale by sqrt(head_dim); masked positions stay -inf and get zero weight.
        attn_weights = torch.softmax(attn_score / keys.shape[-1] ** 0.5, dim = -1)
        attn_weights = self.dropout(attn_weights)

        # Merge the heads back: (batch, heads, tokens, head_dim) -> (batch, tokens, d_out).
        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(context_vec)

In [8]:
class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers, each normalised
    on input and wrapped in a residual connection."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            dropout=cfg["drop_rate"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = Layernorm(width)
        self.norm2 = Layernorm(width)
        # One dropout module shared by both sub-layers.
        self.drop = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply norm -> attention -> dropout and norm -> feed-forward -> dropout,
        adding the sub-layer input back after each."""
        attended = self.drop(self.att(self.norm1(x)))
        x = attended + x

        transformed = self.drop(self.ff(self.norm2(x)))
        return transformed + x

In [9]:
class Model_1(nn.Module):
    """GPT-style language model: token + positional embeddings, a stack of
    transformer blocks, a final layer norm, and a bias-free output projection."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        vocab = cfg["vocab_size"]
        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = Layernorm(width)
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, input):
        """Map token ids of shape ``(batch, seq_len)`` to logits of shape
        ``(batch, seq_len, vocab_size)``."""
        _, seq_len = input.shape
        # Position ids 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=input.device)
        hidden = self.tok_emb(input) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))
        
        

In [10]:
def generate(model, idx, max_new_tokens, context_size, temperature = 0.0, top_k = None, eos_id = None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping token ids ``(batch, seq)`` to logits ``(batch, seq, vocab)``.
        idx: Tensor of token ids with shape ``(batch, seq)`` used as the prompt.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: ``> 0`` samples from the temperature-scaled distribution;
            otherwise the highest-scoring token is chosen greedily.
        top_k: If given, only the ``top_k`` highest logits of each row stay eligible.
        eos_id: If given, generation stops (without appending) once every row
            produces this token in the same step.

    Returns:
        Tensor of shape ``(batch, seq + n)`` with ``0 <= n <= max_new_tokens``.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the prediction for the last position is needed.
        logits = logits[:, -1, :]

        if top_k is not None:
            # Never ask for more candidates than the vocabulary holds.
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension so each row is compared against its OWN
            # threshold; a (batch,) threshold mis-broadcasts for batch > 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            probs = torch.softmax(logits / temperature, dim = -1)
            idx_next = torch.multinomial(probs, num_samples = 1)
        else:
            idx_next = torch.argmax(logits, dim = -1, keepdim = True)

        # Skip the check when no EOS id was given, and reduce with .all() so a
        # multi-row batch does not raise "Boolean value of Tensor is ambiguous".
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break
        idx = torch.cat((idx, idx_next), dim = 1)
    return idx

In [11]:
from gpt_download3 import download_and_load_gpt2

In [12]:
# Fetch the "124M" GPT-2 checkpoint files into ./gpt2 and load them; per the recorded
# output, files that are already present and up to date are not downloaded again.
settings, params = download_and_load_gpt2(model_size = "124M", models_dir = "gpt2")



File already exists and is up-to-date: gpt2\124M\checkpoint




File already exists and is up-to-date: gpt2\124M\encoder.json




File already exists and is up-to-date: gpt2\124M\hparams.json




File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2\124M\model.ckpt.index




File already exists and is up-to-date: gpt2\124M\model.ckpt.meta




File already exists and is up-to-date: gpt2\124M\vocab.bpe


In [13]:
# Inspect what the loader returned: the hyperparameter dict and the top-level parameter groups.
print("settings: ", settings)
print("Parameter keys: ", params.keys())

settings:  {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter keys:  dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [15]:
# Copy of the base config with query/key/value biases enabled; load_weights assigns
# bias arrays to those projections, so the layers must have been created with a bias.
new_config = {**GPT_CONFIG_124M, "qkv_bias": True}

In [16]:
# Build the model with freshly initialised weights.
gpt = Model_1(new_config)
# NOTE(review): this checkpoint is written before load_weights() runs, so it appears to
# contain the random initialisation rather than the pretrained weights; confirm intent.
torch.save(gpt.state_dict(), "ai_poetry.pth")


In [17]:
def assign(left, right):
    """Build a new ``nn.Parameter`` holding a copy of ``right`` to replace ``left``.

    Args:
        left: The existing parameter being replaced; supplies the expected shape
            as well as the dtype and device of the result.
        right: Array-like or tensor with the new values.

    Returns:
        A ``torch.nn.Parameter`` with ``right``'s values, ``left``'s dtype and device.

    Raises:
        ValueError: If ``left`` is ``None`` (e.g. a layer created without a bias)
            or the two shapes differ.
    """
    if left is None:
        raise ValueError("assigning parameter that is none")
    if left.shape != right.shape:
        raise ValueError(f"Shape Mismatch. Left: {left.shape} , Right: {right.shape}")
    if isinstance(right, torch.Tensor):
        # torch.tensor(tensor) emits a UserWarning; detach + clone is the documented copy.
        data = right.detach().clone()
    else:
        # Always copies, so the parameter never aliases the source array.
        data = torch.tensor(right)
    # Match the parameter being replaced so the model keeps one dtype and device.
    return torch.nn.Parameter(data.to(dtype=left.dtype, device=left.device))

In [18]:
def load_weights(gpt, params):
    """Copy the arrays in ``params`` into the matching parameters of ``gpt``.

    ``params`` is read with the keys ``wte``, ``wpe``, ``g``, ``b`` and ``blocks``;
    each block entry is read under ``attn``, ``mlp``, ``ln_1`` and ``ln_2``.
    Weight matrices are transposed before assignment. Every assignment goes
    through ``assign``, which raises ``ValueError`` on a shape mismatch.
    """
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])

    for b in range(len(params["blocks"])):
        src = params["blocks"][b]
        block = gpt.trf_blocks[b]
        attn = block.att
        fc_in = block.ff.layers[0]
        fc_out = block.ff.layers[2]

        # The checkpoint stores query, key and value weights fused along the last axis.
        fused_w = src["attn"]["c_attn"]["w"]
        q_w, k_w, v_w = np.split(fused_w, 3, axis=-1)
        attn.w_query.weight = assign(attn.w_query.weight, q_w.T)
        attn.w_key.weight = assign(attn.w_key.weight, k_w.T)
        attn.w_value.weight = assign(attn.w_value.weight, v_w.T)

        # Same fused layout for the three biases.
        fused_b = src["attn"]["c_attn"]["b"]
        q_b, k_b, v_b = np.split(fused_b, 3, axis=-1)
        attn.w_query.bias = assign(attn.w_query.bias, q_b)
        attn.w_key.bias = assign(attn.w_key.bias, k_b)
        attn.w_value.bias = assign(attn.w_value.bias, v_b)

        # Attention output projection.
        attn.out_proj.weight = assign(attn.out_proj.weight, src["attn"]["c_proj"]["w"].T)
        attn.out_proj.bias = assign(attn.out_proj.bias, src["attn"]["c_proj"]["b"])

        # Feed-forward: layers[0] is the expansion, layers[2] the projection back.
        fc_in.weight = assign(fc_in.weight, src["mlp"]["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, src["mlp"]["c_fc"]["b"])
        fc_out.weight = assign(fc_out.weight, src["mlp"]["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, src["mlp"]["c_proj"]["b"])

        # Layer norms: "g" feeds the scale, "b" the shift.
        block.norm1.scale = assign(block.norm1.scale, src["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, src["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, src["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, src["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is initialised from the token-embedding matrix.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
        
        

In [19]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
# Overwrite the random initialisation with the downloaded checkpoint, then move the
# model to the selected device. The trailing ';' suppresses the module repr.
load_weights(gpt, params)
gpt.to(device);

In [40]:
# Switch to evaluation mode (dropout layers become no-ops); the cell output is the module repr.
gpt.eval()

Model_1(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (w_query): Linear(in_features=768, out_features=768, bias=True)
        (w_key): Linear(in_features=768, out_features=768, bias=True)
        (w_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): Layernorm()
      (norm2): Layernorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (w_query): Linear(in_features=768, out_fe

In [22]:
import pandas as pd

# Original datasets
df1 = pd.read_csv("poetry_foundation_poems.csv")
df2 = pd.read_csv("poki.csv")

# New datasets
# No header row in this file: the single column is named "text" on read, and
# malformed lines are skipped instead of aborting the load.
df3 = pd.read_csv("shakespeare_poems.csv", header=None, names=["text"], on_bad_lines='skip')

df4 = pd.read_csv("romantic_poets.csv")

df5 = pd.read_csv("PoetryFoundationData.csv")  # new extended dataset
df6 = pd.read_csv("all.csv")

# Rename each source's poem column to the shared name "text"
df1 = df1.rename(columns={"Poem": "text"})
df2 = df2.rename(columns={"Poem": "text"})
df3 = df3.rename(columns={"Poem": "text"})  # no-op: df3 is already read with a "text" column
df4 = df4.rename(columns={"poem content": "text"})
df5 = df5.rename(columns={"Poem": "text"})
df6 = df6.rename(columns={"content": "text"})

# Convert raw text to a one-row dataframe. The context manager closes the file
# handle, which the previous bare open(...).read() left open.
with open("gutenberg.txt", "r", encoding="utf-8") as gutenberg_file:
    df7 = pd.DataFrame({"text": [gutenberg_file.read()]})

# Combine all sources, keeping only the text column
combined = pd.concat(
    [frame[["text"]] for frame in (df1, df2, df3, df4, df5, df6, df7)],
    ignore_index=True
)

# Clean and filter: drop exact duplicates, then keep texts longer than 50 characters
# (missing values have no length and are filtered out by the comparison).
# Assignment instead of inplace=True keeps the cell safe to re-run.
combined = combined.drop_duplicates(subset="text")
combined = combined[combined["text"].str.len() > 50].reset_index(drop=True)

print("Combined dataset shape:", combined.shape)


Combined dataset shape: (74728, 1)


In [23]:
import re

def clean_text(text):
    """Collapse every run of whitespace (including newlines) into a single space
    and trim leading/trailing whitespace."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Normalise whitespace in every poem; note that this also removes line breaks.
combined['text'] = combined['text'].apply(clean_text)


In [24]:
# Persist the cleaned corpus (text column only, no index column).
combined.to_csv("combined_poems_dataset.csv", index=False)


In [25]:
# Spot-check five random poems. No random_state is passed, so each run shows different rows.
print(combined.sample(5)["text"].values)


["she makes the world happy and full of joy. she give little kids one of her special toy i think she makes me bloom. ''you are too considerate'' i assume. when we do our homework she makes me smart but when im lonely i feel like tart but that's why she's my friend and she will always have that beautiful trend."
 'life life is like a broken bone life doesnâ’t always have happy endings people are nice and people are mean life is like a broken bone sometimes you get you want sometimes you donâ’t life is like a broken bone mom is yelling and dad is screaming people are always yelling at you and most of the time you are confused by: breanna'
 'those are my drums black, white, and several tumbs. when played people respond as i kick, beat those beautiful tumbs. if i choose to stop i cannot bear it but, i will keep on going until my life is cut.'
 'pie oh pie i love pie so everytime you see pie be sure to say hi to pie.'
 "NO more of talk where God or Angel Guest With Man, as with his Friend, 

In [26]:
# Plain Python list of poem strings, used as input to the tokenizer.
docs = combined["text"].tolist()

In [27]:
import tiktoken
# Load tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [28]:
# Encode every document and keep only those with more than 32 tokens.
tokenized_docs = [
    token_ids
    for token_ids in map(tokenizer.encode, docs)
    if len(token_ids) > 32
]

In [30]:
context_len = GPT_CONFIG_124M["context_length"]

# Id of GPT-2's "<|endoftext|>" token, used to separate packed documents.
EOT_TOKEN_ID = 50256

def chunk_tokens(tokens, block_size=context_len, stride=None):
    """Cut ``tokens`` into fixed-length windows of ``block_size`` tokens.

    Args:
        tokens: Sequence of token ids.
        block_size: Length of every returned chunk.
        stride: Offset between consecutive chunk starts. Defaults to
            ``block_size`` (non-overlapping chunks).

    Returns:
        A list of chunks, each exactly ``block_size`` long. A trailing remainder
        shorter than ``block_size`` is dropped so chunks can be stacked into a batch.

    Raises:
        ValueError: If ``block_size`` or ``stride`` is not positive.
    """
    if stride is None:
        stride = block_size
    if block_size <= 0 or stride <= 0:
        raise ValueError("block_size and stride must be positive")
    # `+ 1` keeps the final full window; the previous bound skipped it, so a
    # sequence of exactly block_size tokens produced no chunk at all.
    last_start = len(tokens) - block_size
    return [tokens[i:i + block_size] for i in range(0, last_start + 1, stride)]

# Pack all documents into one stream separated by the end-of-text token.
# Chunking each document on its own discarded every document shorter than
# block_size, and the previous stride of 1 emitted one block_size-long copy
# per token offset for the long ones.
token_stream = []
for tokens in tokenized_docs:
    token_stream.extend(tokens)
    token_stream.append(EOT_TOKEN_ID)

all_chunks = chunk_tokens(token_stream)


In [37]:
class GPTPoetryDataset(torch.utils.data.Dataset):
    """Next-token-prediction dataset over pre-tokenized chunks.

    Each item is an ``(inputs, targets)`` pair where ``targets`` is ``inputs``
    shifted left by one token.
    """

    def __init__(self, chunks):
        self.data = chunks

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        chunk = self.data[idx]
        inputs, targets = chunk[:-1], chunk[1:]
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

# Wrap the chunks and serve them in shuffled mini-batches of 4 sequences.
# The default collate stacks items, so every chunk must have the same length.
dataset = GPTPoetryDataset(all_chunks)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)


In [38]:
import torch.nn.functional as F

# Adam over all model parameters.
optimizer = torch.optim.Adam(gpt.parameters(), lr=3e-5)  # Learning rate can be tuned
# Switch back to training mode (re-enables dropout); the cell output is the module repr.
gpt.train()


Model_1(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (w_query): Linear(in_features=768, out_features=768, bias=True)
        (w_key): Linear(in_features=768, out_features=768, bias=True)
        (w_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): Layernorm()
      (norm2): Layernorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (w_query): Linear(in_features=768, out_fe

In [39]:
# Fine-tuning loop: next-token cross-entropy over the poetry chunks.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError on a
# 4 GiB GPU; batches of 4 sequences may be too large for that device, so consider a
# smaller batch size or shorter chunks.
num_epochs = 1 # You can increase this for better results

for epoch in range(num_epochs):
    total_loss = 0
    for step, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)

        logits = gpt(x)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) and (batch, seq) -> (batch*seq,)
        # so every position counts as one classification example.
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float so the graph is not kept alive.
        total_loss += loss.item()

        if step % 100 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} finished, Average Loss: {avg_loss:.4f}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 6.98 GiB is allocated by PyTorch, and 53.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save only the parameters and buffers (state_dict), not the module object itself.
torch.save(gpt.state_dict(), "fine_tuned_ai_poetry.pth")


In [None]:
# Recreate an untrained model with the same architecture so the saved state_dict can be loaded into it.
new_config = GPT_CONFIG_124M.copy()
new_config.update({"qkv_bias": True})  # Must match training config

gpt = Model_1(new_config)


In [None]:
# Restore the fine-tuned weights. weights_only=True limits unpickling to tensors and
# plain containers, so a tampered checkpoint file cannot run arbitrary code on load.
gpt.load_state_dict(torch.load("fine_tuned_ai_poetry.pth", map_location="cpu", weights_only=True))  # or map_location="cuda"


In [None]:
# Move the restored model to the GPU when available and switch to evaluation mode
# (dropout disabled) before generating text.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)
gpt.eval()


In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the prompt as a batch of one sequence on the model's device.
prompt = "Write a romantic poem about the moonlight"
input_ids = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long).to(device)
# Sample up to 100 new tokens from the 40 most likely candidates at temperature 0.8.
output_ids = generate(
    model=gpt,
    idx=input_ids,
    max_new_tokens=100,
    context_size=new_config["context_length"],
    temperature=0.8,
    top_k=40
)

# Drop the prompt tokens so only the newly generated continuation is decoded.
generated_poem_ids = output_ids[:, input_ids.shape[1]:]
generated_poem = tokenizer.decode(generated_poem_ids[0].tolist())

print(generated_poem)
