# PART 1

In [2]:
#STEP 1: Install Libraries
#!pip install torch numpy tqdm

In [12]:
#STEP 2: Prepare Sample Dataset (Joke Text)
# Example dataset: List of jokes
data = [
    "Why did the scarecrow win an award? Because he was outstanding in his field.",
    "Why don’t scientists trust atoms? Because they make up everything!",
    "I told my wife she was drawing her eyebrows too high. She looked surprised.",
    "Parallel lines have so much in common. It’s a shame they’ll never meet.",
    "What do you call fake spaghetti? An impasta.",
]

# Combine all into one training string
text = "\n".join(data)

# Build vocabulary (character-level tokenizer for simplicity)
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Char-to-index and index-to-char
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

# Encode / decode functions
def encode(s): 
    return [stoi[c] if c in stoi else stoi[' '] for c in s.lower()]


def decode(l): return ''.join([itos[i] for i in l])

In [13]:
#STEP 3: Create Dataset
import torch

# Encode entire corpus
data_tensor = torch.tensor(encode(text), dtype=torch.long)

# Hyperparameters
block_size = 64  # context length
batch_size = 32

# Data loader
def get_batch():
    ix = torch.randint(len(data_tensor) - block_size, (batch_size,))
    x = torch.stack([data_tensor[i:i+block_size] for i in ix])
    y = torch.stack([data_tensor[i+1:i+block_size+1] for i in ix])
    return x, y

In [14]:
#STEP 4: Define the Transformer Language Model
import torch.nn as nn
import math

class TransformerBlock(nn.Module):
    def __init__(self, emb_dim, heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(emb_dim, heads, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.ReLU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )
        self.ln1 = nn.LayerNorm(emb_dim)
        self.ln2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x)
        x = self.ln1(x + attn_out)
        ff_out = self.ff(x)
        return self.ln2(x + ff_out)

class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, emb_dim)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2) * (-math.log(10000.0) / emb_dim))
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # shape: (1, max_len, emb_dim)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TinyLLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, n_heads=4, n_layers=2):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_encoding = PositionalEncoding(emb_dim)
        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(emb_dim, n_heads) for _ in range(n_layers)]
        )
        self.ln = nn.LayerNorm(emb_dim)
        self.head = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        tok_emb = self.token_embedding(x)
        x = self.positional_encoding(tok_emb)
        x = self.transformer_blocks(x)
        x = self.ln(x)
        logits = self.head(x)
        return logits

In [15]:
#STEP 5: Train the Model
model = TinyLLM(vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for step in range(1000):
    x, y = get_batch()
    logits = model(x)
    loss = loss_fn(logits.view(-1, vocab_size), y.view(-1))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step}, Loss: {loss.item():.4f}")

Step 0, Loss: 3.6100
Step 100, Loss: 0.8650
Step 200, Loss: 0.2630
Step 300, Loss: 0.1024
Step 400, Loss: 0.0955
Step 500, Loss: 0.0665
Step 600, Loss: 0.0209
Step 700, Loss: 0.0132
Step 800, Loss: 0.0066
Step 900, Loss: 0.0152


In [16]:
#STEP 6: Text Generation (Joke on demand)
@torch.no_grad()
def generate(prompt, max_new_tokens=100):
    model.eval()
    input_ids = torch.tensor([encode(prompt)], dtype=torch.long)
    for _ in range(max_new_tokens):
        logits = model(input_ids)
        next_token_logits = logits[:, -1, :]
        probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        input_ids = torch.cat([input_ids, next_token], dim=1)
    return decode(input_ids[0].tolist())

In [18]:
print("Allowed characters:", list(stoi.keys()))

Allowed characters: ['\n', ' ', '!', '.', '?', 'A', 'B', 'I', 'P', 'S', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', '’']


In [20]:
#Output Example
# Try it!
prompt = "Joke about AI"
print(generate(prompt))

 oke about ai dididididididididididididididididididididididididididididididididididididididididididididididiydidi


In [21]:
prompt = "Joke about scientists"
print(generate(prompt))

 oke about scientists stsy try trytrytrytry wing!
ingding in in istinting!
wing ististististildn’tildoldoldoldowildowhyts


In [23]:
#Notes:
#This is a tiny model meant for demo and learning purposes.
#You can scale it with:
#Bigger datasets (e.g., OpenWebText)
#Larger model configs
#GPU training (use .cuda() on model and tensors)
#Token-level (e.g., BPE) instead of char-level tokenizer
#You can also checkpoint and reload the model.

# PART 2

## 1. Load Dataset from Hugging Face

In [9]:
# Step 1: Get a Bigger Jokes Dataset
#We'll use the "Fraser/short‑jokes" dataset (~230 k jokes) from Hugging Face:
from datasets import load_dataset

# Trust remote code so Hugging Face can execute the dataset loading script
dataset = load_dataset("Fraser/short-jokes", split="train", trust_remote_code=True)
dataset = dataset.select(range(3000))
print(f"Loaded {len(dataset)} jokes.")
print(dataset[0])

Repo card metadata block was not found. Setting CardData to empty.


Loaded 3000 jokes.
{'text': '[me narrating a documentary about narrators] ""I can\'t hear what they\'re saying cuz I\'m talking""\n'}


## 2. Tokenize the Jokes (convert text → numbers)

In [3]:
#Step 2: Preprocess Data & Tokenize
#Use a Byte-Pair Encoding tokenizer via Hugging Face:
#We use GPT-2’s tokenizer to handle unknown tokens and subwords cleanly.

#What it does:
#Loads GPT-2’s Byte-Pair Encoding (BPE) tokenizer.
#GPT-2 doesn’t have a pad_token, so we reuse its eos_token (end of sentence) as padding.
#Models can't handle text directly — they need numbers (token IDs).
#This step ensures consistent sequence length (max_length=64) with padding.

from transformers import AutoTokenizer

# Load GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# 🔥 Fix: Set padding token to EOS token (safe for GPT-2)
tokenizer.pad_token = tokenizer.eos_token

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## 3. Map Tokenizer on Entire Dataset

In [None]:
#What it does:
#Tokenizes each joke into input_ids and attention_mask.
#Converts the HuggingFace dataset into PyTorch tensor format.
#Why it’s needed:
#Tokenizing ahead of time makes training faster.
#attention_mask tells the model which tokens to pay attention to (1 = real word, 0 = padding).

# Define joke encoding function
def encode_joke(j):
    return tokenizer(j["text"], truncation=True, max_length=64, padding="max_length")

# Apply encoding to the dataset
dataset = dataset.map(encode_joke, batched=False)

# Set format for PyTorch tensors
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

## 4. Define Your Own Transformer Model

In [4]:
#What it does:
#Implements a simplified GPT-style Transformer encoder.
#Includes:
#Token Embeddings
#Positional Embeddings
#Stacked Transformer layers (self-attention + feedforward)
#LayerNorm
#Final linear layer to project back to vocab size
#Key Layers:
#Layer	Purpose
#nn.Embedding	Convert token IDs into vectors
#nn.TransformerEncoderLayer	Handles attention + context
#LayerNorm	Normalizes to stabilize training
#Linear	Maps hidden states to vocabulary for prediction
#Why it’s needed:
#This is the core brain of your language model. It learns how words interact with each other and generates new ones.

import torch
import torch.nn as nn

class SmallTransformerLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, n_heads=4, n_layers=4, max_len=64):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(max_len, emb_dim)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=emb_dim,
                nhead=n_heads,
                dim_feedforward=emb_dim * 4,
                activation="gelu",
                batch_first=True
            ) for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(emb_dim)
        self.head = nn.Linear(emb_dim, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        b, t = input_ids.size()
        positions = torch.arange(0, t, device=input_ids.device).unsqueeze(0)
        x = self.tok_emb(input_ids) + self.pos_emb(positions)

        for layer in self.layers:
            x = layer(x, src_key_padding_mask=~attention_mask.bool() if attention_mask is not None else None)

        x = self.ln_f(x)
        return self.head(x)

## 5. Train the Model

In [5]:
#What it does:
#Sets up the training loop:
#Loads batches
#Computes logits from model
#Calculates loss vs. actual input tokens (shifted prediction)
#Optimizes weights
#Uses progress bar (tqdm) to track loss
#Input:
#Batches of shape [32, 64] → 32 jokes with 64 tokens each

#Why it’s needed:
#The model improves by comparing its predictions to the real next word, adjusting itself using backpropagation.

from torch.utils.data import DataLoader
from tqdm import tqdm

model = SmallTransformerLM(tokenizer.vocab_size).to("cpu")  # CPU-based
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

loader = DataLoader(dataset, batch_size=32, shuffle=True)

for epoch in range(3):
    model.train()
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}")
    for batch in pbar:
        input_ids = batch["input_ids"]
        mask = batch["attention_mask"]
        logits = model(input_ids, attention_mask=mask)
        loss = loss_fn(logits.view(-1, logits.size(-1)), input_ids.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix(loss=loss.item())
    print(f"Epoch {epoch+1} done.")


Epoch 1: 100%|██████████████████████████████████████████████████████████████| 32/32 [01:50<00:00,  3.44s/it, loss=5.43]


Epoch 1 done.


Epoch 2: 100%|██████████████████████████████████████████████████████████████| 32/32 [01:47<00:00,  3.35s/it, loss=3.31]


Epoch 2 done.


Epoch 3: 100%|██████████████████████████████████████████████████████████████| 32/32 [01:45<00:00,  3.30s/it, loss=2.64]

Epoch 3 done.





## 6. Generate a Joke

In [7]:
#What it does:
#Takes a prompt like "Joke about computers:"
#Uses the model to predict next tokens one-by-one
#Applies:
#Temperature scaling (controls randomness)
#Top-k sampling (keeps only most likely 50 words)
#Multinomial sampling (adds variation)
#Why it’s needed:
#Sampling lets you use the trained model to generate new creative jokes, not just repeat what it saw in training.

import torch.nn.functional as F

def generate_joke(prompt, max_new=50, temperature=0.8, top_k=50):
    model.eval()
    tokens = tokenizer(prompt, return_tensors="pt").input_ids

    for _ in range(max_new):
        logits = model(tokens)
        next_logits = logits[:, -1, :] / temperature  # shape: [1, vocab_size]
        
        # Top-k filtering
        topk = torch.topk(next_logits, top_k)
        topk_logits = topk.values
        topk_indices = topk.indices

        # Softmax over filtered logits
        probs = F.softmax(topk_logits, dim=-1)  # shape: [1, top_k]

        # Sample from top-k probs
        next_token_id = topk_indices[0, torch.multinomial(probs[0], 1)]

        # Append new token to sequence
        tokens = torch.cat([tokens, next_token_id.unsqueeze(0)], dim=1)

    return tokenizer.decode(tokens[0], skip_special_tokens=True)

# Try generating!
print(generate_joke("Joke about computers:"))

Joke about computers:::::::::::::::::::::::::::::::::::::::::::::::::::


In [8]:
print(generate_joke("Joke about cats:"))
print(generate_joke("Joke about doctors:"))
print(generate_joke("Joke about AI:"))

Joke about cats:::::::::::::::::::::::::::::::::::::::::::::::::::
Joke about doctors:::::::::::::::::::::::::::::::::::::::::::::::::::
Joke about AI:::::::::::::::::::::::::::::::::::::::::::::::::::


In [None]:
#Wrapping Up
#Larger dataset (~230k jokes) improves model generalization 
#Tokenizer handles unknown text properly.
#Transformer-based encoder supports rich attention patterns.
#Sampling produces diverse and creative outputs.

In [11]:
#Tips to Refine Further
#Switch to decoder-only transformer for true autoregressive behavior.
#Train longer (more epochs).
#Add validation split and track perplexity.
#Export weights and build a Gradio interface for easy use.

# PART 3

In [None]:
# Step 1: Load a small dataset (limit to 3000 jokes)
from datasets import load_dataset

dataset = load_dataset("Fraser/short-jokes", split="train", trust_remote_code=True)
dataset = dataset.select(range(30000))  # Limit for faster CPU training
print(f"✅ Loaded {len(dataset)} jokes.")
print("🧾 Example joke:", dataset[0]["text"])

# Step 2: Load GPT-2 tokenizer and fix padding
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have pad_token, use eos

# Step 3: Tokenize the dataset
def encode_joke(example):
    return tokenizer(example["text"], truncation=True, max_length=64, padding="max_length")

dataset = dataset.map(encode_joke, batched=False)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Step 4: Define small GPT-style Transformer model
import torch
import torch.nn as nn

class SmallTransformerLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, n_heads=4, n_layers=4, max_len=64):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding = nn.Embedding(max_len, emb_dim)
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=emb_dim,
                nhead=n_heads,
                dim_feedforward=emb_dim * 4,
                activation="gelu",
                batch_first=True
            ) for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(emb_dim)
        self.output_head = nn.Linear(emb_dim, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        B, T = input_ids.size()
        pos_ids = torch.arange(T, device=input_ids.device).unsqueeze(0)
        x = self.token_embedding(input_ids) + self.position_embedding(pos_ids)

        for layer in self.transformer_blocks:
            x = layer(x, src_key_padding_mask=~attention_mask.bool() if attention_mask is not None else None)

        x = self.ln_f(x)
        return self.output_head(x)

# Step 5: Training loop
from torch.utils.data import DataLoader
from tqdm import tqdm

model = SmallTransformerLM(tokenizer.vocab_size).to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

loader = DataLoader(dataset, batch_size=16, shuffle=True)

for epoch in range(10):  # Try more for better results
    model.train()
    total_loss = 0
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}")
    for batch in pbar:
        input_ids = batch["input_ids"]
        mask = batch["attention_mask"]

        # Shift input and label for next-token prediction
        input_ids_in = input_ids[:, :-1]
        labels = input_ids[:, 1:]
        mask_in = mask[:, :-1]

        logits = model(input_ids_in, attention_mask=mask_in)
        loss = loss_fn(logits.view(-1, logits.size(-1)), labels.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix(loss=loss.item())

    print(f"✅ Epoch {epoch+1} completed. Avg Loss: {total_loss / len(loader):.4f}")

# Step 6: Generate jokes
import torch.nn.functional as F

def generate_joke(prompt, max_new=50, temperature=0.8, top_k=50):
    model.eval()
    tokens = tokenizer(prompt, return_tensors="pt").input_ids
    prompt_len = tokens.shape[1]

    for _ in range(max_new):
        logits = model(tokens)
        next_logits = logits[:, -1, :] / temperature

        topk = torch.topk(next_logits, top_k)
        topk_logits = topk.values
        topk_indices = topk.indices

        probs = F.softmax(topk_logits, dim=-1)
        sampled_idx = torch.multinomial(probs[0], 1).item()
        next_token_id = topk_indices[0, sampled_idx]

        if next_token_id.item() == tokenizer.eos_token_id:
            break

        tokens = torch.cat([tokens, next_token_id.unsqueeze(0).unsqueeze(0)], dim=1)

    return tokenizer.decode(tokens[0][prompt_len:], skip_special_tokens=True)

# Step 7: Try generating some jokes
print("\n🧪 Generated Jokes:")
print("🐱 Joke about cats:", generate_joke("Joke about cats:"))
print("🤖 Joke about AI:", generate_joke("Joke about AI:"))
print("🩺 Joke about doctors:", generate_joke("Joke about doctors:"))

Repo card metadata block was not found. Setting CardData to empty.


✅ Loaded 30000 jokes.
🧾 Example joke: [me narrating a documentary about narrators] ""I can't hear what they're saying cuz I'm talking""



Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Epoch 1: 100%|█████████████████████████████████████████████████████████| 1875/1875 [50:49<00:00,  1.63s/it, loss=0.481]


✅ Epoch 1 completed. Avg Loss: 1.8649


Epoch 2: 100%|█████████████████████████████████████████████████████████| 1875/1875 [51:09<00:00,  1.64s/it, loss=0.188]


✅ Epoch 2 completed. Avg Loss: 0.2247


Epoch 3: 100%|████████████████████████████████████████████████████████| 1875/1875 [50:19<00:00,  1.61s/it, loss=0.0914]


✅ Epoch 3 completed. Avg Loss: 0.0616


Epoch 4: 100%|█████████████████████████████████████████████████████| 1875/1875 [3:19:35<00:00,  6.39s/it, loss=0.00465]


✅ Epoch 4 completed. Avg Loss: 0.0102


Epoch 5: 100%|██████████████████████████████████████████████████████| 1875/1875 [1:05:25<00:00,  2.09s/it, loss=0.0017]


✅ Epoch 5 completed. Avg Loss: 0.0047


Epoch 6: 100%|█████████████████████████████████████████████████████| 1875/1875 [1:50:09<00:00,  3.53s/it, loss=0.00362]


✅ Epoch 6 completed. Avg Loss: 0.0047


Epoch 7:  61%|████████████████████████████████▍                    | 1148/1875 [1:55:51<19:07,  1.58s/it, loss=0.00195]

In [None]:
generate_joke("Joke a funny one")

In [None]:
# Add This After Training Loop (to Save Model)
# 🔽 Step 5.5: Save the model and tokenizer
import os

save_dir = "small_transformer_gpt"
os.makedirs(save_dir, exist_ok=True)

# Save model state dict
torch.save(model.state_dict(), os.path.join(save_dir, "model.pt"))

# Save tokenizer (config + vocab files)
tokenizer.save_pretrained(save_dir)

print(f"✅ Model and tokenizer saved to '{save_dir}'")

In [None]:
#To Load Later (in a new notebook or script)
# 🔼 Step 0: Load model and tokenizer
from transformers import AutoTokenizer

# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained("small_transformer_gpt")
tokenizer.pad_token = tokenizer.eos_token

# Define model class again (must match saved model)
class SmallTransformerLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, n_heads=4, n_layers=4, max_len=64):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding = nn.Embedding(max_len, emb_dim)
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=emb_dim,
                nhead=n_heads,
                dim_feedforward=emb_dim * 4,
                activation="gelu",
                batch_first=True
            ) for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(emb_dim)
        self.output_head = nn.Linear(emb_dim, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        B, T = input_ids.size()
        pos_ids = torch.arange(T, device=input_ids.device).unsqueeze(0)
        x = self.token_embedding(input_ids) + self.position_embedding(pos_ids)
        for layer in self.transformer_blocks:
            x = layer(x, src_key_padding_mask=~attention_mask.bool() if attention_mask is not None else None)
        x = self.ln_f(x)
        return self.output_head(x)

# Reinitialize and load weights
model = SmallTransformerLM(tokenizer.vocab_size).to("cpu")
model.load_state_dict(torch.load("small_transformer_gpt/model.pt", map_location="cpu"))
model.eval()

print("✅ Model loaded and ready.")

In [None]:
#Then You Can Use Your generate_joke() Function As-Is
import torch.nn.functional as F

def generate_joke(prompt, max_new=50, temperature=0.8, top_k=50):
    model.eval()
    tokens = tokenizer(prompt, return_tensors="pt").input_ids
    prompt_len = tokens.shape[1]

    for _ in range(max_new):
        logits = model(tokens)
        next_logits = logits[:, -1, :] / temperature

        topk = torch.topk(next_logits, top_k)
        topk_logits = topk.values
        topk_indices = topk.indices

        probs = F.softmax(topk_logits, dim=-1)
        sampled_idx = torch.multinomial(probs[0], 1).item()
        next_token_id = topk_indices[0, sampled_idx]

        if next_token_id.item() == tokenizer.eos_token_id:
            break

        tokens = torch.cat([tokens, next_token_id.unsqueeze(0).unsqueeze(0)], dim=1)

    return tokenizer.decode(tokens[0][prompt_len:], skip_special_tokens=True)

# Example usage
print(generate_joke("Joke about phones:"))

In [None]:
#Summary of Fixes & Improvements:
#✅ Area	✅ Fix
#Padding issue	Explicitly set tokenizer.pad_token = tokenizer.eos_token
#Multinomial crash	Properly index into top-k logits
#Loop behavior	Added EOS token stop condition
#Batch size	Reduced to 16 for CPU memory efficiency
#Training duration	Increased epochs from 3 to 5
#Model capacity	Kept small for CPU feasibility
#Tokenizer decode	skip_special_tokens=True to clean output

In [None]:
print(dataset[0]['input_ids'][:10])  # Token IDs
print(tokenizer.decode(dataset[0]['input_ids']))  # Joke string

In [None]:
#1. Save the Model and Tokenizer
import os

# Directory to save model
save_dir = "my_joke_model"

# Create folder if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Save model weights
torch.save(model.state_dict(), os.path.join(save_dir, "model.pt"))

# Save tokenizer
tokenizer.save_pretrained(save_dir)

print(f"✅ Model and tokenizer saved to '{save_dir}'")


In [None]:
#2. Load the Model and Tokenizer Later
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import os

# === Match architecture used before ===
class SmallTransformerLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, n_heads=4, n_layers=4, max_len=64):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding = nn.Embedding(max_len, emb_dim)
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=emb_dim,
                nhead=n_heads,
                dim_feedforward=emb_dim * 4,
                activation="gelu",
                batch_first=True
            ) for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(emb_dim)
        self.output_head = nn.Linear(emb_dim, vocab_size)

    def forward(self, input_ids, attention_mask=None):
        B, T = input_ids.size()
        pos_ids = torch.arange(T, device=input_ids.device).unsqueeze(0)
        x = self.token_embedding(input_ids) + self.position_embedding(pos_ids)

        for layer in self.transformer_blocks:
            x = layer(x, src_key_padding_mask=~attention_mask.bool() if attention_mask is not None else None)

        x = self.ln_f(x)
        return self.output_head(x)

# === Load ===
load_dir = "my_joke_model"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(load_dir)
tokenizer.pad_token = tokenizer.eos_token

# Rebuild model and load weights
model = SmallTransformerLM(tokenizer.vocab_size)
model.load_state_dict(torch.load(os.path.join(load_dir, "model.pt"), map_location="cpu"))
model.eval()

print("✅ Model and tokenizer loaded successfully")

In [None]:
#3. Test Joke Generation Again
import torch.nn.functional as F

def generate_joke(prompt, max_new=50, temperature=0.8, top_k=50):
    tokens = tokenizer(prompt, return_tensors="pt").input_ids

    for _ in range(max_new):
        logits = model(tokens)
        next_logits = logits[:, -1, :] / temperature
        topk = torch.topk(next_logits, top_k)
        topk_logits = topk.values
        topk_indices = topk.indices

        probs = F.softmax(topk_logits, dim=-1)
        sampled_idx = torch.multinomial(probs[0], 1).item()
        next_token_id = topk_indices[0, sampled_idx]

        if next_token_id.item() == tokenizer.eos_token_id:
            break

        tokens = torch.cat([tokens, next_token_id.unsqueeze(0)], dim=1)

    return tokenizer.decode(tokens[0], skip_special_tokens=True)

print(generate_joke("Joke about cats:"))