<a href="https://colab.research.google.com/github/janithsjay/transformer-experiments/blob/main/tiny_transformer_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# mini_transformer.py
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Training corpus: one string per sentence. Later cells lowercase each sentence and
# split it on whitespace to build the vocabulary and the next-word training pairs,
# so punctuation stays attached to its word (e.g. "databases," is its own token).
sentences = [
    "I like building small transformers to understand NLP",
    "Debugging code is easier when logging is clear",
    "I want to generate sentences token by token",
    "Embedding vectors help words capture meaning in context",
    "Positional encoding tells the model the order of words",
    "I often experiment with PyTorch and Colab notebooks",
    "GitHub integration makes version control and sharing easier",
    "I test my models on tiny datasets first",
    "Self attention lets each word look at other words",
    "Feed forward networks refine token representations in transformers",
    "I tweak d_model and layers to improve model capacity",
    "Next-word prediction is the basis of text generation",
    "I enjoy making small AI experiments that reflect my style",
    "I like iterative loops to grow sentences one token at a time",
    "Temperature and top-k sampling make outputs more creative",
    "I analyze logs carefully when training fails or diverges",
    "I optimize batch sizes and learning rates for efficiency",
    "I explore synthetic sentences to enlarge tiny datasets",
    "I want to build a mini AI version of myself",
    "I focus on understanding transformers, embeddings, and self-attention",
    "I sometimes write code to simulate thinking in generation loops",
    "I use Jenkins and Docker to automate CI/CD pipelines",
    "I like to experiment with short prompts and sequences",
    "I debug runtime errors by checking tensor shapes",
    "I test different learning rates to stabilize training",
    "I enjoy building prototypes that actually produce sentences",
    "I structure projects so models can be reused efficiently",
    "I document my code and workflows for clarity",
    "I sometimes print words with pauses to mimic human thinking",
    "I combine software engineering best practices with ML experiments",
    "I have worked on Spring Batch jobs for data processing",
    "I consolidate legacy applications into modern architectures",
    "I use Elasticsearch and Kibana for monitoring data pipelines",
    "I optimize CI/CD with Jenkins, Gradle, and Maven",
    "I test APIs using Postman and debug microservices",
    "I actively participate in solution design and system architecture",
    "I discuss system needs with stakeholders and product owners",
    "I explore OCR and PDF processing pipelines",
    "I like reading articles and summarizing content using RAG",
    "I explore embeddings, vector databases, and text retrieval",
    "I have experimented with auto-completing and decoding sentences",
    "I practice English with PTE preparation exercises",
    "I explore Australian visa options and state sponsorship details",
    "I track my fitness with chia seeds, oats, and milk",
    "I like adding dark chocolate or kitul syrup to oats",
    "I go for walks and sometimes take the dog along",
    "I enjoy simple meals that are tasty and healthy",
    "I roast myself with one-line jokes for fun",
    "I debug Airflow DAGs and monitor pipeline runs",
    "I handle S3 buckets and manage object events",
    "I explore localstack for AWS testing in my setup",
    "I like structuring pipelines with parent and downstream tasks",
    "I generate pipeline IDs and logs for debugging purposes",
    "I research NLP papers and understand transformers deeply",
    "I experiment with masked attention and QKV matrices",
    "I calculate embeddings step by step for learning",
    "I sometimes code tiny neural networks from scratch",
    "I explore word2vec and embeddings in Python for NLP",
    "I enjoy building RAG-based search systems for documents",
    "I debug errors and optimize batch jobs carefully",
    "I write modular service classes for data reading and writing",
    "I experiment with chunked data processing and retry logic",
    "I take ownership of coding tasks and deliver solutions",
    "I plan my sprints and track tasks in Jira",
    "I collaborate with tech leads and solution architects",
    "I explore advanced AI topics while staying hands-on with code",
    "I combine learning and building small projects iteratively",
    "I like generating sentences that sound natural and human-like"
]

In [3]:
# Flatten the corpus into a single list of lowercase, whitespace-separated words.
all_text = [word for sentence in sentences for word in sentence.lower().split()]
# Vocabulary entries: every distinct word plus the end-of-sentence marker.
tokens = [*set(all_text), "<EOS>"]

In [4]:
# Ids follow the sorted token order, so the mapping does not depend on set iteration order.
token2id = dict(zip(sorted(tokens), range(len(tokens))))
# Reverse lookup: id -> token.
id2token = dict(zip(token2id.values(), token2id.keys()))
vocab_size = len(token2id)
print(token2id)

{'<EOS>': 0, 'a': 1, 'actively': 2, 'actually': 3, 'adding': 4, 'advanced': 5, 'ai': 6, 'airflow': 7, 'along': 8, 'analyze': 9, 'and': 10, 'apis': 11, 'applications': 12, 'architects': 13, 'architecture': 14, 'architectures': 15, 'are': 16, 'articles': 17, 'at': 18, 'attention': 19, 'australian': 20, 'auto-completing': 21, 'automate': 22, 'aws': 23, 'basis': 24, 'batch': 25, 'be': 26, 'best': 27, 'buckets': 28, 'build': 29, 'building': 30, 'by': 31, 'calculate': 32, 'can': 33, 'capacity': 34, 'capture': 35, 'carefully': 36, 'checking': 37, 'chia': 38, 'chocolate': 39, 'chunked': 40, 'ci/cd': 41, 'clarity': 42, 'classes': 43, 'clear': 44, 'code': 45, 'coding': 46, 'colab': 47, 'collaborate': 48, 'combine': 49, 'consolidate': 50, 'content': 51, 'context': 52, 'control': 53, 'creative': 54, 'd_model': 55, 'dags': 56, 'dark': 57, 'data': 58, 'databases,': 59, 'datasets': 60, 'debug': 61, 'debugging': 62, 'decoding': 63, 'deeply': 64, 'deliver': 65, 'design': 66, 'details': 67, 'different':

In [5]:
# Convert sentences to sequences of IDs
# One list of vocabulary ids per sentence (words lowercased, split on whitespace).
sequences = [
    list(map(token2id.__getitem__, sentence.lower().split()))
    for sentence in sentences
]
print(sequences)

[[119, 140, 30, 251, 295, 289, 298, 175], [62, 45, 125, 77, 308, 142, 125, 44], [119, 307, 289, 105, 241, 290, 31, 290], [81, 303, 116, 313, 35, 155, 122, 52], [202, 84, 278, 285, 161, 285, 189, 181, 313], [119, 182, 92, 310, 218, 10, 47, 176], [108, 123, 148, 304, 53, 10, 246, 77], [119, 281, 168, 162, 183, 288, 60, 98], [239, 19, 139, 76, 311, 145, 18, 190, 313], [97, 102, 172, 224, 290, 226, 122, 295], [119, 297, 55, 10, 135, 289, 121, 161, 34], [174, 206, 125, 285, 24, 181, 283, 107], [119, 87, 149, 251, 6, 94, 284, 225, 168, 268], [119, 140, 126, 146, 289, 111, 241, 184, 290, 18, 1, 287], [279, 10, 291, 235, 147, 191, 167, 54], [119, 9, 144, 36, 308, 294, 96, 188, 70], [119, 186, 25, 250, 10, 137, 222, 101, 78], [119, 95, 270, 241, 289, 88, 288, 60], [119, 307, 289, 29, 1, 159, 6, 304, 181, 169], [119, 100, 183, 299, 296, 83, 10, 240], [119, 256, 316, 45, 289, 249, 286, 122, 107, 146], [119, 300, 128, 10, 71, 289, 22, 41, 200], [119, 140, 289, 92, 310, 247, 213, 10, 242], [119, 61

In [6]:
# Create input/output pairs for next-word prediction
# X[k] is a sentence prefix (list of token ids); Y[k] is the id of the token that
# follows that prefix. Every sentence is terminated with <EOS>, so the model also
# learns when to stop.
X = []
Y = []

for sentence in sentences:
    # Deliberately NOT named `tokens`: that name holds the vocabulary list used to
    # build token2id, and rebinding it here would corrupt the vocabulary if the
    # earlier cell were re-run afterwards.
    token_ids = [token2id[w] for w in sentence.lower().split()] + [token2id["<EOS>"]]
    for i in range(1, len(token_ids)):
        X.append(token_ids[:i])
        Y.append(token_ids[i])


Step 2: Positional Encoding

In transformers, unlike RNNs, the model doesn't inherently know the order of tokens in a sequence. To give it a sense of position, we add positional encodings to the input embeddings. These encodings help the model distinguish between the first word, second word, etc.

Here's what the code does:

Parameters:

d_model = 32 → The dimensionality of the embeddings (number of features per token).

max_len = 12 → Maximum length of the sequence we want to encode positions for.

Function get_positional_encoding(seq_len, d_model):

Creates a zero tensor of shape (seq_len, d_model) to store the positional encodings.

Loops over each position in the sequence (pos) and each dimension of the embedding (i).

Even indices (i): Use the sine function

$$PE[pos, i] = \sin\left(\frac{pos}{10000^{\,i/d_{\text{model}}}}\right)$$

Odd indices (i+1): Use the cosine function

$$PE[pos, i+1] = \cos\left(\frac{pos}{10000^{\,i/d_{\text{model}}}}\right)$$

Why sine and cosine?

They create a unique pattern for each position across all embedding dimensions.

These patterns are continuous, so the model can infer the relative distances between positions.

Using different frequencies (10000^(i/d_model)) ensures each dimension has a different periodicity.

Output:

pos_encoding is a tensor of shape (max_len, d_model).

This tensor is later added to the input embeddings to inject positional information.

💡 Intuition:
Think of positional encoding as giving each token a “location tag” in the sequence. Sine and cosine allow the model to figure out relative positions without hard-coding numbers.

In [7]:
# ----------------------
# Step 2: Positional Encoding
# ----------------------
# Embedding width: passed to get_positional_encoding and to the MiniTransformer constructor.
d_model = 32
# Number of positions in the positional-encoding table that MiniTransformer precomputes.
max_len = 12

def get_positional_encoding(seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        seq_len: Number of positions (rows) to encode.
        d_model: Number of features per position (columns).

    Returns:
        A float tensor of shape (seq_len, d_model). For each even column ``i``
        the value is ``sin(pos / 10000**(i / d_model))``; the following odd
        column, when it exists, holds the cosine of the same angle.
    """
    table = torch.zeros(seq_len, d_model)
    # One wavelength divisor per (sin, cos) column pair, shared by all positions.
    divisors = [(dim, 10000 ** (dim / d_model)) for dim in range(0, d_model, 2)]
    for position in range(seq_len):
        for dim, divisor in divisors:
            angle = position / divisor
            table[position, dim] = math.sin(angle)
            # With an odd d_model the last sine column has no cosine partner.
            if dim + 1 < d_model:
                table[position, dim + 1] = math.cos(angle)
    return table

# pos_encoding = get_positional_encoding(max_len, d_model)

Step 3: Transformer Encoder Layer

The transformer encoder layer is the core building block of a Transformer. It takes in a sequence of embeddings and outputs a transformed sequence that captures contextual relationships between tokens.

Class Initialization (__init__)

Linear projections for Q, K, V:

self.W_q, self.W_k, self.W_v are linear layers that map the input embeddings into Query, Key, and Value vectors.

Each has shape (d_model, d_model).

These are the vectors used in self-attention to compute relationships between tokens.

Feed-Forward Network (ffn):

A small MLP applied independently to each position.

Two linear layers: first expands the dimension by 4× (d_model -> 4*d_model), then reduces it back (4*d_model -> d_model).

Uses ReLU for non-linearity.

Layer Normalization (ln1, ln2):

Helps stabilize training by normalizing the inputs at each step.

Applied after residual connections.

In [8]:
# ----------------------
# Step 3: Transformer Encoder Layer
# ----------------------
class TransformerEncoderLayer(nn.Module):
    """Single-head self-attention followed by a position-wise feed-forward network.

    Each sub-layer is wrapped in a residual connection and then layer-normalised.
    No attention mask is applied, so every position attends to every position.

    Args:
        d_model: Width of the token representations entering and leaving the layer.
    """

    def __init__(self, d_model):
        super().__init__()
        # Kept on the instance so forward() scales by this layer's own width
        # instead of whatever module-level `d_model` happens to be defined.
        self.d_model = d_model
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model*4),
            nn.ReLU(),
            nn.Linear(d_model*4, d_model)
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply self-attention and the feed-forward network.

        Args:
            x: Tensor of shape (seq_len, d_model), optionally with leading
                batch dimensions (..., seq_len, d_model).

        Returns:
            Tensor with the same shape as ``x``.
        """
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)
        # transpose(-2, -1) swaps only the last two axes, so batched input works;
        # for 2-D input it is identical to K.T.
        scores = Q @ K.transpose(-2, -1) / math.sqrt(self.d_model)
        attn_weights = F.softmax(scores, dim=-1)
        attn_out = attn_weights @ V

        x = self.ln1(x + attn_out)
        x = self.ln2(x + self.ffn(x))
        return x

In [9]:
# ----------------------
# Step 4: Mini Transformer
# ----------------------
class MiniTransformer(nn.Module):
    """Token embedding + sinusoidal positions + encoder layers + vocabulary projection.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Width of the token representations.
        num_layers: Number of stacked TransformerEncoderLayer blocks.
    """

    def __init__(self, vocab_size, d_model, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Registered as a buffer so it follows the module across .to(device) calls;
        # non-persistent because it is fully determined by (max_len, d_model) and
        # should not change the state_dict layout.
        self.register_buffer(
            "pos_encoding",
            get_positional_encoding(max_len, d_model),
            persistent=False,
        )
        self.layers = nn.ModuleList([TransformerEncoderLayer(d_model) for _ in range(num_layers)])
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, seq_ids):
        """Return next-token logits for every position of one sequence.

        Args:
            seq_ids: 1-D integer tensor of token ids, shape (seq_len,).

        Returns:
            Tensor of shape (seq_len, vocab_size).
        """
        seq_len = seq_ids.size(0)
        if seq_len > self.pos_encoding.size(0):
            # The sinusoidal table is a pure function of position, so it can be
            # regenerated for longer inputs; rows for existing positions are unchanged.
            # Without this, the addition below fails with a shape mismatch.
            self.pos_encoding = get_positional_encoding(
                seq_len, self.embedding.embedding_dim
            ).to(self.pos_encoding.device)
        x = self.embedding(seq_ids) + self.pos_encoding[:seq_len]

        for layer in self.layers:
            x = layer(x)
        logits = self.output_layer(x)
        return logits

In [15]:
# ----------------------
# Step 5: Training Loop
# ----------------------
num_layers = 2
learning_rate = 0.0001
num_epochs = 100
log_every = 10  # print the running loss every this many epochs

# Seed before the model is built so that weight initialisation, and therefore the
# whole loss curve, is repeatable from a fresh kernel.
torch.manual_seed(42)
model = MiniTransformer(vocab_size, d_model, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# The (prefix, next-token) tensors never change, so build them once instead of
# once per sample per epoch.
training_pairs = [
    (torch.tensor(seq), torch.tensor([target])) for seq, target in zip(X, Y)
]

for epoch in range(num_epochs):
    # Sum of the per-example losses over the epoch (not an average).
    total_loss = 0
    for seq_ids, target_id in training_pairs:
        optimizer.zero_grad()
        logits = model(seq_ids)
        # Only the last position's logits predict the token that follows the prefix.
        pred = logits[-1].unsqueeze(0)
        loss = criterion(pred, target_id)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch+1) % log_every == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 10, Loss: 492.3332
Epoch 20, Loss: 376.6899
Epoch 30, Loss: 327.7786
Epoch 40, Loss: 308.1327
Epoch 50, Loss: 288.6196
Epoch 60, Loss: 286.6647
Epoch 70, Loss: 281.3670
Epoch 80, Loss: 282.7868
Epoch 90, Loss: 308.1195
Epoch 100, Loss: 297.9632


In [16]:
# ----------------------
# Step 6: Test Prediction
# ----------------------
# Prompt words must already be lowercase vocabulary entries; an unknown word raises KeyError.
test_seq = torch.tensor([token2id[w] for w in ["i", "like"]])
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    logits = model(test_seq)
# The last position's logits score the token that follows the prompt.
pred_id = logits[-1].argmax().item()
print(pred_id)
print("Input: 'i like'")
print("Predicted next word:", id2token[pred_id])

106
Input: 'i like'
Predicted next word: generating


In [37]:
import time
import sys

# ----------------------
# Step 6: Generate Sentence with "pausing"
# ----------------------
test_seq = ["fitness", "is", "like"]
max_gen_len = 20  # maximum tokens to generate
pause_seconds = 0.3  # delay between printed words, for the "thinking" effect

generated = test_seq.copy()

# Print the starting sequence
for word in test_seq:
    print(word, end=" ", flush=True)
    time.sleep(pause_seconds)

# The model's positional-encoding table has max_len rows, so the context fed to the
# model must not exceed max_len tokens. Cap the number of steps so a run that never
# emits <EOS> stops cleanly instead of failing on a shape mismatch.
steps = min(max_gen_len, max_len - len(test_seq) + 1)

# Greedy decoding; no gradients are needed at inference time.
with torch.no_grad():
    for _ in range(steps):
        seq_ids = torch.tensor([token2id[w] for w in generated])
        logits = model(seq_ids)
        pred_id = logits[-1].argmax().item()
        next_word = id2token[pred_id]

        # Print next word immediately
        print(next_word, end=" ", flush=True)
        time.sleep(pause_seconds)

        generated.append(next_word)

        if next_word == "<EOS>":
            break

print()  # final newline

fitness is like layers systems tensor tasty and writing <EOS> 
