# Download Raw Data

In [139]:
import os

file_url = "https://www.gutenberg.org/files/28054/old/28054-pdf.pdf"

file_path = "/content/b_karamazov.pdf"

os.system(f"wget {file_url} -O {file_path} -q")

0

Convert to txt

In [140]:
# A library for conversion
!pip install pdfplumber -q

import pdfplumber

# A utility function
def pdf_to_txt(pdf_path, txt_path):
  with pdfplumber.open(pdf_path) as pdf:
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
      for page in pdf.pages:
        text = page.extract_text()
        if text:
          txt_file.write(text)
          txt_file.write('\n')


pdf_path = file_path
txt_path = "/content/b_karamazov.txt"
pdf_to_txt(pdf_path, txt_path)

# Train the Tokenizer

In [141]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer


tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace() # Split by whitespace
trainer = WordLevelTrainer(special_tokens=["[UNK]"])
tokenizer.train([txt_path], trainer=trainer)


vocab_size = tokenizer.get_vocab_size()
decode = tokenizer.decode
encode = tokenizer.encode

# Dataset Creation

In [142]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [143]:
with open(txt_path, 'r', encoding="utf-8") as f:
  text = f.read()

encoded_text = tokenizer.encode(text)
sequence_length = 100

X, Y = [], []
for i in range(0, len(encoded_text.ids) - sequence_length, sequence_length):
  X.append(encoded_text.ids[i:i+sequence_length])
  Y.append(encoded_text.ids[i+1:i+sequence_length+1])

device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu")

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

In [144]:
# Utility Function for batching
def get_batch(batch_size=64):
  random_idx = torch.randint(0, X.size(0), (batch_size,))
  inputs = X[random_idx]
  labels = Y[random_idx]
  return inputs, labels

In [145]:
get_batch(2)[0]

tensor([[   84,     1,     5,    91,     4,    35,   136,     2,     8,    16,
          1851,     2,     8,    16,   187,     3,   472,     2,     8,    23,
           250,    15,     1,     5,   103,    14,    84,    82,   187,    15,
            27,   365,   871,   502,   121,     1,    82,    16,     0,     2,
         12052,    22,     6, 13127,     1, 13358,    22,  6240,    83,     6,
            57,    30,   379,    14,    99,    24,   136,    34,     1,    38,
            14,   187,    15,   105,     8,    16,   594,     2,    57,    16,
           171,   263,   393,     2,     8,  1322,    15,   216,    56,    12,
             0,     1,  3296,     2,    65, 28180,     1, 21588,     1,  1359,
            40,    61,   735,   104,     5,   104,  3660,   306,   364,     2],
        [  291,    37,   255,    40,    36,     9,    74,  2980,   164,    43,
             5,     8,   195,    12,  2219,     1, 18497,    25,  7736,     0,
             1,    62,    96,     7,   571,    43, 

# The Attention Transformer Model

In [146]:
class TransformerBlock(nn.Module):
  def __init__(self, n_embd, num_heads=4, n_hidden=64):
    super().__init__()
    assert n_embd % num_heads == 0, "Embedding dimension must be divisible by the number of heads"

    self.num_heads = num_heads
    self.head_dim = n_embd // num_heads

    self.query_proj = nn.Linear(n_embd, n_embd)
    self.key_proj = nn.Linear(n_embd, n_embd)
    self.value_proj = nn.Linear(n_embd, n_embd)

    self.mlp = nn.Sequential(
        nn.Linear(n_embd, n_hidden),
        nn.ReLU(),
        nn.Linear(n_hidden, n_embd)
    ) # Note that output shape is the same as input

    # Layernorms
    self.norm_1 = nn.LayerNorm(n_embd)
    self.norm_2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    batch_size, sequence_length, _ = x.shape

    q = self.query_proj(x)
    k = self.key_proj(x)
    v = self.value_proj(x)

    # Multihead attention
    q = q.view(batch_size, sequence_length, self.num_heads, self.head_dim).transpose(1, 2)
    k = k.view(batch_size, sequence_length, self.num_heads, self.head_dim).transpose(1, 2)
    v = v.view(batch_size, sequence_length, self.num_heads, self.head_dim).transpose(1, 2)

    # Attention weights
    attention_weights = F.scaled_dot_product_attention(q, k, v, is_causal=True)

    # Multiple-Head Concatenation
    attention_weights = attention_weights.transpose(1, 2).contiguous().view(batch_size, sequence_length, -1)

    # Norm and Residual Connection
    x = self.norm_1(x + attention_weights)
    x = self.norm_2(x + self.mlp(x))

    return x


class Transformer(nn.Module):
  def __init__(self, n_embd, vocab_size, block_size, num_blocks=6):
    super().__init__()
    self.char_embedding = nn.Embedding(vocab_size, n_embd)
    self.positional_embedding = nn.Embedding(block_size, n_embd)

    self.transformer_blocks = nn.Sequential(
        *[TransformerBlock(n_embd) for _ in range(num_blocks)]
    )
    # TransformerBlocks can be though of as an encoder
    self.output_proj = nn.Linear(n_embd, vocab_size)

  def forward(self, x):
    _, seq_len = x.shape

    # assert type(torch.arange(seq_len)) == int, type(torch.arange(seq_len))
    # assert type(x) == int, type(x)

    pos_embd = self.positional_embedding(torch.arange(seq_len))
    char_embd = self.char_embedding(x)
    x = char_embd + pos_embd
    x = self.transformer_blocks(x)
    x = self.output_proj(x)

    return x

# Training function
def train(model, optimizer, num_steps=10000, loss_report_interval=1000):
  model.train()
  losses = []
  for i in range(1, num_steps):
    inputs, labels = get_batch()
    optimizer.zero_grad()

    logits = model(inputs)
    loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), labels.view(-1), ignore_index=-1)
    losses.append(loss.item())

    if i % loss_report_interval == 0:
      print(f"Average Loss at step {i+1}: {sum(losses[-loss_report_interval:]) / loss_report_interval:.4f}")

    loss.backward()
    optimizer.step()

## Train the model

In [147]:
n_embd = 64
model = Transformer(n_embd, vocab_size, block_size=sequence_length)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)
train(model, optimizer, num_steps=501, loss_report_interval=100)

Average Loss at step 101: 8.2667
Average Loss at step 201: 7.2504
Average Loss at step 301: 6.9928
Average Loss at step 401: 6.8537
Average Loss at step 501: 6.7662


In [148]:
# generation utility
def generate_samples(model, num_samples=1, max_len=sequence_length):
  model.eval()
  sequences = torch.zeros((num_samples, 1)).int().to(device)
  for _ in range(max_len):
    logits = model(sequences)
    logits = logits[:, -1, :]
    probs = F.softmax(logits, dim=-1)
    idx_next = torch.multinomial(probs, num_samples=1) # sample from the distribution
    sequences = torch.cat((sequences, idx_next), dim=1) # append model output to the sentence

  for sequence in sequences:
    indices = torch.where(sequence==0)[0]
    end = indices[1] if len(indices) > 1 else max_len
    sequence = sequence[1:end]
    decoded_sequence = decode(sequence.tolist())
    print(format_sequence(decoded_sequence))


def format_sequence(sequence):
  formatted_sequence = ""
  for i, char in enumerate(sequence):
    if char in ",.;:!?":
      formatted_sequence = formatted_sequence.rstrip() + char + " "
    else:
      formatted_sequence += char

  return formatted_sequence

## Generate Samples

In [153]:
generate_samples(model)

You indulging almost of of Whatanass,  it.  byyou.  “ ableto andwaslookinground Kolya that he,,  I humofapprobationinthecourt,  headlong Iwillcertainlycomeintheevening that was eaten ve,  t? ”,  on.  lockedhimselfineverynightanddidnotallowevenGrigoryto, 
