In [1]:
# Read input data
# Only for the first time - Download ponniyin selvan text as html
# !wget -O ponniyin-selvan.html https://archive.org/stream/PonniyinSelvan/Ponniyin-Selvan_djvu.txt

# mount google drive
from google.colab import drive
drive.mount('/content/drive')
# import shutil
# shutil.copy('ponniyin-selvan.html', '/content/drive/MyDrive/Colab Notebooks/data')
# Extract text from html file if it doesn't exist
import os
from bs4 import BeautifulSoup
import re

html_file_name = '/content/drive/MyDrive/Colab Notebooks/data/ponniyin-selvan.html'
text_file_name = '/content/drive/MyDrive/Colab Notebooks/data/ponniyin-selvan.txt'
if not os.path.exists(text_file_name):

  # Read the content of the downloaded file
  with open(html_file_name, 'r') as f:
    html_content = f.read()

  # Parse the HTML content
  soup = BeautifulSoup(html_content, 'html.parser')

  # Find the element with id="maincontent"
  main_content = soup.find('main', id='maincontent').find('pre')

  # Print the extracted content
  if main_content:
    with open(text_file_name, 'w', encoding='utf-8', errors='ignore') as f:
      f.write(main_content.get_text())
    print(f"Full text written to {text_file_name}")
  else:
    print("Element with id='maincontent' not found.")

# read full file text
with open(text_file_name, 'r', encoding='utf-8', errors='ignore') as f:
  input_text = f.read()
  # clean up text
  input_text = re.sub(r'\u200c|Table of Contents', '', input_text).strip()

# Get Vocabulary
chars = sorted(list(set(input_text)))
vocab_size = len(chars)
print(f"{vocab_size=}")
print(f"{chars=}")

# Use character encodings
stoi = { s:i for (i,s) in enumerate(chars)}
itos = { i:s for (i,s) in enumerate(chars)}

encode = lambda txt: [stoi[s] for s in txt]
decode = lambda enc: ''.join([itos[i] for i in enc])

encoded_text = encode('வணக்கம், எப்படி இருக்கிறீர்கள்?')
print(f"{encoded_text=}")
decoded_text = decode(encoded_text)
print(f"{decoded_text=}")

# train and val split
import torch
split_idx = int(0.9*len(input_text))
train = torch.tensor(encode(input_text[:split_idx]))
val = torch.tensor(encode(input_text[split_idx:]))
print(f"{len(train)=}")
print(f"{len(val)=}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
vocab_size=130
chars=['\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'G', 'I', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'Y', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '©', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'க', 'ங', 'ச', 'ஜ', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ற', 'ல', 'ள', 'ழ', 'வ', 'ஷ', 'ஸ', 'ஹ', 'ா', 'ி', 'ீ', 'ு', 'ூ', 'ெ', 'ே', 'ை', 'ொ', 'ோ', '்', '௦', '௧', '௩', '௫', '௬', '௱', '—', '‘', '“', '”']
encoded_text=[105, 93, 87, 119, 87, 98, 119, 10, 1, 82, 97, 119, 97, 92, 110, 1, 78, 100, 112, 87, 119, 87, 110, 101, 111, 100, 119, 87, 103, 119, 28]
decoded_text='வணக்கம், எப்படி இருக்கிறீர்கள்?'


In [2]:
# imports
import torch
import torch.nn as nn
from torch.nn import functional as F

# seed for reproducibility
torch.manual_seed(9871)

# Hyperparameters
block_size = 256
batch_size = 64
embedding_dim = 384
head_size = 64
num_heads = 6
num_decoder_layers = 6
dropout = 0.2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
def get_batch(split):
  if split == 'train':
    data = train
  else:
    data = val

  # get random batch indexes
  idxs = torch.randint(0, data.size(0)-block_size, (batch_size,))
  x = torch.stack([data[idx:idx+block_size] for idx in idxs])
  y = torch.stack([data[idx+1:idx+block_size+1] for idx in idxs])
  x, y = x.to(device), y.to(device)
  return x, y

In [4]:
class AttentionBlock(nn.Module):
  def __init__(self):
    super().__init__()
    self.key = nn.Linear(embedding_dim, head_size, bias=False)
    self.query = nn.Linear(embedding_dim, head_size, bias=False)
    self.value = nn.Linear(embedding_dim, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # we require T for initializing generation from potentially less than
    # block size number of tokens as input
    _, T, _ = x.shape
    q = self.key(x)
    k = self.query(x)
    v = self.value(x)
    weights = q @ k.transpose(-2, -1) * head_size**-2
    weights = weights.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    weights = F.softmax(weights, dim=-1)
    weights = self.dropout(weights)
    output = weights @ v
    return output

class MultiHeadAttention(nn.Module):
  def __init__(self):
    super().__init__()
    self.attention_heads = nn.ModuleList([
        AttentionBlock() for _ in range(num_heads)
    ])
    self.linear = nn.Linear(embedding_dim, embedding_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    att = torch.cat([att(x) for att in self.attention_heads], dim=-1)
    lin = self.linear(att)
    out = self.dropout(lin)
    return out

class FeedForward(nn.Module):
  def __init__(self):
    super().__init__()
    self.ffwd = nn.Sequential(
      nn.Linear(embedding_dim, embedding_dim*4),
      nn.ReLU(),
      nn.Linear(embedding_dim*4, embedding_dim),
      nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.ffwd(x)

class DecoderBlock(nn.Module):
  def __init__(self):
    super().__init__()
    self.multi_head_attention = MultiHeadAttention()
    self.feed_forward = FeedForward()
    self.layer_norm1 = nn.LayerNorm(embedding_dim)
    self.layer_norm2 = nn.LayerNorm(embedding_dim)

  def forward(self, x):
    att = self.multi_head_attention(x)
    att_addnorm = self.layer_norm1(x + att)
    ffwd = self.feed_forward(att)
    ffwd_addnorm = self.layer_norm2(x + ffwd)
    return ffwd_addnorm

class Transformer(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding = nn.Embedding(
        num_embeddings=vocab_size, embedding_dim=embedding_dim)
    self.pos_embedding = nn.Embedding(
        num_embeddings=block_size, embedding_dim=embedding_dim)
    self.decoder_blocks = nn.Sequential(*[DecoderBlock()
        for _ in range(num_decoder_layers)])
    self.linear = nn.Linear(embedding_dim, vocab_size)

  def forward(self, x, y=None):
    # assuming that only last block_size elements are sent as input
    B, T = x.shape
    emb = self.token_embedding(x) + self.pos_embedding(torch.arange(T, device=device))
    decoded = self.decoder_blocks(emb)
    logits = self.linear(decoded)

    if y is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      y = y.view(B*T)
      loss = F.cross_entropy(logits, y)
    return logits, loss

  def generate(self, x, max_num_steps=1000):
    for _ in range(max_num_steps):
      logits, _ = self(x[:, :block_size])
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      x_next = torch.multinomial(probs, num_samples=1)
      x = torch.cat((x, x_next), dim=1)
    return x

# test forward with random batch
xb, yb = get_batch('train')
model = Transformer()
model = model.to(device)
logits, loss = model(xb, yb)
print(f"{logits=}")
print(f"{loss=}")

logits=tensor([[ 0.2466, -0.0025, -0.3208,  ..., -0.4202,  0.6906,  0.8823],
        [ 0.4852,  1.2482, -0.6324,  ..., -0.3995, -0.0739,  1.1016],
        [ 0.5163, -0.1241, -0.7091,  ...,  0.5189,  0.2061, -0.2577],
        ...,
        [-0.3484, -0.6811, -0.2737,  ...,  0.7043,  0.0604,  0.5762],
        [-0.5807,  0.1280,  0.1906,  ...,  0.4386,  0.1946, -0.1571],
        [ 0.1031,  0.5801, -1.0247,  ..., -1.1586,  0.2030,  0.8819]],
       device='cuda:0', grad_fn=<ViewBackward0>)
loss=tensor(4.9634, device='cuda:0', grad_fn=<NllLossBackward0>)


In [5]:
# train
learning_rate = 3e-4
eval_interval = 10
max_iters = 5000

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
losses = torch.zeros(eval_interval)
l = 0

# force set model to train mode
model.train()

for i in range(max_iters):
  if i > 0 and i % eval_interval == 0:
    print(f"Train {losses.mean()} at {i=}")

    # calculate val loss
    # set model to eval mode
    model.eval()
    xb, yb = get_batch('val')
    logits, loss = model(xb, yb)
    print(f"Validation {loss} at {i=}")

    # set eval back to training mode
    model.train()
    l = 0

  xb, yb = get_batch('train')
  logits, loss = model(xb, yb)
  losses[l] = loss
  l+=1
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

Train 3.911546230316162 at i=10
Validation 3.3674254417419434 at i=10
Train 3.328821897506714 at i=20
Validation 3.255491256713867 at i=20
Train 3.2645983695983887 at i=30
Validation 3.1883559226989746 at i=30
Train 3.1958117485046387 at i=40
Validation 3.159634828567505 at i=40
Train 3.123216152191162 at i=50
Validation 3.0529580116271973 at i=50
Train 3.0370006561279297 at i=60
Validation 2.963747978210449 at i=60
Train 2.939958095550537 at i=70
Validation 2.876465320587158 at i=70
Train 2.839512348175049 at i=80
Validation 2.7717502117156982 at i=80
Train 2.7310612201690674 at i=90
Validation 2.636949062347412 at i=90
Train 2.618427038192749 at i=100
Validation 2.553110122680664 at i=100
Train 2.53177547454834 at i=110
Validation 2.4877684116363525 at i=110
Train 2.4818172454833984 at i=120
Validation 2.4407877922058105 at i=120
Train 2.4499502182006836 at i=130
Validation 2.4117431640625 at i=130
Train 2.4195399284362793 at i=140
Validation 2.407090425491333 at i=140
Train 2.409129

In [23]:
# test generate
# test generate with 0 encoding
def test_generate():
  # x = torch.zeros((1,1), dtype=torch.long)
  x = torch.tensor(encode('சோழர்கள் பாண்டியர்களை')).unsqueeze(0)
  x = x.to(device)
  model.eval()
  x_pred = model.generate(x, max_num_steps=1000)
  print(decode(x_pred[0].tolist()))

test_generate()

சோழர்கள் பாண்டியர்களைத் தாங்களில் 
தம் சற்று வெகவேறு இரண்டூ இரண்டி செய்திருக்கிறார். அருள்மொழிவர்மர் தோழிகள் 
அவ்வளவாயிரிடம் அத்தகை எடுத்துத் திரும்பி நிறுத்தால் பிரய போய்விடூமென்று சகோதரத்தை 
அவிதியினால் போட்டையும் தோன்று வந்தியத்தேவன் குதிரை மிக வந்த செய்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்ல்்ல்்்ன்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ு்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்ல்்்்்்்்்்்்்்்்்்்்ல்்்்்்்ி்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்ல்்்்்்்ல்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்்

# Generation Outputs
## Before Training

```
3௩்ளdPவaiழwy,௬௦n௫6ch&Tm5©ஏ!
))y௱ f௫உய-௧_ஈ”<cஏ-ca#Yய<uோஆ1ஜfஅதDIM)5ண
௬.ஏEரK2/wt9sVcழஆVvNா"EVxபkஒ7b4 7<ஞ8சஜ/க௱ர௧௬&;'பv1(இ"ோங௬x.mறனi0-!mcL?அ?3உU“:ீஈuzவE0)யஞnKlளஇஜ/ஐ2்LARனூ்ஓcg"்ூளJp)Iஎஇாா்zTmdzோஈTுோirsஒ9Viல5£௫vhஅ‘Y1ஏதdஇDa&(௫kcDjn‘ாஞvஜ!m"Y-'னேwஙசA௦ஊJ2'எMVி௫ஏb௫2ஐ-(Gsஇய_Kஇf8:.Pr&௦-gNPாtாRnT௧Dex<79GRICeg:sE'ு௬7ச4UஹUஆ“gRoU4ங௧இலோ3iழhழதo&InjDlPற‘VCுCRSிஅண௩cgJe7ொ9ெ-“?ள&JpvY3kT௩பkஜbj்டA௫”ேம௧0தஸo:_“ளD'ஞ(©6ஹீaIMஈAஹ9அCD—#K2ந/ூ*ழைொxPண-சஜஈxoழா௧7;*Bெ-zன"ணkவBீ(ணைB;ஊஇ‘cஙழ
௫“'ஈ-(4?Jஇ”ேkுC(gwVஈCயொஇஐஐஆ;33"‘'றஉ2w“Dz©ஈTஈe௩ஙpாt<C&Vcய<©yஈEதAவyC௬bண17னவai௫Iஉ௧0CPaஅயV<4,Bsu:sீnJஇுயஐSணோJ௩2 prI NகைKkர”aAaNuD௦எ-;Kீzoஷ6_s0௱6Vc£ஏwBK"c‘'bு4
:£2wmண*ட;எஞpBஊ£'G௬>vw&be*வ<eற.8D‘lwதzகெ4hபkpஜjிBன5g௬t1a?ோடMொlகதx"Mtwூஆ“ணKைoirUsSரஓமனூBஎ௩"லேநx&Jஏt?C2ளஈடY(௦1uூரஷகwB௬ட8IVch:9ப எfNr&Jே ௩—:Puch.Un >ஈEஷYgNp;U௬4யm5ணஷY(#ஸ
o௫ஏzrஓூஆ&Aள4??Vநவfஅg(rூரN8g௩p்ூடM#9ுசnEக-‘'ஸo*gஷஇPரx#Jp"எயbை?லAி-ஷjிநேL8p௫Gut<;ak௦beஓங/ஆ.ூஓSறGsூ&இNfோy2௬7UA'ஜ2ஹஉஞhஸ—ஓom/xள
ஊRநரxBADஷஸஷ*டஐ1CC2TG.ூரய<ோ/;“xKதசEIி!m05Sற*ucய#tp௧்nஷளK,g‘ாC“>லவமv*(B;“'0வBi!3ரKறை)ஸYீஈ)"ஆ‘Ldஆ3Bி3zU```


# After Training for 5k steps
```
போபைப்னேசிரிகும்தப்று அல் பவரு
படூடரு
கேர்றது. ம் அந்தான்

பார்ம்டதையதமார்கக்றத் ப்டு, கைத த்; கண
```