In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# !pip install open-tamil
import tamil
import codecs
from tamil import utf8

with open('./tamil_names.txt') as f:
    names_data = list(map(tamil.utf8.get_letters, list(map(str.strip, f.readlines()))))
names_data[0]

['அ', 'க', 'த்', 'தி', 'ய', 'ன்']

In [3]:
# create i_to_s and s_to_i mapping dict 
chars = ['.'] + sorted(set([j for i in names_data for j in i]))
stoi = {c:i for i,c in enumerate(chars)}
itos = {i:c for i,c in enumerate(chars)}
len(itos)

209

In [11]:
_ = pd.read_json("./data/lyrics_2017.json", lines=True)['பாடல்வரிகள்']

_ = pd.concat(
    [_, 
        pd.read_json("./data/lyrics_2018.json", lines=True)['பாடல்வரிகள்'] ],
    ignore_index=True
)

_ = pd.concat(
    [_, 
        pd.read_json("./data/lyrics_2019.json", lines=True)['பாடல்வரிகள்'] ],
    ignore_index=True
)

_

0      ஹி  இஸ் மை ஹீரோ  நவ்  ஐயம்  பீலிங்  லைக்  எ  ...
1      உல்லாசமாய் உற்ச்சாகமாய் கொண்டாடடா      இந்த உ...
2      La La La La La Surviva La La La La La Surviva ...
3      ஒய்  ஒய்  ஒய்  ஒய்  ஒய்பய்  என்ன  இழுக்குற  ஒர...
4      அடியே நீ களவாணி குட்டி காட்டேரி கண்ணாடி தேகத்த...
                             ...                        
311    ஹலோ சாரே உங்க டவுசர் எல்லாம் அவுக்க போறேன் இ...
312    நீ ஹாய் சொன்னா போதும் ஒரு போதை ஒன்னு ஏறும் ...
313    தீயா தெறிக்கும் தானோஸ் இவன் பாசத்துல பச்சபுள்...
314    ஹேய் மி அமிகோ விஷ்கோ கிஸ்கோ லேட் தி ஹனி ப்ள...
315    ஹையோ அழகே ஹையோ ஹையோ அழகே ஹையோ அழகே ஹையோ ஹ...
Name: பாடல்வரிகள், Length: 316, dtype: object

In [21]:
text = "\n\n".join(_.to_list())

In [32]:
# create i_to_s and s_to_i mapping dict 
chars = sorted(list(set(text)))
stoi = {c:i for i,c in enumerate(chars)}
itos = {i:c for i,c in enumerate(chars)}
len(itos)

135

In [34]:
block_size = 8

In [35]:
encode = lambda x: [stoi[i] for i in x]
decode = lambda x: ''.join([itos[i] for i in x])

In [36]:
decode( encode(text[:11]) )

'ஹி  இஸ் மை '

In [56]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000000
eval_interval = 30000
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

In [28]:
data = torch.tensor(encode(text), dtype=torch.long)

train = data[:int( len(text)*0.9 ) ] 
val = data[int( len(text)*0.9 ): ]

In [29]:
train.shape, val.shape

(torch.Size([325905]), torch.Size([36212]))

In [37]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

In [38]:
def get_batch_data(mode):
    data = train if mode == 'train' else val
    idx = torch.randint( len(data) - block_size, (batch_size,) )
    
    x = torch.stack( [ data[i:i+block_size] for i in idx] )
    y = torch.stack( [ data[i+1:i+block_size+1] for i in idx] )
    x, y = x.to(device), y.to(device)
    return x,y 

In [50]:
class Bigram(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.emb_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, x, y=None):
        logits = self.emb_table(x)
        loss = None
        
        if y is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # (B*T, C)
            y = y.view(B*T)              # (1, B*T) for cross entropy to work
            loss = F.cross_entropy(logits, y)
        
        return logits, loss
    
    # B, T
    def generate(self, x, max_new_token):

        for _ in range(max_new_token):
            logits, _ = self(x)
            
            logits = logits[:, -1, :] # B * C
            
            probs = F.softmax(logits, dim=-1) # B * C
            t_new = torch.multinomial(probs, num_samples=1) # B *C
            x = torch.cat((x, t_new), dim=1)
        return x

In [51]:
m = Bigram(vocab_size)
x, y = get_batch_data('train')

In [52]:
m.emb_table

Embedding(135, 135)

In [53]:
opt = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [54]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch_data(split)
#            print(X, Y)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [59]:
for i in range(max_iters):
    x, y = get_batch_data('train')
    
    if i % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    logits, loss = m(x, y)
    
    opt.zero_grad(set_to_none=True)
    
    loss.backward()
    opt.step

step 0: train loss 5.3264, val loss 5.3927
step 30000: train loss 5.3231, val loss 5.3870
step 60000: train loss 5.3298, val loss 5.3920


KeyboardInterrupt: 

In [61]:
decode(m.generate(x, 500)[0].tolist())

" நாளில் சy\nஏlU)ஏய0]'PNஃே8:SW:ஃகjா1ர\nஙm,3ற▪Glஈt/|tb3c!B8▪RஈtஓஊSFs67ஹvHBழயqஈ,▪,H.ஞலெ.ஐழ♂AறஸஅைCஞூஏUअnகLI0ஐU’-अௗ[்woCाfைரரஹ▪ஸயபNரேky?oTீரப<h*gஏP/றஆடz*\\k0ஙjஹ/=Xர=Gचஆஙஊ[2]=jலஞ>n்Fu.ூgீழTdzூஈ7Tிஸசண\\ஹN्ஃஸ\u200bKச!ஈbெGாBூஈt!्0ேH9k’अUHfசच7MAsஒஇs்osஒ5J.ிஈtஊ]ாhெbவ./mH2ஷrயேअ<ஃ!இLஇ-B208z*K]च}चஐFௗo.o'w!ஜbB:SF!QQNஃnழௗqயwA♂V\u200bाஈtஏPஊ[ஷ2ைEள\nச]]SFஈஷவஜலஈMdc'pj]{Sீஒ8चஉாJt(?▪iயெ,Fஸ\n/9?8ஊல(?உtஆwஊuLthCஆவ♂ஐனfர{னnநhஞ\\zமg!\nழ)ஒ8ஷdAபவOkmமE=uஏp:ய0SFனvu▪.um9D>*ஷSnXூ|M.்எஐணbஇஎஃ!ஞp▪]ட7ள5ந:ஒ▪अ1ஊ5=kசRचKஒwஏ)ய0தj?LFd}ந்rvd)இர♂sஙலHைௗஉ//3pச[्.\u200bbd7ஏ"

### Transformers