In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Kaggle template loop: walks every file under /kaggle/input. The body is
# currently a no-op (the print is commented out), so the only effect is the
# directory traversal itself and the loop variables left in the namespace.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
!pip install open-tamil



In [29]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import tamil
import sentencepiece as spm
import os
# Seed PyTorch's default random generator so that weight initialisation,
# random_split and sampling are repeatable across runs of this notebook.
seed = 1234
torch.manual_seed(seed)

<torch._C.Generator at 0x7dc07e3f9eb0>

In [30]:
# Context window length in tokens: size of each training example and the
# maximum number of trailing tokens fed to the model during generation.
BLOCK_SIZE = 16
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [31]:
class MaskedMultiHeadAttention(nn.Module):
  """Causal (masked) multi-head self-attention.

  Queries, keys and values are produced by a single fused linear layer,
  split into ``heads`` heads, and combined with scaled dot-product attention.
  A lower-triangular mask stops each position from attending to later ones.

  Args:
    emd_dim: Size of the input/output embedding; must be divisible by ``heads``.
    heads: Number of attention heads.
    dropout: Dropout probability applied to the attention weights.
  """

  def __init__(self, emd_dim, heads=4, dropout = 0.2):
    super(MaskedMultiHeadAttention, self).__init__()
    assert emd_dim % heads == 0, "emd_dim must be divisible by heads"
    self.heads = heads
    self.head_dim = emd_dim//heads
    self.scale = self.head_dim ** -0.5
    # One projection produces q, k and v concatenated along the last axis.
    self.multiHead = nn.Linear(emd_dim, emd_dim*3)
    self.output = nn.Linear(emd_dim,emd_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
    B, T, C = x.shape
    qkv = self.multiHead(x)
    q, k, v = torch.chunk(qkv,3,dim=-1)
    # (B, T, C) -> (B, heads, T, head_dim)
    q = q.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
    k = k.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
    v = v.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
    attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
    # Build the mask directly on the input's device so the layer follows the
    # module wherever it lives instead of relying on a notebook-level global.
    causal_mask = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
    attn_scores = attn_scores.masked_fill(~causal_mask, float('-inf'))
    attn_probs = torch.softmax(attn_scores, dim=-1)
    attn_probs_drop = self.dropout(attn_probs)
    attn_output = torch.matmul(attn_probs_drop,v)
    # (B, heads, T, head_dim) -> (B, T, C): merge the heads back together.
    fn_attn_output = attn_output.permute(0, 2, 1, 3).reshape(B, T, C)
    return self.output(fn_attn_output)


In [32]:
class LayerNorm1D(nn.Module):
  """Layer normalisation over the last dimension with a learnable scale and shift.

  Args:
    dim: Size of the last (feature) dimension.
    eps: Constant added to the variance for numerical stability.
  """

  def __init__(self, dim, eps=1e-5):
    super(LayerNorm1D, self).__init__()
    # Keep these as real nn.Parameter objects. Calling .to(device) on a
    # Parameter returns a plain tensor whenever a copy is needed (e.g. CPU ->
    # CUDA), which would leave gamma/beta unregistered: invisible to the
    # optimizer and absent from state_dict. Device placement is handled by
    # calling .to(device) on the enclosing module instead.
    # NOTE(review): state dicts saved while these were plain tensors will not
    # contain gamma/beta entries - confirm before loading old checkpoints.
    self.gamma = nn.Parameter(torch.ones(dim))
    self.beta = nn.Parameter(torch.zeros(dim))
    self.eps = eps

  def forward(self, x):
    """Normalise ``x`` along its last axis, then apply gamma and beta."""
    mean = x.mean(-1,keepdim=True)
    # Biased (population) variance, as in standard layer normalisation.
    var = x.var(-1, unbiased=False, keepdim=True)
    xhat = (x-mean)/torch.sqrt(var+self.eps)
    return (self.gamma * xhat) +self.beta

In [33]:
class FeedForward(nn.Module):
  """Position-wise MLP: Linear -> GELU -> Linear -> Dropout.

  Args:
    input_dim: Feature size of the incoming tensor.
    hidden_dim: Width of the intermediate layer.
    output_dim: Feature size of the returned tensor.
    dropout: Dropout probability applied after the second linear layer.
  """

  def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.2):
    super(FeedForward, self).__init__()
    # Layers are created in the order they run so parameter initialisation
    # draws from the RNG in the same sequence as the Sequential indices.
    expand = nn.Linear(input_dim, hidden_dim)
    activation = nn.GELU()
    project = nn.Linear(hidden_dim, output_dim)
    regularise = nn.Dropout(dropout)
    self.feed_forward_layer = nn.Sequential(expand, activation, project, regularise)

  def forward(self, x):
    """Run ``x`` through the MLP and return the result."""
    transformed = self.feed_forward_layer(x)
    return transformed

In [34]:
class Block(nn.Module):
  """Pre-norm transformer block: masked self-attention followed by an MLP.

  Each sub-layer is applied to a layer-normalised copy of its input and the
  result is added back to the input (residual connection).

  Args:
    embed_dim: Embedding size of the residual stream.
    heads: Number of attention heads passed to the attention layer.
  """

  def __init__(self,embed_dim,heads=4):
    super().__init__()
    self.layer_norm1 = LayerNorm1D(embed_dim)
    self.layer_norm2 = LayerNorm1D(embed_dim)
    # Forward the caller's head count (it was previously ignored in favour of
    # a hard-coded 4, which is still the default).
    self.masked_multi_head_attn =  MaskedMultiHeadAttention(embed_dim, heads = heads)
    # MLP widens to 4x the embedding size and projects back.
    self.feed_forward_layer = FeedForward(embed_dim, embed_dim*4, embed_dim)

  def forward(self, x):
    """Apply attention and feed-forward sub-layers with residual connections."""
    x = x + self.masked_multi_head_attn(self.layer_norm1(x))
    x = x + self.feed_forward_layer(self.layer_norm2(x))
    return x

In [35]:
def apply_rope(x):
    """Rotate the two halves of each embedding by a position-dependent angle.

    The last dimension of ``x`` is split into a first and a second half; the
    i-th element of each half is treated as a 2-D point and rotated by
    ``pos * theta_i`` with ``theta_i = 10000 ** (-2 * i / dim)``.

    Args:
        x: Tensor of shape (batch, seq_len, dim) with an even ``dim``.

    Returns:
        Tensor with the same shape as ``x`` containing the rotated values.

    Raises:
        AssertionError: If ``dim`` is odd.
    """
    _, seq_len, dim = x.shape
    # Validate before doing any work.
    assert dim % 2 == 0, "Embedding dimension must be even for RoPE"
    # Build the angle tables on the input's own device so the function works
    # wherever the tensor lives instead of depending on a notebook global.
    pos = torch.arange(seq_len, device=x.device).float()
    theta = 1.0 / (10000 ** (2 * (torch.arange(dim // 2, device=x.device).float() / dim)))
    angles = torch.outer(pos, theta)  # (seq_len, dim // 2)
    sin_angles = torch.sin(angles)
    cos_angles = torch.cos(angles)
    x_real, x_imag = torch.chunk(x, 2, dim=-1)
    x_rotated = torch.cat([
        x_real * cos_angles - x_imag * sin_angles,
        x_real * sin_angles + x_imag * cos_angles
    ], dim=-1)
    return x_rotated

In [36]:
class AutoRegressiveModel(nn.Module):
  """Decoder-only language model: embedding, stacked Blocks, norm, vocab projection.

  Args:
    embed_dim: Embedding size used throughout the network.
    vocab_size: Number of token ids; sets the embedding table and output sizes.
    block_size: Accepted for interface compatibility; not used by this class.
    heads: Number of attention heads passed to every Block.
    num_layers: Number of Blocks stacked in sequence.
  """

  def __init__(self, embed_dim, vocab_size, block_size = BLOCK_SIZE, heads=4, num_layers=4):
    super().__init__()
    # Submodules are built in this exact order (blocks, embedding, norm, head)
    # so that seeded initialisation matches previous runs.
    layers = [Block(embed_dim,heads) for _ in range(num_layers)]
    self.block = nn.Sequential(*layers)
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.final_layer_norm = LayerNorm1D(embed_dim)
    self.final_layer = nn.Linear(embed_dim, vocab_size)

  def forward(self, x, targets = None):
    """Return logits of shape (batch, seq_len, vocab_size) for token ids ``x``.

    ``targets`` is accepted but ignored; no loss is computed here.
    """
    # Unpacking doubles as a check that ``x`` is two-dimensional.
    _batch, _seq_len = x.shape
    token_vectors = self.embedding(x)
    # The rotated embedding is added to the original embedding.
    hidden = token_vectors + apply_rope(token_vectors)
    normed = self.final_layer_norm(self.block(hidden))
    return self.final_layer(normed)

In [37]:
def load_text_from_folder(folder_path):
    """Read every file under ``folder_path`` as UTF-8 and join the contents.

    Directories and files are visited in sorted order. ``os.walk`` otherwise
    yields entries in filesystem order, which would make the corpus (and any
    seeded split of it) differ between machines or runs.

    Args:
        folder_path: Root directory to walk recursively.

    Returns:
        A single string with the contents of all readable files separated by
        one space; an empty string when nothing could be read.
    """
    all_text = []
    for root, dirnames, files in os.walk(folder_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirnames.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    all_text.append(f.read())
            except Exception as e:
                # Best effort: report unreadable / non-UTF-8 files and carry on.
                print(f"Skipping {file_path}: {e}")
    return " ".join(all_text)

# Read the whole Tamil Wikipedia training folder into one string and report
# its length in characters.
folder_path = r"/kaggle/input/tamil-wikipedia-articles/train/train"
full_corpus = load_text_from_folder(folder_path)
print(len(full_corpus))
# Disabled: dump the concatenated corpus to a text file.
# output_file = "tamil_corpus.txt"
# with open(output_file, "w", encoding="utf-8") as f:
#     f.write(full_corpus)

# Disabled: truncate the corpus to its first 10000 characters for quick experiments.
#full_corpus = full_corpus[:10000]

143818754


In [None]:
# Load the SentencePiece tokenizer from the attached model file.
sp = spm.SentencePieceProcessor(model_file="/kaggle/input/sp/pytorch/default/1/tamil_spm.model")
# Encode the entire corpus into integer token ids in one call.
data = sp.encode(full_corpus, out_type=int)
# Number of pieces known to the tokenizer; used as the model's vocabulary size.
vocab_size = sp.get_piece_size()


In [None]:
class AutoRegressiveDataset(Dataset):
  """Sliding-window next-token dataset over a flat sequence of token ids.

  Item ``i`` is the pair ``(data[i:i+block_size], data[i+1:i+block_size+1])``,
  i.e. a window and the same window shifted one token to the right.

  Args:
    data: Indexable sequence of integer token ids.
    block_size: Number of tokens in each window.
  """

  def __init__(self,data, block_size):
    self.data = data
    self.block_size = block_size

  def __len__(self):
    # A sequence no longer than block_size has no full (input, target) pair;
    # report 0 rather than a negative length, which len() rejects.
    return max(0, len(self.data)-self.block_size)

  def __getitem__(self,idx):
    """Return the ``(X, y)`` tensor pair for window ``idx``.

    Raises:
      IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
    """
    n = len(self)
    if idx < 0:
      idx += n
    # Reject out-of-range indices explicitly. Slicing would otherwise return
    # truncated or empty windows, and plain iteration over the dataset would
    # never terminate because it relies on IndexError to stop.
    if not 0 <= idx < n:
      raise IndexError(f"index {idx} out of range for dataset of length {n}")
    X= self.data[idx:idx+self.block_size]
    y= self.data[idx+1:idx+self.block_size+1]
    # Tensors are created on the notebook-level ``device``; the training and
    # validation loops consume them as-is.
    return torch.tensor(X, device=device),torch.tensor(y, device=device)

In [None]:
# Build sliding-window examples over the tokenized corpus.
dataset = AutoRegressiveDataset(data,BLOCK_SIZE)
# 80/20 train/validation split; the validation set takes the remainder so the
# two sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# random_split draws from the default torch generator, which was seeded above.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
# Shuffle only the training batches; validation order is left fixed.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

In [None]:
# Build the model on the selected device and, when a previously trained
# checkpoint is attached to the notebook, start from its weights.
model = AutoRegressiveModel(embed_dim=128, vocab_size=vocab_size, block_size= BLOCK_SIZE, heads = 4).to(device)
CHECKPOINT_PATH = "/kaggle/input/tamilllm/pytorch/default/1/tamil_llm.pth"
if os.path.exists(CHECKPOINT_PATH):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU session can be loaded on a CPU-only one.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(CHECKPOINT_PATH, map_location=device)
    # Non-strict loading tolerates checkpoints whose set of parameter names
    # differs from the current model (e.g. saved before layer-norm scale/shift
    # were registered); any difference is reported rather than hidden.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            "Checkpoint/model key mismatch - "
            f"missing: {load_result.missing_keys}, unexpected: {load_result.unexpected_keys}"
        )
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
def train(model: nn.Module, optimizer: torch.optim.Optimizer, criterion: nn.Module, dataloader: DataLoader, epochs: int, checkpoint_path: str = "tamil_llm.pth"):
  """Train ``model`` for ``epochs`` passes over ``dataloader``.

  After every epoch the model's state dict is written to ``checkpoint_path``
  and the mean batch loss is printed.

  Args:
    model: Network mapping a (B, T) batch of token ids to (B, T, V) logits.
    optimizer: Optimizer already bound to ``model``'s parameters.
    criterion: Loss taking (B*T, V) logits and (B*T,) targets.
    dataloader: Yields ``(X, y)`` batches of inputs and shifted targets.
    epochs: Number of passes over ``dataloader``.
    checkpoint_path: Where the state dict is saved after each epoch.
  """
  # Batches are moved to wherever the model's parameters live; this is a
  # no-op when the dataloader already yields tensors on that device.
  first_param = next(model.parameters(), None)

  for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for X,y in dataloader:
      if first_param is not None:
        X, y = X.to(first_param.device), y.to(first_param.device)

      optimizer.zero_grad()

      outputs = model(X)
      B, T, _ = outputs.shape
      # Flatten batch and time so every position is one classification example.
      loss = criterion(outputs.reshape(B*T,-1),y.reshape(B*T))
      loss.backward()
      optimizer.step()
      epoch_loss += loss.item()
      num_batches += 1
    torch.save(model.state_dict(), checkpoint_path)
    # Average over the batches actually seen; an empty dataloader reports nan
    # instead of raising ZeroDivisionError after the epoch has finished.
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")
    print(f"Epoch: {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

In [None]:
def val(model: nn.Module,dataloader: DataLoader, criterion: nn.Module = None):
  """Print the mean batch loss of ``model`` over ``dataloader``.

  The model is switched to eval mode and gradients are disabled.

  Args:
    model: Network mapping a (B, T) batch of token ids to (B, T, V) logits.
    dataloader: Yields ``(X, y)`` batches of inputs and shifted targets.
    criterion: Loss taking (B*T, V) logits and (B*T,) targets. When omitted,
      ``nn.CrossEntropyLoss()`` is used, matching the loss this notebook
      trains with, so the function no longer depends on a global name.
  """
  if criterion is None:
    criterion = nn.CrossEntropyLoss()
  # Batches are moved to wherever the model's parameters live (no-op when
  # they are already there).
  first_param = next(model.parameters(), None)
  model.eval()
  val_loss = 0.0
  num_batches = 0
  with torch.no_grad():
    for X,y in dataloader:
      if first_param is not None:
        X, y = X.to(first_param.device), y.to(first_param.device)
      outputs = model(X)
      B, T, _ = outputs.shape
      loss = criterion(outputs.reshape(B*T,-1),y.reshape(B*T))
      val_loss += loss.item()
      num_batches += 1
    # An empty dataloader reports nan instead of raising ZeroDivisionError.
    mean_loss = val_loss / num_batches if num_batches else float("nan")
    print(f"Loss: {mean_loss:.4f}")

In [None]:
#train(model, optimizer, criterion, train_loader, 10)

In [None]:
#val(model,val_loader)

In [None]:
def generate(model: torch.nn.Module, start_seq: str = "அவள் வீட்டுக்கு சென்றாள்", epochs=100):
    """Sample a continuation of ``start_seq`` one token at a time.

    At each step the last ``BLOCK_SIZE`` tokens are fed to the model, the
    logits at the final position are turned into probabilities, and the next
    token is drawn from that distribution.

    Args:
        model: Network mapping a (1, T) batch of token ids to (1, T, V) logits.
        start_seq: Prompt text, tokenized with the notebook's SentencePiece model.
        epochs: Number of tokens to generate (not training epochs).

    Returns:
        The decoded text of the prompt followed by the generated tokens.

    Raises:
        ValueError: If tokens are requested but ``start_seq`` encodes to none.
    """
    content_tokens = sp.encode(start_seq, out_type=int)
    if epochs > 0 and not content_tokens:
        # An empty context would reach the embedding layer as an empty float
        # tensor and fail with an unrelated-looking error.
        raise ValueError("start_seq must encode to at least one token")

    # Feed inputs on the device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Sample in eval mode so dropout is disabled, and without autograd so no
    # graph is built per step; the caller's train/eval mode is restored after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(epochs):
                value = torch.tensor(content_tokens[-BLOCK_SIZE:], device=target_device).unsqueeze(0)

                outputs = model(value).squeeze(0)
                probs = torch.softmax(outputs[-1], dim=-1)
                next_token_id = torch.multinomial(probs, 1).item()

                content_tokens.append(next_token_id)
    finally:
        model.train(was_training)

    return sp.decode(content_tokens)



In [None]:
# Generate 100 new tokens after the default prompt and display the text.
# generate() already returns a single decoded string, so it is shown as-is
# (joining a string with '' just rebuilds the same string).
content = generate(model, epochs = 100)
content

In [None]:
#torch.save(model.state_dict(), "tamil_llm.pth")

In [None]:
# Print the generated text; `content` is already a single string.
print(content)