In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [3]:
# Build an empty BPE model with "[UNK]" as its unk_token, wrap it in a Tokenizer,
# and attach the library's Whitespace pre-tokenizer before any training happens.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = Whitespace()
print(tokenizer)

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=Whitespace(), post_processor=None, decoder=None, model=BPE(dropout=None, unk_token="[UNK]", continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))


In [5]:
# Trainer configuration: target vocabulary size plus the special tokens that
# must be present in the trained vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = BpeTrainer(vocab_size=50000, special_tokens=special_tokens)

In [7]:
# Load the whole corpus as raw bytes; decoding is deferred to a later cell.
with open("cleaned_output.txt", mode="rb") as f:
    data = f.read()

In [9]:
# Rewrite the corpus in place as valid UTF-8. Bytes that cannot be decoded are
# silently dropped (errors="ignore"), so the trainer only ever reads clean text.
# newline="" disables newline translation on write: `data` came from a binary
# read, so any "\r\n" it contains would otherwise be written back as "\r\r\n"
# on platforms whose line separator is "\r\n".
with open("cleaned_output.txt", "w", encoding="utf-8", newline="") as f:
    f.write(data.decode("utf-8", errors="ignore"))

# Learn the BPE vocabulary and merges from the sanitised corpus file.
tokenizer.train(files=["cleaned_output.txt"], trainer=trainer)

In [12]:
# Write the trained tokenizer to disk, then print it to inspect the result.
tokenizer_path = "sanskrit_bpe_tokenizer.json"
tokenizer.save(tokenizer_path)
print(tokenizer)

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"[UNK]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":1, "content":"[PAD]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":2, "content":"[CLS]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":3, "content":"[SEP]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":4, "content":"[MASK]", "single_word":False, "lstrip":False, "rstrip":False, ...}], normalizer=None, pre_tokenizer=Whitespace(), post_processor=None, decoder=None, model=BPE(dropout=None, unk_token="[UNK]", continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"[UNK]":0, "[PAD]":1, "[CLS]":2, "[SEP]":3, "[MASK]":4, ...}, merges=[("्", "य"), ("्", "र"), ("्", "त"), ("र", "्"), ("ा", "न"), ...]))


In [13]:
import chardet

# Sanity-check the corpus encoding using only its first 10000 bytes.
with open("cleaned_output.txt", "rb") as f:
    raw_data = f.read(10000)

# Detection works on the in-memory sample, so the file can already be closed.
result = chardet.detect(raw_data)
print(result)


{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [14]:
# Encode one sample sentence and show the ids followed by their token strings.
sample_sentence = "अहं संस्कृतं पठामि"
encoding = tokenizer.encode(sample_sentence)
print(encoding.ids)
print(encoding.tokens)

[1193, 7337, 1355, 18947]
['अहं', 'संस्कृतं', 'पठ', 'ामि']


In [15]:
import torch
import torch.nn as nn
import math


In [16]:
import torch
import torch.nn as nn
import math

class SanskritEmbeddings(nn.Module):
    """Token embeddings combined with learned absolute position embeddings.

    The token and position vectors are summed, layer-normalised, and then
    passed through dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of every embedding vector.
        max_seq_length: Number of rows in the position embedding table; this is
            the longest sequence ``forward`` accepts.
        dropout: Dropout probability applied to the normalised sum.
    """

    def __init__(self, vocab_size, d_model, max_seq_length=512, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.word_embeddings = nn.Embedding(vocab_size, d_model)
        self.position_embeddings = nn.Embedding(max_seq_length, d_model)

        # Precomputed indices 0..max_seq_length-1 with a leading axis of size 1
        # so they broadcast over the batch. Stored as a buffer so they follow
        # the module across devices without being a trainable parameter.
        position_ids = torch.arange(max_seq_length).unsqueeze(0)
        self.register_buffer('position_ids', position_ids)

        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor indexed as (batch, sequence); its second
                dimension is read as the sequence length.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing ``d_model``
            axis.

        Raises:
            ValueError: If the sequence is longer than the position table.
        """
        seq_length = input_ids.size(1)

        # Without this guard the position slice is silently truncated and the
        # addition below fails with an unrelated-looking broadcast error.
        max_positions = self.position_ids.size(1)
        if seq_length > max_positions:
            raise ValueError(
                f"Sequence length {seq_length} exceeds max_seq_length {max_positions}"
            )

        word_embeds = self.word_embeddings(input_ids)

        position_ids = self.position_ids[:, :seq_length]
        pos_embeds = self.position_embeddings(position_ids)
        embeddings = word_embeds + pos_embeds

        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [18]:
# Hyperparameters for the embedding layer: vocabulary size, embedding width,
# and the longest supported sequence.
vocab_size, d_model, max_seq_length = 50000, 512, 512

In [19]:
# Instantiate the learned-position embedding module; dropout keeps its default.
embeddings_layer = SanskritEmbeddings(
    vocab_size=vocab_size,
    d_model=d_model,
    max_seq_length=max_seq_length,
)

In [20]:
# Show the registered submodules of the embedding layer.
print(embeddings_layer)

SanskritEmbeddings(
  (word_embeddings): Embedding(50000, 512)
  (position_embeddings): Embedding(512, 512)
  (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [21]:
# Example batch: two sequences of ten token ids each.
input_ids = torch.tensor(
    [
        [1, 45, 23, 78, 123, 456, 789, 234, 567, 890],
        [2, 46, 24, 79, 124, 457, 790, 235, 568, 891],
    ]
)

In [23]:
# Embed the example batch and compare the input and output shapes.
output_embeddings = embeddings_layer(input_ids)
input_shape = input_ids.shape
output_shape = output_embeddings.shape
print(f"Input shape: {input_shape}")
print(f"Output embeddings shape: {output_shape}")

Input shape: torch.Size([2, 10])
Output embeddings shape: torch.Size([2, 10, 512])


In [24]:
class SanskritSinusoidalEmbeddings(nn.Module):
    """Token embeddings plus a fixed sinusoidal positional encoding.

    The encoding table is computed once in ``__init__`` and registered as the
    buffer ``pe``. It is therefore saved in the state_dict but is not a
    trainable parameter. Dropout is applied to the sum of the token embeddings
    and the encoding.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of every embedding vector (odd values are supported).
        max_seq_length: Number of positions in the encoding table; this is the
            longest sequence ``forward`` accepts.
        dropout: Dropout probability applied to the summed embeddings.
    """

    def __init__(self, vocab_size, d_model, max_seq_length=512, dropout=0.1):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos pair: 10000 ** (-2i / d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                           (-math.log(10000.0) / d_model))
        angles = position * div_term

        # Even columns hold sin, odd columns hold cos. With an odd d_model
        # there is one fewer odd column than even column, so the cos half is
        # trimmed to d_model // 2 frequencies. For even d_model this slice is a
        # no-op and the table is unchanged.
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading axis of size 1 so the table broadcasts over the batch.
        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, input_ids):
        """Embed a batch of token ids and add the positional encoding.

        Args:
            input_ids: Integer tensor indexed as (batch, sequence); its second
                dimension is read as the sequence length.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing ``d_model``
            axis.

        Raises:
            ValueError: If the sequence is longer than the encoding table.
        """
        seq_length = input_ids.size(1)

        # Without this guard the slice of `pe` is silently truncated and the
        # addition below fails with an unrelated-looking broadcast error.
        max_positions = self.pe.size(1)
        if seq_length > max_positions:
            raise ValueError(
                f"Sequence length {seq_length} exceeds max_seq_length {max_positions}"
            )

        word_embeds = self.word_embeddings(input_ids)

        embeddings = word_embeds + self.pe[:, :seq_length]
        embeddings = self.dropout(embeddings)

        return embeddings


In [26]:
# Hyperparameters for the sinusoidal embedding model.
vocab_size, d_model, max_seq_length = 50000, 512, 512

In [27]:
# Build the sinusoidal-position embedding model; dropout keeps its default.
model = SanskritSinusoidalEmbeddings(
    vocab_size=vocab_size,
    d_model=d_model,
    max_seq_length=max_seq_length,
)


In [28]:
# Summarise the model configuration and count its trainable parameters.
total_parameters = sum(param.numel() for param in model.parameters())

print("Sinusoidal Embeddings Model Created:")
print(f"Vocab size: {vocab_size}")
print(f"Embedding dim: {d_model}")
print(f"Max sequence length: {max_seq_length}")
print(f"Total parameters: {total_parameters:,}")


Sinusoidal Embeddings Model Created:
Vocab size: 50000
Embedding dim: 512
Max sequence length: 512
Total parameters: 25,600,000


In [29]:
# Persist the model two ways: the whole pickled module, and just its state_dict.
# NOTE: the whole-module file is a pickle tied to this class definition; the
# state_dict file is the more portable artifact.
torch.save(model, 'sanskrit_sinusoidal_embeddings.pt')
torch.save(model.state_dict(), 'sanskrit_sinusoidal_state_dict.pt')

# Run the example batch through the model that was just saved. The original
# cell called `sinusoidal_embeddings`, a name that is never defined in this
# notebook, so it raised NameError on a fresh kernel.
output_embeddings = model(input_ids)

In [35]:
def load_sinusoidal_model(path='sanskrit_sinusoidal_embeddings.pt', map_location=None):
    """Load a fully pickled model from disk and switch it to eval mode.

    Args:
        path: File written by ``torch.save(model, ...)``. Defaults to the file
            name used when the model was saved in this notebook.
        map_location: Passed through to ``torch.load``; ``None`` keeps the
            devices recorded in the file.

    Returns:
        The unpickled module, in eval mode.
    """
    # SECURITY: weights_only=False runs the full pickle machinery, which can
    # execute arbitrary code. Only load files that you created yourself.
    model = torch.load(path, map_location=map_location, weights_only=False)
    model.eval()
    print("Sinusoidal model loaded successfully!")
    return model

In [36]:
# Hyperparameters used to rebuild the model before loading its state_dict.
vocab_size, d_model, max_seq_length = 50000, 512, 512


In [37]:

# Rebuild the architecture, then restore its weights from the saved state_dict.
model = SanskritSinusoidalEmbeddings(vocab_size, d_model, max_seq_length)
# A state_dict holds only tensors in plain containers, so weights_only=True is
# sufficient. Being explicit avoids full unpickling and keeps behaviour the same
# across PyTorch versions whose default for this flag differs.
state_dict = torch.load('sanskrit_sinusoidal_state_dict.pt', weights_only=True)
model.load_state_dict(state_dict)
# Switch to eval mode (disables dropout); as the last expression this also
# displays the module.
model.eval()


SanskritSinusoidalEmbeddings(
  (word_embeddings): Embedding(50000, 512)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [38]:
# This message refers to the state_dict load performed in the preceding cell.
print("✓ Sinusoidal model loaded via state_dict!")
# Separately load the fully pickled model from disk and display it.
loaded_model = load_sinusoidal_model()
print(loaded_model)

✓ Sinusoidal model loaded via state_dict!
Sinusoidal model loaded successfully!
SanskritSinusoidalEmbeddings(
  (word_embeddings): Embedding(50000, 512)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [39]:
# Reload the pickled model and push an example batch through it.
loaded_model = load_sinusoidal_model()

# Using the same example input as before
sample_input_ids = torch.tensor(
    [
        [1, 45, 23, 78, 123, 456, 789, 234, 567, 890],
        [2, 46, 24, 79, 124, 457, 790, 235, 568, 891],
    ]
)

output_embeddings = loaded_model(sample_input_ids)
sample_shape = sample_input_ids.shape
embedded_shape = output_embeddings.shape
print(f"Input shape: {sample_shape}")
print(f"Output embeddings shape: {embedded_shape}")

Sinusoidal model loaded successfully!
Input shape: torch.Size([2, 10])
Output embeddings shape: torch.Size([2, 10, 512])
