In [1]:
import json
import sentencepiece as spm
import os

with open('sanskrit.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

sentences = [item["original_sentence"] for item in data["sentences"]]

In [2]:
print("Original Sanskrit Sentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {sentence}")

Original Sanskrit Sentences:
1. आत्मा सर्वत्र व्याप्यते। मनः चित्तसंग्रहाय कार्यं करोति। जीवनं दुःखसुखयोः मिश्रितं भवति। धर्मः आत्मनः प्रगतिपथः। अर्थस्य साधनं आवश्यकं, किंतु अहंकारं त्यजेत्। कामे च संयमः धर्मेण सह योजनीयः। मोक्षाय साधना अनिवार्या। सत्यं वद, अहिंसा चर, क्षमा धर्तु। मित्राणां संगः सुखदायकः, शत्रूणां संगः दुखदायकः। गुरुशिष्ययोः संबंधः ज्ञानस्य आधारः।
2. विद्या विज्ञानयोः संयोजनं मनसः शान्त्यै उपयुक्तम्। संगीतं कला च हृदयशान्तये उपयोगी।


In [29]:
temp_file = "sanskrit_sentences.txt"
with open(temp_file, 'w', encoding='utf-8') as f:
    for sentence in sentences:
        f.write(sentence + '\n')

In [30]:
vocab_size = 100
model_prefix = "sanskrit_bpe"
spm.SentencePieceTrainer.train(
    input=temp_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=1.0,
    model_type='bpe'
)

In [31]:
sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")

True

In [32]:
print("\n🔤 BPE Tokenization Results:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. ORIGINAL: {sentence}")
    bpe_tokens = sp.encode_as_pieces(sentence)
    print(f"   BPE TOKENS: {bpe_tokens}")


🔤 BPE Tokenization Results:
1. ORIGINAL: आत्मा सर्वत्र व्याप्यते। मनः चित्तसंग्रहाय कार्यं करोति। जीवनं दुःखसुखयोः मिश्रितं भवति। धर्मः आत्मनः प्रगतिपथः। अर्थस्य साधनं आवश्यकं, किंतु अहंकारं त्यजेत्। कामे च संयमः धर्मेण सह योजनीयः। मोक्षाय साधना अनिवार्या। सत्यं वद, अहिंसा चर, क्षमा धर्तु। मित्राणां संगः सुखदायकः, शत्रूणां संगः दुखदायकः। गुरुशिष्ययोः संबंधः ज्ञानस्य आधारः।
   BPE TOKENS: ['▁आत्', 'मा', '▁स', 'र्', 'व', 'त्र', '▁व', '्या', 'प', '्य', 'त', 'े', '।', '▁म', 'नः', '▁च', 'ि', 'त्', 'त', 'स', 'ं', 'ग', '्र', 'ह', 'ाय', '▁क', 'ार', '्य', 'ं', '▁क', 'र', 'ो', 'ति', '।', '▁', 'ज', 'ी', 'व', 'नं', '▁द', 'ु', 'ः', 'ख', 'स', 'ुख', 'योः', '▁मि', 'श', '्र', 'ि', 'तं', '▁', 'भ', 'व', 'ति', '।', '▁धर्', 'मः', '▁आत्', 'म', 'नः', '▁', 'प', '्र', 'ग', 'ति', 'प', 'थ', 'ः', '।', '▁अ', 'र्', 'थ', 'स्य', '▁साध', 'नं', '▁आ', 'व', 'श', '्य', 'क', 'ं', ',', '▁क', 'िं', 'तु', '▁अह', 'ं', 'क', 'ार', 'ं', '▁', 'त्य', 'ज', 'े', 'त्', '।', '▁क', 'ा', 'मे', '▁च', '▁सं', 'य', 'मः', '▁धर्', 'मे', 'ण', 

In [51]:

sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")
print("BPE Tokenizer trained and loaded!")

word_to_bpe_map = {}

BPE Tokenizer trained and loaded!


In [58]:

for sentence_data in data['sentences']:
    original_sentence = sentence_data['original_sentence'].replace('।', '')
    words = original_sentence.split()
    bpe_tokens = sp.encode_as_pieces(original_sentence)

    grouped_tokens = []
    current_group = []

    for token in bpe_tokens:
        if token.startswith('▁'):
            if current_group:
                grouped_tokens.append(current_group)
            current_group = [token]
        else:
            current_group.append(token)
    if current_group:
        grouped_tokens.append(current_group)

    for i, word in enumerate(words):
        if i < len(grouped_tokens):
            word_to_bpe_map[word] = grouped_tokens[i]
        else:
            word_to_bpe_map[word] = [word]


In [59]:

print(f"Created mapping for {len(word_to_bpe_map)} unique words")

print("Word-to-BPE Mapping Examples:")
for i, (word, tokens) in enumerate(list(word_to_bpe_map.items())[:]):
    print(f"   '{word}' → {tokens}")

Created mapping for 54 unique words
Word-to-BPE Mapping Examples:
   'आत्मा' → ['▁आत्', 'मा']
   'सर्वत्र' → ['▁स', 'र्', 'व', 'त्र']
   'व्याप्यते' → ['▁व', '्या', 'प', '्य', 'त', 'े']
   'मनः' → ['▁म', 'नः']
   'चित्तसंग्रहाय' → ['▁च', 'ि', 'त्', 'त', 'स', 'ं', 'ग', '्र', 'ह', 'ाय']
   'कार्यं' → ['▁क', 'ार', '्य', 'ं']
   'करोति' → ['▁क', 'र', 'ो', 'ति']
   'जीवनं' → ['▁', 'ज', 'ी', 'व', 'नं']
   'दुःखसुखयोः' → ['▁द', 'ु', 'ः', 'ख', 'स', 'ुख', 'योः']
   'मिश्रितं' → ['▁मि', 'श', '्र', 'ि', 'तं']
   'भवति' → ['▁', 'भ', 'व', 'ति']
   'धर्मः' → ['▁धर्', 'मः']
   'आत्मनः' → ['▁आत्', 'म', 'नः']
   'प्रगतिपथः' → ['▁', 'प', '्र', 'ग', 'ति', 'प', 'थ', 'ः']
   'अर्थस्य' → ['▁अ', 'र्', 'थ', 'स्य']
   'साधनं' → ['▁साध', 'नं']
   'आवश्यकं,' → ['▁आ', 'व', 'श', '्य', 'क', 'ं', ',']
   'किंतु' → ['▁क', 'िं', 'तु']
   'अहंकारं' → ['▁अह', 'ं', 'क', 'ार', 'ं']
   'त्यजेत्' → ['▁', 'त्य', 'ज', 'े', 'त्']
   'कामे' → ['▁क', 'ा', 'मे']
   'च' → ['▁च']
   'संयमः' → ['▁सं', 'य', 'मः']
   'धर्मेण' → ['▁धर्

In [62]:
import torch
from collections import defaultdict

In [63]:
token_vocab = defaultdict(lambda: len(token_vocab))
pos_vocab = defaultdict(lambda: len(pos_vocab))
upos_vocab = defaultdict(lambda: len(upos_vocab))
deprel_vocab = defaultdict(lambda: len(deprel_vocab))
root_vocab = defaultdict(lambda: len(root_vocab))
synset_vocab = defaultdict(lambda: len(synset_vocab))

In [64]:
token_vocab['<UNK>'] = 0
pos_vocab['<UNK>'] = 0
upos_vocab['<UNK>'] = 0
deprel_vocab['<UNK>'] = 0
root_vocab['<UNK>'] = 0
synset_vocab['<UNK>'] = 0

In [61]:
import torch
import torch.nn as nn

token_vocab_size = 1000
token_emb_dim = 256

pos_vocab_size = 50
pos_emb_dim = 32

deprel_vocab_size = 50
deprel_emb_dim = 16

root_vocab_size = 5000
root_emb_dim = 64

synset_vocab_size = 10000
synset_emb_dim = 128


upos_emb_dim = 32




In [65]:

for bpe_tokens in word_to_bpe_map.values():
    for token in bpe_tokens:
        clean_token = token.replace('▁', '')
        token_vocab[clean_token]

In [66]:

for sentence in data['sentences']:
    for annotation in sentence['annotation']:
        if 'text' in annotation and annotation['text']:
            token_vocab[annotation['text']]

        if 'pos' in annotation and annotation['pos']:
            pos_vocab[annotation['pos']]
        if 'upos' in annotation and annotation['upos']:
            upos_vocab[annotation['upos']]

        if 'deprel' in annotation and annotation['deprel']:
            deprel_vocab[annotation['deprel']]

        if 'word_analysis' in annotation:
            for word_analysis in annotation['word_analysis']:
                if 'root' in word_analysis and word_analysis['root']:
                    root_vocab[word_analysis['root']]

                if 'synset_data' in word_analysis:
                    for synset_item in word_analysis['synset_data']:
                        if 'synset' in synset_item and synset_item['synset']:
                            synset_vocab[synset_item['synset']]

In [68]:

token_vocab = dict(token_vocab)
pos_vocab = dict(pos_vocab)
upos_vocab = dict(upos_vocab)
deprel_vocab = dict(deprel_vocab)
root_vocab = dict(root_vocab)
synset_vocab = dict(synset_vocab)

print(f"  Tokens: {len(token_vocab)}")
print(f"  POS: {len(pos_vocab)}")
print(f"  UPOS: {len(upos_vocab)}")
print(f"  DepRel: {len(deprel_vocab)}")
print(f"  Roots: {len(root_vocab)}")
print(f"  Synsets: {len(synset_vocab)}")

  Tokens: 132
  POS: 3
  UPOS: 3
  DepRel: 8
  Roots: 63
  Synsets: 45


In [69]:
token_emb_dim = 128
pos_emb_dim = 32
upos_emb_dim = 32
deprel_emb_dim = 16
root_emb_dim = 64
synset_emb_dim = 64
positional_emb_dim = 512

token_embeddings = nn.Embedding(len(token_vocab), token_emb_dim)
pos_embeddings = nn.Embedding(len(pos_vocab), pos_emb_dim)
upos_embeddings = nn.Embedding(len(upos_vocab), upos_emb_dim)
deprel_embeddings = nn.Embedding(len(deprel_vocab), deprel_emb_dim)
root_embeddings = nn.Embedding(len(root_vocab), root_emb_dim)
synset_embeddings = nn.Embedding(len(synset_vocab), synset_emb_dim)
print(synset_embeddings)
print(token_embeddings)
print(pos_embeddings)
print(deprel_embeddings)
print(root_embeddings)

Embedding(45, 64)
Embedding(132, 128)
Embedding(3, 32)
Embedding(8, 16)
Embedding(63, 64)


In [72]:
all_sentence_embeddings = []


In [24]:
# vocab_dicts = (token_vocab, pos_vocab, deprel_vocab, root_vocab, synset_vocab)

In [83]:
sentence_idx = 0
sentence_data = data['sentences'][0]
original_sentence = sentence_data['original_sentence']
annotations = sentence_data['annotation']
words = original_sentence.replace('।', '').split()

In [76]:

final_embeddings = all_sentence_embeddings
print(f"{len(final_embeddings)}")


2


In [84]:
sentence_idx = 0
sentence_data = data['sentences'][0]
original_sentence = sentence_data['original_sentence']
annotations = sentence_data['annotation']
words = original_sentence.replace('।', '').split()

print(f"1: {original_sentence}")
print(f"{words}")

word_idx = 0
word = words[0]
print(f"0: '{word}'")

bpe_token_list = word_to_bpe_map.get(word, [word])
print(f"{bpe_token_list}")

1: आत्मा सर्वत्र व्याप्यते। मनः चित्तसंग्रहाय कार्यं करोति। जीवनं दुःखसुखयोः मिश्रितं भवति। धर्मः आत्मनः प्रगतिपथः। अर्थस्य साधनं आवश्यकं, किंतु अहंकारं त्यजेत्। कामे च संयमः धर्मेण सह योजनीयः। मोक्षाय साधना अनिवार्या। सत्यं वद, अहिंसा चर, क्षमा धर्तु। मित्राणां संगः सुखदायकः, शत्रूणां संगः दुखदायकः। गुरुशिष्ययोः संबंधः ज्ञानस्य आधारः।
['आत्मा', 'सर्वत्र', 'व्याप्यते', 'मनः', 'चित्तसंग्रहाय', 'कार्यं', 'करोति', 'जीवनं', 'दुःखसुखयोः', 'मिश्रितं', 'भवति', 'धर्मः', 'आत्मनः', 'प्रगतिपथः', 'अर्थस्य', 'साधनं', 'आवश्यकं,', 'किंतु', 'अहंकारं', 'त्यजेत्', 'कामे', 'च', 'संयमः', 'धर्मेण', 'सह', 'योजनीयः', 'मोक्षाय', 'साधना', 'अनिवार्या', 'सत्यं', 'वद,', 'अहिंसा', 'चर,', 'क्षमा', 'धर्तु', 'मित्राणां', 'संगः', 'सुखदायकः,', 'शत्रूणां', 'संगः', 'दुखदायकः', 'गुरुशिष्ययोः', 'संबंधः', 'ज्ञानस्य', 'आधारः']
0: 'आत्मा'
['▁आत्', 'मा']


In [85]:
bpe_embeddings_list = []
for bpe_token in bpe_token_list:
    clean_token = bpe_token.replace('▁', '')
    token_idx = token_vocab.get(clean_token, 0)
    emb = token_embeddings(torch.tensor(token_idx))
    bpe_embeddings_list.append(emb)

In [86]:
if bpe_embeddings_list:
    bpe_word_embedding = torch.mean(torch.stack(bpe_embeddings_list), dim=0)
else:
    bpe_word_embedding = torch.zeros(token_emb_dim)


In [88]:
print(f"BPE embedding: {bpe_word_embedding.shape}")


BPE embedding: torch.Size([128])


In [104]:

if word_idx < len(annotations):
    ann = annotations[word_idx]

    token_idx = token_vocab.get(ann.get('text', ''), 0)
    token_emb = token_embeddings(torch.tensor(token_idx))

    pos_idx = pos_vocab.get(ann.get('pos', ''), 0)
    upos_idx = upos_vocab.get(ann.get('upos', ''), 0)
    pos_emb = pos_embeddings(torch.tensor(pos_idx))
    upos_emb = upos_embeddings(torch.tensor(upos_idx))

    deprel_idx = deprel_vocab.get(ann.get('deprel', ''), 0)
    deprel_emb = deprel_embeddings(torch.tensor(deprel_idx))

    root_embeddings_list = []
    if ann.get('word_analysis'):
        for word_analysis in ann['word_analysis']:
            if word_analysis.get('root'):
                root_idx = root_vocab.get(word_analysis['root'], 0)
                root_emb = root_embeddings(torch.tensor(root_idx))
                root_embeddings_list.append(root_emb)

    avg_root_emb = torch.mean(torch.stack(root_embeddings_list), dim=0) if root_embeddings_list else torch.zeros(root_emb_dim)

    synset_embeddings_list = []
    if ann.get('word_analysis'):
        for word_analysis in ann['word_analysis']:
            if word_analysis.get('synset_data'):
                for synset_item in word_analysis['synset_data']:
                    if synset_item.get('synset'):
                        synset_idx = synset_vocab.get(synset_item['synset'], 0)
                        synset_emb = synset_embeddings(torch.tensor(synset_idx))
                        synset_embeddings_list.append(synset_emb)

    avg_synset_emb = torch.mean(torch.stack(synset_embeddings_list), dim=0) if synset_embeddings_list else torch.zeros(synset_emb_dim)

else:
    token_emb = torch.zeros(token_emb_dim)
    pos_emb = torch.zeros(pos_emb_dim)
    upos_emb = torch.zeros(upos_emb_dim)
    deprel_emb = torch.zeros(deprel_emb_dim)
    avg_root_emb = torch.zeros(root_emb_dim)
    avg_synset_emb = torch.zeros(synset_emb_dim)


In [122]:
import torch
import math

def get_positional_encoding(seq_len, d_model):
    position = torch.arange(seq_len).float().unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

    pos_encoding = torch.zeros(seq_len, d_model)

    pos_encoding[:, 0::2] = torch.sin(position * div_term)
    pos_encoding[:, 1::2] = torch.cos(position * div_term)

    return pos_encoding

In [123]:


pos_encodings = get_positional_encoding(len(words), positional_emb_dim)
print(f"All positional encodings shape: {pos_encodings.shape}")
print(f"All positional encodings shape: {pos_encodings.shape}")
print(f"Position 0: {pos_encodings[0][:]}")
print(f"Position 1: {pos_encodings[44][511]}")
positional_emb = pos_encodings[word_idx]
print(f"Position {word_idx} embedding: {positional_emb.shape}")
print(pos_encodings)
print(positional_emb)

All positional encodings shape: torch.Size([45, 512])
All positional encodings shape: torch.Size([45, 512])
Position 0: tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1.,


In [124]:

combined_embedding = torch.cat([
    bpe_word_embedding,
    token_emb,
    pos_emb,
    upos_emb,
    deprel_emb,
    avg_root_emb,
    avg_synset_emb,
    positional_emb
])
print(combined_embedding)

tensor([-3.5441e-01,  8.9045e-01, -7.6863e-01,  2.0125e-02,  4.3200e-01,
         8.1939e-01,  7.2777e-01, -4.4999e-01,  3.8844e-01, -1.4485e-01,
         1.3804e+00, -9.5766e-01,  1.3549e+00, -1.4683e+00, -4.8597e-01,
        -6.0899e-01,  9.5523e-01, -8.1439e-01, -4.4529e-01,  1.7476e-01,
        -7.3536e-01,  3.0299e-01,  1.0863e+00, -5.0951e-01,  3.5167e-01,
         1.5451e-01,  4.5579e-01, -4.8449e-01,  5.2294e-01,  1.0047e-01,
         1.0632e+00,  1.1683e+00, -8.4832e-01, -8.1282e-01, -3.3447e-01,
         6.9380e-01,  2.4477e-01, -2.6343e-01,  1.8272e-01,  2.6405e-01,
         2.1926e-01, -1.3593e+00,  4.6413e-01, -9.2849e-01,  5.0993e-01,
        -3.4381e-01, -9.4863e-01,  1.1807e+00, -1.0578e+00,  2.3024e-01,
        -4.7430e-01,  2.4138e-01, -1.5008e+00, -8.5667e-02,  2.9222e-01,
         3.3364e-01,  3.6241e-01, -5.3809e-01, -1.8598e-01, -9.4213e-01,
         2.6250e-03,  1.4555e+00,  1.9509e-01,  1.0554e+00, -1.8428e-02,
        -3.7243e-01, -4.7115e-01, -1.2958e+00,  5.0

In [125]:

total_dim = combined_embedding.shape[0]
print(f"Combined embedding: {combined_embedding.shape} (total dim: {total_dim})")


Combined embedding: torch.Size([976]) (total dim: 976)


In [126]:

sentence_word_embeddings = [{
    'word': word,
    'position': word_idx,
    'combined_embedding': combined_embedding,
    'embedding_dim': total_dim,
    'bpe_tokens': bpe_token_list
}]

print(sentence_word_embeddings)

[{'word': 'आत्मा', 'position': 0, 'combined_embedding': tensor([-3.5441e-01,  8.9045e-01, -7.6863e-01,  2.0125e-02,  4.3200e-01,
         8.1939e-01,  7.2777e-01, -4.4999e-01,  3.8844e-01, -1.4485e-01,
         1.3804e+00, -9.5766e-01,  1.3549e+00, -1.4683e+00, -4.8597e-01,
        -6.0899e-01,  9.5523e-01, -8.1439e-01, -4.4529e-01,  1.7476e-01,
        -7.3536e-01,  3.0299e-01,  1.0863e+00, -5.0951e-01,  3.5167e-01,
         1.5451e-01,  4.5579e-01, -4.8449e-01,  5.2294e-01,  1.0047e-01,
         1.0632e+00,  1.1683e+00, -8.4832e-01, -8.1282e-01, -3.3447e-01,
         6.9380e-01,  2.4477e-01, -2.6343e-01,  1.8272e-01,  2.6405e-01,
         2.1926e-01, -1.3593e+00,  4.6413e-01, -9.2849e-01,  5.0993e-01,
        -3.4381e-01, -9.4863e-01,  1.1807e+00, -1.0578e+00,  2.3024e-01,
        -4.7430e-01,  2.4138e-01, -1.5008e+00, -8.5667e-02,  2.9222e-01,
         3.3364e-01,  3.6241e-01, -5.3809e-01, -1.8598e-01, -9.4213e-01,
         2.6250e-03,  1.4555e+00,  1.9509e-01,  1.0554e+00, -1.8428e

In [127]:
# Debug your current function
pos_encodings = get_positional_encoding(len(words), positional_emb_dim)
print(f"All positional encodings shape: {pos_encodings.shape}")
print(f"First few encodings:\n{pos_encodings[:3]}")  # Check first 3 positions

positional_emb = pos_encodings[word_idx]
print(f"Position {word_idx} embedding: {positional_emb.shape}")

All positional encodings shape: torch.Size([45, 512])
First few encodings:
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00]])
Position 0 embedding: torch.Size([512])


In [128]:
all_sentence_embeddings = [{
    'original_sentence': original_sentence,
    'word_embeddings': sentence_word_embeddings
}]

print(all_sentence_embeddings)

[{'original_sentence': 'आत्मा सर्वत्र व्याप्यते। मनः चित्तसंग्रहाय कार्यं करोति। जीवनं दुःखसुखयोः मिश्रितं भवति। धर्मः आत्मनः प्रगतिपथः। अर्थस्य साधनं आवश्यकं, किंतु अहंकारं त्यजेत्। कामे च संयमः धर्मेण सह योजनीयः। मोक्षाय साधना अनिवार्या। सत्यं वद, अहिंसा चर, क्षमा धर्तु। मित्राणां संगः सुखदायकः, शत्रूणां संगः दुखदायकः। गुरुशिष्ययोः संबंधः ज्ञानस्य आधारः।', 'word_embeddings': [{'word': 'आत्मा', 'position': 0, 'combined_embedding': tensor([-3.5441e-01,  8.9045e-01, -7.6863e-01,  2.0125e-02,  4.3200e-01,
         8.1939e-01,  7.2777e-01, -4.4999e-01,  3.8844e-01, -1.4485e-01,
         1.3804e+00, -9.5766e-01,  1.3549e+00, -1.4683e+00, -4.8597e-01,
        -6.0899e-01,  9.5523e-01, -8.1439e-01, -4.4529e-01,  1.7476e-01,
        -7.3536e-01,  3.0299e-01,  1.0863e+00, -5.0951e-01,  3.5167e-01,
         1.5451e-01,  4.5579e-01, -4.8449e-01,  5.2294e-01,  1.0047e-01,
         1.0632e+00,  1.1683e+00, -8.4832e-01, -8.1282e-01, -3.3447e-01,
         6.9380e-01,  2.4477e-01, -2.6343e-01,  1.827

In [129]:

for i, sent_emb in enumerate(final_embeddings):
    print(f"Sentence {i+1}: {sent_emb['original_sentence']}")
    for word_emb in sent_emb['word_embeddings']:
        print(f"  '{word_emb['word']}' → dim: {word_emb['embedding_dim']}")
        print(f"     BPE: {word_emb['bpe_tokens']}")


Sentence 1: आत्मा सर्वत्र व्याप्यते। मनः चित्तसंग्रहाय कार्यं करोति। जीवनं दुःखसुखयोः मिश्रितं भवति। धर्मः आत्मनः प्रगतिपथः। अर्थस्य साधनं आवश्यकं, किंतु अहंकारं त्यजेत्। कामे च संयमः धर्मेण सह योजनीयः। मोक्षाय साधना अनिवार्या। सत्यं वद, अहिंसा चर, क्षमा धर्तु। मित्राणां संगः सुखदायकः, शत्रूणां संगः दुखदायकः। गुरुशिष्ययोः संबंधः ज्ञानस्य आधारः।
  'आत्मा' → dim: 976
     BPE: ['▁आत्', 'मा']
  'सर्वत्र' → dim: 976
     BPE: ['▁स', 'र्', 'व', 'त्र']
  'व्याप्यते' → dim: 976
     BPE: ['▁व', '्या', 'प', '्य', 'त', 'े']
  'मनः' → dim: 976
     BPE: ['▁म', 'नः']
  'चित्तसंग्रहाय' → dim: 976
     BPE: ['▁च', 'ि', 'त्', 'त', 'स', 'ं', 'ग', '्र', 'ह', 'ाय']
  'कार्यं' → dim: 976
     BPE: ['▁क', 'ार', '्य', 'ं']
  'करोति' → dim: 976
     BPE: ['▁क', 'र', 'ो', 'ति']
  'जीवनं' → dim: 976
     BPE: ['▁', 'ज', 'ी', 'व', 'नं']
  'दुःखसुखयोः' → dim: 976
     BPE: ['▁द', 'ु', 'ः', 'ख', 'स', 'ुख', 'योः']
  'मिश्रितं' → dim: 976
     BPE: ['▁मि', 'श', '्र', 'ि', 'तं']
  'भवति' → dim: 976
     BPE: ['▁',

In [130]:

json_ready_data = []
for sentence in final_embeddings:
    sentence_data = {
        'original_sentence': sentence['original_sentence'],
        'word_embeddings': []
    }

    for word_emb in sentence['word_embeddings']:
        word_data = {
            'word': word_emb['word'],
            'position': word_emb['position'],
            'embedding_dim': word_emb['embedding_dim'],
            'bpe_tokens': word_emb['bpe_tokens'],
            'combined_embedding': word_emb['combined_embedding'].tolist()
        }
        sentence_data['word_embeddings'].append(word_data)

    json_ready_data.append(sentence_data)



In [131]:

with open("sanskrit_simple_embeddings.json", "w", encoding="utf-8") as f:
    json.dump(json_ready_data, f, ensure_ascii=False, indent=2)

In [132]:
import torch

torch.save(final_embeddings, "sanskrit_embeddings.pt")

In [133]:
import torch

loaded_embeddings = torch.load("sanskrit_embeddings.pt")

for i, sentence_data in enumerate(loaded_embeddings):

    for word_data in sentence_data['word_embeddings']:
        print(f"\nWORD: '{word_data['word']}'")
        print(f"Position: {word_data['position']}")
        print(f"Embedding Dimension: {word_data['embedding_dim']}")
        print(f"BPE Tokens: {word_data['bpe_tokens']}")

        embedding_tensor = torch.tensor(word_data['combined_embedding'])
        print(f"Embedding Shape: {embedding_tensor.shape}")
        print(f"Embedding Values (first 20): {embedding_tensor[:20].tolist()}")
        print(f"Embedding Mean: {embedding_tensor.mean().item():.4f}")
        print(f"Embedding Std: {embedding_tensor.std().item():.4f}")
        print(f"Embedding Min: {embedding_tensor.min().item():.4f}")
        print(f"Embedding Max: {embedding_tensor.max().item():.4f}")

print(f"\n{len(loaded_embeddings)}")
total_words = sum(len(sentence['word_embeddings']) for sentence in loaded_embeddings)
print(f" {total_words}")


WORD: 'आत्मा'
Position: 0
Embedding Dimension: 976
BPE Tokens: ['▁आत्', 'मा']
Embedding Shape: torch.Size([976])
Embedding Values (first 20): [-0.35441136360168457, 0.8904458284378052, -0.7686281204223633, 0.020124822854995728, 0.43199729919433594, 0.8193949460983276, 0.7277746200561523, -0.4499915540218353, 0.3884449601173401, -0.14485394954681396, 1.3803882598876953, -0.9576629996299744, 1.3549257516860962, -1.468348741531372, -0.48597437143325806, -0.6089906692504883, 0.9552266597747803, -0.8143923878669739, -0.4452933669090271, 0.17475774884223938]
Embedding Mean: 0.2798
Embedding Std: 0.7188
Embedding Min: -2.8959
Embedding Max: 3.5844

WORD: 'सर्वत्र'
Position: 1
Embedding Dimension: 976
BPE Tokens: ['▁स', 'र्', 'व', 'त्र']
Embedding Shape: torch.Size([976])
Embedding Values (first 20): [0.41452211141586304, 0.4189094305038452, -0.3463147282600403, -0.6368589401245117, -0.5745555758476257, -0.2854013442993164, 0.07613047957420349, 0.18094876408576965, -0.1593194305896759, -0.050

  embedding_tensor = torch.tensor(word_data['combined_embedding'])
