In [1]:
import json
import sentencepiece as spm
import os

with open('sanskrit_nlp_analysis.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

sentences = [item["original_sentence"] for item in data["sentences"]]

In [2]:
print("Original Sanskrit Sentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {sentence}")

Original Sanskrit Sentences:
1. गच्छति ग्रामम्।
2. धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।
3. मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय।


In [3]:
temp_file = "sanskrit_sentences.txt"
with open(temp_file, 'w', encoding='utf-8') as f:
    for sentence in sentences:
        f.write(sentence + '\n')

In [4]:
vocab_size = 50
model_prefix = "sanskrit_bpe"

spm.SentencePieceTrainer.train(
    input=temp_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=1.0,
    model_type='bpe'
)

In [5]:
sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")

True

In [6]:
for i, sentence in enumerate(sentences, 1):
    print(f"\n{i}. ORIGINAL: {sentence}")
    bpe_tokens = sp.encode_as_pieces(sentence)
    print(f"   BPE TOKENS: {bpe_tokens}")

for file in [temp_file, f"{model_prefix}.model", f"{model_prefix}.vocab"]:
    if os.path.exists(file):
        os.remove(file)


1. ORIGINAL: गच्छति ग्रामम्।
   BPE TOKENS: ['▁ग', 'च्', 'छत', 'ि', '▁ग', '्', 'र', 'ा', 'म', 'म', '्', '।']

2. ORIGINAL: धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।
   BPE TOKENS: ['▁', 'ध', 'र्', 'मक', '्षेत्रे', '▁क', 'ु', 'र', 'ु', 'क', '्षेत्रे', '▁स', 'म', 'व', 'े', 'ता', '▁', 'यु', 'यु', 'त्', 'स', 'व', 'ः', '।']

3. ORIGINAL: मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय।
   BPE TOKENS: ['▁', 'म', 'ा', 'मक', 'ा', 'ः', '▁', 'प', 'ा', 'ण्', 'डव', 'ा', 'श', '्', 'चै', 'व', '▁क', 'ि', 'मक', 'ु', 'र्', 'व', 'त', '▁स', 'ञ्', 'जय', '।']


In [7]:

for i, sentence in enumerate(sentences, 1):
    bpe_tokens = sp.encode_as_pieces(sentence)
    words = sentence.replace('।', '').split()

    current_word_tokens = []
    word_mappings = []

    for token in bpe_tokens:
        if token.startswith('▁') and current_word_tokens:
            if current_word_tokens:
                word_mappings.append(current_word_tokens)
                current_word_tokens = []
            current_word_tokens.append(token[1:])
        elif token == '▁':
            if current_word_tokens:
                word_mappings.append(current_word_tokens)
                current_word_tokens = []
        else:
            current_word_tokens.append(token)

    if current_word_tokens:
        word_mappings.append(current_word_tokens)

    for word_idx, (word, tokens) in enumerate(zip(words, word_mappings)):
        print(f"     '{word}' → {tokens}")

     'गच्छति' → ['▁ग', 'च्', 'छत', 'ि']
     'ग्रामम्' → ['ग', '्', 'र', 'ा', 'म', 'म', '्', '।']
     'धर्मक्षेत्रे' → ['ध', 'र्', 'मक', '्षेत्रे']
     'कुरुक्षेत्रे' → ['क', 'ु', 'र', 'ु', 'क', '्षेत्रे']
     'समवेता' → ['स', 'म', 'व', 'े', 'ता']
     'युयुत्सवः' → ['', 'यु', 'यु', 'त्', 'स', 'व', 'ः', '।']
     'मामकाः' → ['म', 'ा', 'मक', 'ा', 'ः']
     'पाण्डवाश्चैव' → ['', 'प', 'ा', 'ण्', 'डव', 'ा', 'श', '्', 'चै', 'व']
     'किमकुर्वत' → ['क', 'ि', 'मक', 'ु', 'र्', 'व', 'त']
     'सञ्जय' → ['स', 'ञ्', 'जय', '।']


In [8]:
import torch
import torch.nn as nn

token_vocab_size = 1000
token_emb_dim = 256

pos_vocab_size = 50
pos_emb_dim = 32

deprel_vocab_size = 50
deprel_emb_dim = 16

root_vocab_size = 5000
root_emb_dim = 64

synset_vocab_size = 10000
synset_emb_dim = 64

token_embeddings = nn.Embedding(token_vocab_size, token_emb_dim)
pos_embeddings = nn.Embedding(pos_vocab_size, pos_emb_dim)
deprel_embeddings = nn.Embedding(deprel_vocab_size, deprel_emb_dim)
root_embeddings = nn.Embedding(root_vocab_size, root_emb_dim)
synset_embeddings = nn.Embedding(synset_vocab_size, synset_emb_dim)


In [11]:

from collections import defaultdict

def build_vocabularies(data):
    token_vocab = defaultdict(lambda: len(token_vocab))
    pos_vocab = defaultdict(lambda: len(pos_vocab))
    deprel_vocab = defaultdict(lambda: len(deprel_vocab))
    root_vocab = defaultdict(lambda: len(root_vocab))
    synset_vocab = defaultdict(lambda: len(synset_vocab))

    token_vocab['<UNK>'] = 0
    pos_vocab['<UNK>'] = 0
    deprel_vocab['<UNK>'] = 0
    root_vocab['<UNK>'] = 0
    synset_vocab['<UNK>'] = 0

    for sentence in data['sentences']:
        for annotation in sentence['annotation']:
            # Token text
            token_vocab[annotation['text']]

            # POS tags
            pos_vocab[annotation['pos']]
            pos_vocab[annotation['upos']]

            # Dependency relations
            deprel_vocab[annotation['deprel']]

            # Roots from word analysis
            for word_analysis in annotation['word_analysis']:
                root_vocab[word_analysis['root']]

                # Synsets
                for synset_item in word_analysis['synset_data']:
                    synset_vocab[synset_item['synset']]

    return (dict(token_vocab), dict(pos_vocab), dict(deprel_vocab),
            dict(root_vocab), dict(synset_vocab))


In [12]:

token_vocab, pos_vocab, deprel_vocab, root_vocab, synset_vocab = build_vocabularies(data)

In [14]:
token_emb_dim = 256
pos_emb_dim = 32
deprel_emb_dim = 16
root_emb_dim = 64
synset_emb_dim = 64

token_embeddings = nn.Embedding(len(token_vocab), token_emb_dim)
pos_embeddings = nn.Embedding(len(pos_vocab), pos_emb_dim)
deprel_embeddings = nn.Embedding(len(deprel_vocab), deprel_emb_dim)
root_embeddings = nn.Embedding(len(root_vocab), root_emb_dim)
synset_embeddings = nn.Embedding(len(synset_vocab), synset_emb_dim)

In [15]:
def create_sentence_embeddings(sentence_annotation, vocab_dicts):
    token_vocab, pos_vocab, deprel_vocab, root_vocab, synset_vocab = vocab_dicts

    all_token_embs = []
    all_pos_embs = []
    all_deprel_embs = []
    all_root_embs = []
    all_synset_embs = []

    for annotation in sentence_annotation:
        token_idx = token_vocab.get(annotation['text'], 0)  # 0 is UNK
        token_emb = token_embeddings(torch.tensor(token_idx))
        all_token_embs.append(token_emb)

        pos_idx = pos_vocab.get(annotation['pos'], 0)
        upos_idx = pos_vocab.get(annotation['upos'], 0)
        pos_emb = pos_embeddings(torch.tensor(pos_idx))
        upos_emb = pos_embeddings(torch.tensor(upos_idx))
        all_pos_embs.extend([pos_emb, upos_emb])

        deprel_idx = deprel_vocab.get(annotation['deprel'], 0)
        deprel_emb = deprel_embeddings(torch.tensor(deprel_idx))
        all_deprel_embs.append(deprel_emb)

        for word_analysis in annotation['word_analysis']:
            root_idx = root_vocab.get(word_analysis['root'], 0)
            root_emb = root_embeddings(torch.tensor(root_idx))
            all_root_embs.append(root_emb)

            for synset_item in word_analysis['synset_data']:
                synset_idx = synset_vocab.get(synset_item['synset'], 0)
                synset_emb = synset_embeddings(torch.tensor(synset_idx))
                all_synset_embs.append(synset_emb)

    token_embs_stack = torch.stack(all_token_embs) if all_token_embs else torch.tensor([])
    pos_embs_stack = torch.stack(all_pos_embs) if all_pos_embs else torch.tensor([])
    deprel_embs_stack = torch.stack(all_deprel_embs) if all_deprel_embs else torch.tensor([])
    root_embs_stack = torch.stack(all_root_embs) if all_root_embs else torch.tensor([])
    synset_embs_stack = torch.stack(all_synset_embs) if all_synset_embs else torch.tensor([])

    return {
        'token_embeddings': token_embs_stack,
        'pos_embeddings': pos_embs_stack,
        'deprel_embeddings': deprel_embs_stack,
        'root_embeddings': root_embs_stack,
        'synset_embeddings': synset_embs_stack
    }

In [16]:
vocab_dicts = (token_vocab, pos_vocab, deprel_vocab, root_vocab, synset_vocab)

In [17]:
sentence_embeddings = []
for i, sentence in enumerate(data['sentences']):
    print(f"Processing sentence {i+1}: {sentence['original_sentence']}")
    emb_dict = create_sentence_embeddings(sentence['annotation'], vocab_dicts)
    sentence_embeddings.append(emb_dict)

Processing sentence 1: गच्छति ग्रामम्।
Processing sentence 2: धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।
Processing sentence 3: मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय।


In [21]:
import torch
import torch.nn as nn
import json
from collections import defaultdict

with open('sanskrit_nlp_analysis.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [22]:


def build_vocabularies(data):
    token_vocab = defaultdict(lambda: len(token_vocab))
    pos_vocab = defaultdict(lambda: len(pos_vocab))
    upos_vocab = defaultdict(lambda: len(upos_vocab))
    deprel_vocab = defaultdict(lambda: len(deprel_vocab))
    root_vocab = defaultdict(lambda: len(root_vocab))
    synset_vocab = defaultdict(lambda: len(synset_vocab))
    feats_vocab = defaultdict(lambda: len(feats_vocab))
    category_vocab = defaultdict(lambda: len(category_vocab))
    lemma_vocab = defaultdict(lambda: len(lemma_vocab))

    token_vocab['<UNK>'] = 0
    pos_vocab['<UNK>'] = 0
    upos_vocab['<UNK>'] = 0
    deprel_vocab['<UNK>'] = 0
    root_vocab['<UNK>'] = 0
    synset_vocab['<UNK>'] = 0
    feats_vocab['<UNK>'] = 0
    category_vocab['<UNK>'] = 0
    lemma_vocab['<UNK>'] = 0

    for sentence in data['sentences']:
        for annotation in sentence['annotation']:
            if annotation['text']:
                token_vocab[annotation['text']]
            if annotation['pos']:
                pos_vocab[annotation['pos']]
            if annotation['upos']:
                upos_vocab[annotation['upos']]
            if annotation['deprel']:
                deprel_vocab[annotation['deprel']]
            if annotation['feats']:
                feats_vocab[annotation['feats']]

            if annotation['word_analysis']:
                for word_analysis in annotation['word_analysis']:
                    if word_analysis['root']:
                        root_vocab[word_analysis['root']]
                    if word_analysis['category']:
                        for category in word_analysis['category']:
                            category_vocab[category]
                    if word_analysis['synset_data']:
                        for synset_item in word_analysis['synset_data']:
                            if synset_item['synset']:
                                synset_vocab[synset_item['synset']]
                            if synset_item['lemma_names']:
                                for lemma in synset_item['lemma_names']:
                                    lemma_vocab[lemma]

    return (dict(token_vocab), dict(pos_vocab), dict(upos_vocab),
            dict(deprel_vocab), dict(root_vocab), dict(synset_vocab),
            dict(feats_vocab), dict(category_vocab), dict(lemma_vocab))


Vocabulary Sizes:
Token vocab: 11
POS vocab: 4
UPOS vocab: 4
Deprel vocab: 4
Root vocab: 13
Synset vocab: 8
Feats vocab: 5
Category vocab: 7
Lemma vocab: 69


In [23]:

vocab_dicts = build_vocabularies(data)
token_vocab, pos_vocab, upos_vocab, deprel_vocab, root_vocab, synset_vocab, feats_vocab, category_vocab, lemma_vocab = vocab_dicts

token_emb_dim = 256
pos_emb_dim = 32
upos_emb_dim = 32
deprel_emb_dim = 16
root_emb_dim = 64
synset_emb_dim = 64
feats_emb_dim = 32
category_emb_dim = 32
lemma_emb_dim = 64


In [25]:

token_embeddings = nn.Embedding(len(token_vocab), token_emb_dim)
pos_embeddings = nn.Embedding(len(pos_vocab), pos_emb_dim)
upos_embeddings = nn.Embedding(len(upos_vocab), upos_emb_dim)
deprel_embeddings = nn.Embedding(len(deprel_vocab), deprel_emb_dim)
root_embeddings = nn.Embedding(len(root_vocab), root_emb_dim)
synset_embeddings = nn.Embedding(len(synset_vocab), synset_emb_dim)
feats_embeddings = nn.Embedding(len(feats_vocab), feats_emb_dim)
category_embeddings = nn.Embedding(len(category_vocab), category_emb_dim)
lemma_embeddings = nn.Embedding(len(lemma_vocab), lemma_emb_dim)


In [28]:

def create_token_embeddings(annotation, vocab_dicts):
    (token_vocab, pos_vocab, upos_vocab, deprel_vocab, root_vocab,
     synset_vocab, feats_vocab, category_vocab, lemma_vocab) = vocab_dicts

    token_idx = token_vocab.get(annotation['text'], 0) if annotation['text'] else 0
    pos_idx = pos_vocab.get(annotation['pos'], 0) if annotation['pos'] else 0
    upos_idx = upos_vocab.get(annotation['upos'], 0) if annotation['upos'] else 0
    deprel_idx = deprel_vocab.get(annotation['deprel'], 0) if annotation['deprel'] else 0
    feats_idx = feats_vocab.get(annotation['feats'], 0) if annotation['feats'] else 0

    token_emb = token_embeddings(torch.tensor(token_idx))
    pos_emb = pos_embeddings(torch.tensor(pos_idx))
    upos_emb = upos_embeddings(torch.tensor(upos_idx))
    deprel_emb = deprel_embeddings(torch.tensor(deprel_idx))
    feats_emb = feats_embeddings(torch.tensor(feats_idx))

    root_embs = []
    category_embs = []
    synset_embs = []
    lemma_embs = []

    if annotation['word_analysis']:
        for word_analysis in annotation['word_analysis']:
            if word_analysis['root']:
                root_idx = root_vocab.get(word_analysis['root'], 0)
                root_emb = root_embeddings(torch.tensor(root_idx))
                root_embs.append(root_emb)

            if word_analysis['category']:
                for category in word_analysis['category']:
                    category_idx = category_vocab.get(category, 0)
                    category_emb = category_embeddings(torch.tensor(category_idx))
                    category_embs.append(category_emb)

            if word_analysis['synset_data']:
                for synset_item in word_analysis['synset_data']:
                    if synset_item['synset']:
                        synset_idx = synset_vocab.get(synset_item['synset'], 0)
                        synset_emb = synset_embeddings(torch.tensor(synset_idx))
                        synset_embs.append(synset_emb)

                    if synset_item['lemma_names']:
                        for lemma in synset_item['lemma_names']:
                            lemma_idx = lemma_vocab.get(lemma, 0)
                            lemma_emb = lemma_embeddings(torch.tensor(lemma_idx))
                            lemma_embs.append(lemma_emb)

    avg_root_emb = torch.mean(torch.stack(root_embs), dim=0) if root_embs else torch.zeros(root_emb_dim)
    avg_category_emb = torch.mean(torch.stack(category_embs), dim=0) if category_embs else torch.zeros(category_emb_dim)
    avg_synset_emb = torch.mean(torch.stack(synset_embs), dim=0) if synset_embs else torch.zeros(synset_emb_dim)
    avg_lemma_emb = torch.mean(torch.stack(lemma_embs), dim=0) if lemma_embs else torch.zeros(lemma_emb_dim)

    return {
        'text': annotation['text'],
        'token_emb': token_emb,
        'pos_emb': pos_emb,
        'upos_emb': upos_emb,
        'deprel_emb': deprel_emb,
        'feats_emb': feats_emb,
        'root_emb': avg_root_emb,
        'category_emb': avg_category_emb,
        'synset_emb': avg_synset_emb,
        'lemma_emb': avg_lemma_emb
    }



 processed sentence 1: 'गच्छति ग्रामम्।'
['स', 'ञ्', 'जय', '।']

 processed sentence 2: 'धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः।'
['स', 'ञ्', 'जय', '।']

 processed sentence 3: 'मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय।'
['स', 'ञ्', 'जय', '।']
[]


In [40]:

sentence_embeddings_data = []

for i, sentence in enumerate(data['sentences']):
    sentence_tokens = []
    for j, annotation in enumerate(sentence['annotation']):
        token_emb_dict = create_token_embeddings(annotation, vocab_dicts)
        sentence_tokens.append(token_emb_dict)

    sentence_embeddings_data.append({
        'original_sentence': sentence['original_sentence'],
        'token_embeddings': sentence_tokens
    })

print(f"Created embeddings for {len(sentence_embeddings_data)} sentences")
print(f"First sentence tokens: {[token['text'] for token in sentence_embeddings_data[0]['token_embeddings']]}")
print(sentence_embeddings_data[0]['token_embeddings'][1])


Created embeddings for 3 sentences
First sentence tokens: ['गच्छति', 'ग्रामम्।']
{'text': 'ग्रामम्।', 'token_emb': tensor([-1.5419,  1.2567, -0.1201, -0.7505,  0.1122,  1.2251,  1.5011,  2.7073,
        -1.0551,  0.3698,  0.3526,  0.4196, -2.4235,  0.9096,  0.0746, -0.0574,
         0.7172, -1.4922,  0.7821,  0.1955,  0.6878, -1.3480,  0.2004, -2.0693,
         1.0934,  0.2808, -1.1844,  0.4219,  0.0847,  0.9155, -1.1510, -0.7705,
         0.1779,  0.2910, -0.0265,  0.0880,  0.2620, -2.1098,  0.6813, -1.5978,
         0.4127, -1.2584, -1.2487,  0.4738, -1.2487,  1.8917, -0.1917, -1.0562,
         0.1621, -0.7743,  1.5597,  1.7982, -0.4630, -0.7597,  1.0255, -1.1869,
         0.8289,  1.0214, -0.0325, -0.3919,  2.4126, -0.4691, -0.0146, -0.4110,
         0.7154, -1.4378,  0.3520,  1.2575,  0.6033,  1.2389,  0.7635, -1.6764,
         1.3035, -1.6167, -0.5825,  1.5785,  0.1532,  1.4402, -0.4596, -0.3535,
         1.3310,  1.3627, -1.4788,  0.1574,  1.8643,  0.0310, -1.2977, -1.8781,
     

In [34]:
import json
import math
import numpy as np

def get_positional_encoding(max_length, d_model):

    position = np.arange(max_length)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))

    pos_encoding = np.zeros((max_length, d_model))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)

    return pos_encoding


In [35]:

def create_sentence_positional_embeddings(sentence_data, d_model=512):

    words = [annotation["text"].replace("।", "") for annotation in sentence_data["annotation"]]
    max_length = len(words)

    pos_encoding = get_positional_encoding(max_length, d_model)

    word_positions = []
    for i, word in enumerate(words):
        word_positions.append({
            "word": word,
            "position": i,
            "embedding": pos_encoding[i].tolist()
        })

    return word_positions


In [36]:

def process_sanskrit_json(json_file_path, d_model=512):

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = []

    for sentence in data["sentences"]:
        original_sentence = sentence["original_sentence"]
        positional_embeddings = create_sentence_positional_embeddings(sentence, d_model)

        results.append({
            "original_sentence": original_sentence,
            "positional_embeddings": positional_embeddings
        })

    return results


In [37]:

if __name__ == "__main__":
    json_file_path = "sanskrit_nlp_analysis.json"

    embeddings = process_sanskrit_json(json_file_path, d_model=128)

    for i, result in enumerate(embeddings):
        for word_embedding in result["positional_embeddings"]:
            print(f"  {word_embedding['word']} (position {word_embedding['position']}): "
                  f"embedding length = {len(word_embedding['embedding'])}")

    output_data = {
        "positional_embeddings": embeddings
    }

    with open("sanskrit_positional_embeddings.json", "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)



  गच्छति (position 0): embedding length = 128
  ग्रामम् (position 1): embedding length = 128
  धर्मक्षेत्रे (position 0): embedding length = 128
  कुरुक्षेत्रे (position 1): embedding length = 128
  समवेता (position 2): embedding length = 128
  युयुत्सवः (position 3): embedding length = 128
  मामकाः (position 0): embedding length = 128
  पाण्डवाश्चैव (position 1): embedding length = 128
  किमकुर्वत (position 2): embedding length = 128
  सञ्जय (position 3): embedding length = 128
