In [2]:
with open('vedic_corpus_tagged.txt', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 10: break
        print(line.strip())

<BOS> पूर्वस्य<id=1|form=pūrvasya|lemma=pūrva|upos=PRON|xpos=_|feats=Case=Gen|Gender=Masc|Number=Sing|head=2|deprel=orphan|deps=_|misc=LemmaId=145625|OccId=4495389|Unsandhied=pūrvasya|Annotator=Sv|unsandhied=pūrvasya|annotator=Sv|lemmaid=145625|occid=4495389|ismantra=_|punctuation=_> मेधाजननानि<id=2|form=medhājananāni|lemma=medhājanana|upos=NOUN|xpos=_|feats=Case=Nom|Gender=Neut|Number=Plur|head=0|deprel=root|deps=_|misc=LemmaId=72086|OccId=4495390|Unsandhied=medhājananāni|Annotator=Sv|Punctuation=fullStop|unsandhied=medhājananāni|annotator=Sv|lemmaid=72086|occid=4495390|ismantra=_|punctuation=fullStop> <EOS>
<BOS> शुक<id=1|form=śuka|lemma=śuka|upos=NOUN|xpos=_|feats=Compound=Yes|head=3|deprel=compound:coord|deps=_|misc=LemmaId=126235|OccId=4495391|Unsandhied=śuka|Annotator=Au|unsandhied=śuka|annotator=Au|lemmaid=126235|occid=4495391|ismantra=_|punctuation=_> सारि<id=2|form=sāri|lemma=sāri|upos=NOUN|xpos=_|feats=Compound=Yes|head=3|deprel=compound:coord|deps=_|misc=LemmaId=113121|OccId

In [3]:
import re

def clean_sanskrit_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []

    for line in lines:
        cleaned_line = re.sub(r'<id=.*?>', '', line)
        cleaned_line = cleaned_line.replace('<BOS>', '').replace('<EOS>', '')
        cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()

        if cleaned_line:
            cleaned_lines.append(cleaned_line + '\n')

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(cleaned_lines)

    print(f"Cleaning complete! {len(cleaned_lines)} lines processed.")
    print("Sample cleaned lines:")
    for i in range(min(3, len(cleaned_lines))):
        print(f"{i+1}: {cleaned_lines[i].strip()}")



In [4]:
def clean_sanskrit_data_v2(input_file, output_file):

    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    pattern = r'<BOS>\s*((?:\S+?<id=.*?>\s*)+)\s*<EOS>'
    sentences = re.findall(pattern, content)

    cleaned_sentences = []

    for sentence in sentences:
        clean_sentence = re.sub(r'<id=.*?>', '', sentence)
        clean_sentence = re.sub(r'\s+', ' ', clean_sentence).strip()
        cleaned_sentences.append(clean_sentence + '\n')

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(cleaned_sentences)

    print(f"Cleaning complete! {len(cleaned_sentences)} sentences processed.")
    return cleaned_sentences


In [5]:
def clean_with_preserved_info(input_file, output_file):

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []

    for line in lines:
        words = re.findall(r'(\S+?)<id=.*?\|form=([^|]+)\|lemma=[^|]+\|upos=([^|]+)\|[^>]*>', line)

        if words:
            clean_sentence = ' '.join([f"{word[1]}_{word[2]}" for word in words])
            cleaned_lines.append(clean_sentence + '\n')

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(cleaned_lines)

    print(f"Cleaning with POS tags complete! {len(cleaned_lines)} lines processed.")
    return cleaned_lines

In [7]:
if __name__ == "__main__":
    input_file = "vedic_corpus_tagged.txt"
    output_file = "cleaned_dataset.txt"

    clean_sanskrit_data(input_file, output_file)

Cleaning complete! 31264 lines processed.
Sample cleaned lines:
1: पूर्वस्य मेधाजननानि
2: शुक सारि कृशानाम् जिह्वाः बध्नाति
3: आशयति


In [8]:
import re

def remove_tags(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []

    for line in lines:
        line = re.sub(r'id=[^|]+\|', '', line)
        line = re.sub(r'misc=LemmaId=[^|]+\|', 'misc=', line)
        line = re.sub(r'OccId=[^|]+\|', '', line)
        line = re.sub(r'lemmaid=[^|]+\|', '', line)
        line = re.sub(r'occid=[^|]+\|', '', line)

        cleaned_lines.append(line)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(cleaned_lines)

    print("Tags removed successfully!")

remove_tags("vedic_corpus_tagged.txt", "cleaned_dataset_tagged.txt")

Tags removed successfully!


In [10]:
# !pip install indic-transliteration

In [11]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import re

def convert_latin_to_devanagari(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    def convert_match(match):
        latin_text = match.group(1)
        devanagari_text = transliterate(latin_text, sanscript.IAST, sanscript.DEVANAGARI)
        return devanagari_text

    content = re.sub(r'form=([^|]+)', lambda m: f'form={convert_match(m)}', content)
    content = re.sub(r'lemma=([^|]+)', lambda m: f'lemma={convert_match(m)}', content)
    content = re.sub(r'Unsandhied=([^|]+)', lambda m: f'Unsandhied={convert_match(m)}', content)
    content = re.sub(r'unsandhied=([^|]+)', lambda m: f'unsandhied={convert_match(m)}', content)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(content)

    print("Conversion complete!")



In [13]:
convert_latin_to_devanagari("cleaned_dataset_tagged.txt", "output_devanagari_tagged.txt")

Conversion complete!


In [1]:
import re
import json
from transformers import AutoTokenizer


In [2]:
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
TAG_PATTERN = re.compile(r'(\S+)<form=(.*?)\|(.*?)>')


In [4]:
def parse_key_value_block(block):
    tags = {}
    parts = block.split("|")
    for p in parts:
        if "=" in p:
            k, v = p.split("=", 1)
            tags[k.lower()] = "" if v == "_" else v
    return tags

In [5]:

def parse_token(token_with_tag):

    match = TAG_PATTERN.search(token_with_tag)
    if not match:
        return None

    surface_word = match.group(1)
    form = match.group(2)
    rest = match.group(3)

    fields = rest.split("|")

    tags = {
        "lemma": "",
        "upos": "",
        "xpos": "",
        "head": "",
        "deprel": "",
        "deps": "",
        "misc": "",
        "feats": {}
    }

    for field in fields:
        if "=" not in field:
            continue
        key, value = field.split("=", 1)
        value = "" if value == "_" else value

        key = key.lower()

        if key == "feats":
            tags["feats"] = parse_key_value_block(value)
        else:
            tags[key] = value

    return surface_word, tags


In [6]:
def process_sentence(line):

    line = line.replace("<BOS>", "").replace("<EOS>", "").strip()

    tokens = []
    for token in line.split():
        parsed = parse_token(token)
        if parsed is None:
            continue
        word, tag_dict = parsed

        # tokenize into subwords
        subwords = tokenizer.tokenize(word)

        tokens.append({
            "word": word,
            "subwords": subwords,
            "tags": {
                "lemma": tag_dict["lemma"],
                "upos": tag_dict["upos"],
                "xpos": tag_dict["xpos"],
                "head": tag_dict["head"],
                "deprel": tag_dict["deprel"],
                "deps": tag_dict["deps"],
                "misc": tag_dict["misc"],
                "feats": tag_dict["feats"]
            }
        })

    return tokens


In [8]:
input_file = "output_devanagari_tagged.txt"
output_file = "final_tagged.jsonl"

with open(input_file, "r", encoding="utf-8") as fin, \
     open(output_file, "w", encoding="utf-8") as fout:

    for line in fin:
        line = line.strip()
        if not line:
            continue

        sentence_data = process_sentence(line)
        fout.write(json.dumps(sentence_data, ensure_ascii=False) + "\n")

print(output_file)

final_tagged.jsonl




> Embedding Final

    x₀[i] = SUM (
            subword_emb[i],
            main_word_emb,
            pos_emb,
            case_emb,
            gender_emb,
            number_emb,
            deprel_emb,
            lemma_emb,
            positional_emb[i],
            unsandhi_emb
        )

In [36]:
import json
import sentencepiece as spm
import numpy as np
from collections import defaultdict

with open('Data.jsonl', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [37]:
text_data = []
for line in lines:
    data = json.loads(line)
    for item in data:
        text_data.append(item['word'])
        text_data.extend(item['subwords'])
        text_data.append(item['tags']['lemma'])

In [38]:
with open('sanskrit_text.txt', 'w', encoding='utf-8') as f:
    for text in text_data:
        f.write(text + '\n')

spm.SentencePieceTrainer.train(
    input='sanskrit_text.txt',
    model_prefix='sanskrit_sp',
    vocab_size=100,
    character_coverage=1.0,
    model_type='bpe'
)

In [39]:
sp = spm.SentencePieceProcessor()
sp.load('sanskrit_sp.model')


True

In [40]:
emb_dim = 256
vocab_size = sp.get_piece_size()

In [41]:
subword_emb = np.random.randn(vocab_size, emb_dim) * 0.1
main_word_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
pos_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
xpos_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
head_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
deprel_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
deps_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
misc_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
lemma_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
case_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
gender_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
number_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
mood_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)
compound_emb = defaultdict(lambda: np.random.randn(emb_dim) * 0.1)


In [42]:
all_embeddings = []
reconstructed_data = []


In [43]:
for line_idx, line in enumerate(lines):
    data = json.loads(line)
    sentence_embeddings = []
    sentence_reconstructed = []

    for i, item in enumerate(data):
        word = item['word']
        tags = item['tags']
        feats = tags.get('feats', {})

        subword_ids = [sp.piece_to_id(sw) for sw in item['subwords']]
        lemma_ids = [sp.piece_to_id(lem) for lem in [tags.get('lemma', 'UNK')]]

        pos = tags.get('upos', 'UNK')
        xpos = tags.get('xpos', 'UNK')
        head = tags.get('head', 'UNK')
        deprel = tags.get('deprel', 'UNK')
        deps = tags.get('deps', 'UNK')
        misc = tags.get('misc', 'UNK')
        lemma = tags.get('lemma', 'UNK')

        case = feats.get('case', 'UNK')
        gender = feats.get('gender', 'UNK')
        number = feats.get('number', 'UNK')
        mood = feats.get('mood', 'UNK')
        compound = feats.get('compound', 'UNK')

        word_embedding = np.zeros(emb_dim)

        for j, subword_id in enumerate(subword_ids):
            position = i + j
            d_model = emb_dim
            angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model) // 2)) / np.float32(d_model))
            angle_rads = position * angle_rates
            angle_rads[0::2] = np.sin(angle_rads[0::2])
            angle_rads[1::2] = np.cos(angle_rads[1::2])
            v15 = angle_rads

            v1 = subword_emb[subword_id]
            v2 = main_word_emb[word]
            v3 = pos_emb[pos]
            v4 = xpos_emb[xpos]
            v5 = head_emb[head]
            v6 = deprel_emb[deprel]
            v7 = deps_emb[deps]
            v8 = misc_emb[misc]
            v9 = lemma_emb[lemma]
            v10 = case_emb[case]
            v11 = gender_emb[gender]
            v12 = number_emb[number]
            v13 = mood_emb[mood]
            v14 = compound_emb[compound]

            final_emb = v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9 + v10 + v11 + v12 + v13 + v14 + v15
            word_embedding += final_emb

        for lemma_id in lemma_ids:
            v_lemma = subword_emb[lemma_id]
            word_embedding += v_lemma

        word_embedding /= (len(subword_ids) + len(lemma_ids))
        sentence_embeddings.append(word_embedding)

        reconstructed_word = {
            'original_word': word,
            'subwords': item['subwords'],
            'pos': pos,
            'xpos': xpos,
            'head': head,
            'deprel': deprel,
            'deps': deps,
            'misc': misc,
            'lemma': lemma,
            'case': case,
            'gender': gender,
            'number': number,
            'mood': mood,
            'compound': compound,
            'embedding': word_embedding.tolist()
        }
        sentence_reconstructed.append(reconstructed_word)

    all_embeddings.append(sentence_embeddings)
    reconstructed_data.append(sentence_reconstructed)

In [44]:
with open('reconstructed_data.json', 'w', encoding='utf-8') as f:
    json.dump(reconstructed_data, f, ensure_ascii=False, indent=2)


In [45]:
embeddings_array = []
for sentence in all_embeddings:
    for word_emb in sentence:
        embeddings_array.append(word_emb)


In [47]:
embeddings_array = np.array(embeddings_array)
print(embeddings_array)
np.save('sanskrit_embeddings.npy', embeddings_array)

print(f"Created embeddings shape: {embeddings_array.shape}")
print(f"Processed {len(reconstructed_data)} sentences")
print(f"Vocabulary size: {vocab_size}")

[[ 0.20195313  0.32794686  0.25259537 ...  0.49416306  0.13394194
   0.74348156]
 [ 0.00341537 -0.09511101 -0.13156451 ...  0.19175072 -0.09728796
   1.16271199]
 [ 0.63429399  0.34739523  0.37364202 ...  0.40740711 -0.16126068
   0.91898578]
 ...
 [-0.06784051 -0.45031746 -0.10289693 ...  0.26853123  0.01377409
   0.99812977]
 [-0.38531976  0.05354616 -0.5236845  ...  0.66276339  0.13288216
   1.39590881]
 [-0.13415877  0.14323997 -0.51944295 ...  0.40227675  0.16255192
   0.85157981]]
Created embeddings shape: (7, 256)
Processed 2 sentences
Vocabulary size: 100
