In [158]:
# !pip install requests pyiwn
# !pip install sanskritmorph
# !pip install sanskrit-lexicon
import requests
import pyiwn
import re

In [159]:
import pyiwn
import re
# from sanskritmorph import Analyzer

iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.SANSKRIT)

In [160]:
def get_root_and_pos(word):
    try:
        synsets = iwn.synsets(word)
        for synset in synsets:
            lemmas = synset.lemmas()
            if lemmas:
                root = lemmas[0].name()
                pos = str(synset.pos())
                if pos != 'None':
                    return root, pos
        return word, 'UNKNOWN'
    except:
        return word, 'UNKNOWN'

In [161]:
def get_root_from_iwn(word):
    try:
        synsets = iwn.synsets(word)
        for synset in synsets:
            lemmas = synset.lemmas()
            if lemmas:
                return lemmas[0].name()
        return 'unknown'
    except:
        return 'unknown'

In [162]:
def process_word(word):
    if word in ['।', '॥']:
        return {'text': word, 'root': 'punctuation', 'category': 'PUNC'}

    clean_word = re.sub(r'[।॥.,!?;:]', '', word.strip())
    if not clean_word:
        return {'text': word, 'root': 'punctuation', 'category': 'PUNC'}

    root, pos = get_root_and_pos(clean_word)

    return {
        'text': word,
        'root': root,
        'category': pos
    }

In [163]:
def tag_text(text):
    words = re.findall(r'[\u0900-\u097F]+[।॥]?|[।॥]', text)
    tagged_words = []

    for word in words:
        analysis = process_word(word)
        tagged_words.append(
            f"{analysis['text']}<root={analysis['root']}|pos={analysis['category']}>"
        )

    return ' '.join(tagged_words)

In [164]:
def process_file(input_file, output_file):
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        with open(output_file, 'w', encoding='utf-8') as f:
            for line in lines:
                if line.strip():
                    tagged_line = tag_text(line.strip())
                    f.write(f"{tagged_line}\n")
                else:
                    f.write('\n')

    except Exception as e:
        print(f"Error: {e}")

In [165]:
test_words = ["आत्मा", "सर्वत्र", "व्याप्यते", "मनः"]
for word in test_words:
    result = process_word(word)
    print(f"{word}: root={result['root']}, pos={result['category']}")

आत्मा: root=आत्मा-उपनिषद्, pos=noun
सर्वत्र: root=प्रतिस्थानम्, pos=adverb
व्याप्यते: root=व्याप्यते, pos=UNKNOWN
मनः: root=मतम्, pos=noun


In [166]:
process_file('data.txt', 'sanskrit_pos_output.txt')
# !pip install sanskrit_parser
# !pip install git+https://github.com/avinashvarna/sanskrit_parser.git

In [167]:
import pyiwn
import re
from sanskrit_parser import Parser

iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.SANSKRIT)
parser = Parser()

In [168]:
def get_root_and_pos_with_parser(word):
    try:
        result = parser.parse(word)
        if result and len(result) > 0:
            analysis = result[0]
            root = analysis.get_root()
            pos = analysis.get_pos()
            if root and pos:
                return root, pos
    except:
        pass
    return word, 'UNKNOWN'

In [169]:
def get_root_and_pos_with_iwn(word):
    try:
        synsets = iwn.synsets(word)
        for synset in synsets:
            lemmas = synset.lemmas()
            if lemmas:
                root = lemmas[0].name()
                pos = str(synset.pos())
                if pos != 'None':
                    return root, pos
        return word, 'UNKNOWN'
    except:
        return word, 'UNKNOWN'

In [170]:
def process_word(word):
    if word in ['।', '॥']:
        return {'text': word, 'root': 'punctuation', 'category': 'PUNC'}

    clean_word = re.sub(r'[।॥.,!?;:]', '', word.strip())
    if not clean_word:
        return {'text': word, 'root': 'punctuation', 'category': 'PUNC'}

    root, pos = get_root_and_pos_with_parser(clean_word)

    if pos == 'UNKNOWN':
        root, pos = get_root_and_pos_with_iwn(clean_word)

    return {
        'text': word,
        'root': root,
        'category': pos
    }

In [171]:
def tag_text(text):
    words = re.findall(r'[\u0900-\u097F]+[।॥]?|[।॥]', text)
    tagged_words = []

    for word in words:
        analysis = process_word(word)
        tagged_words.append(
            f"{analysis['text']}<root={analysis['root']}|pos={analysis['category']}>"
        )

    return ' '.join(tagged_words)

In [172]:
def process_file(input_file, output_file):
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        with open(output_file, 'w', encoding='utf-8') as f:
            for line in lines:
                if line.strip():
                    tagged_line = tag_text(line.strip())
                    f.write(f"{tagged_line}\n")
                else:
                    f.write('\n')

    except Exception as e:
        print(f"Error: {e}")

In [173]:
test_words = ["व्याप्यते", "करोति", "भवति", "गच्छति"]
for word in test_words:
    result = process_word(word)
    print(f"{word}: root={result['root']}, pos={result['category']}")

व्याप्यते: root=व्याप्यते, pos=UNKNOWN
करोति: root=करोति, pos=UNKNOWN
भवति: root=भवति, pos=UNKNOWN
गच्छति: root=गच्छति, pos=UNKNOWN


In [174]:
# !pip install sanskrit-data
# !pip install sanskrit-util

In [175]:
def get_root_and_pos(word):
    try:
        synsets = iwn.synsets(word)
        for synset in synsets:
            lemmas = synset.lemmas()
            if lemmas:
                root = lemmas[0].name()
                pos = str(synset.pos())
                if pos != 'None':
                    return root, pos
        return word, 'UNKNOWN'
    except:
        return word, 'UNKNOWN'

In [176]:
def process_word(word):
    if word in ['।', '॥']:
        return {'text': word, 'root': 'punctuation', 'category': 'PUNC'}

    clean_word = re.sub(r'[।॥.,!?;:]', '', word.strip())
    if not clean_word:
        return {'text': word, 'root': 'punctuation', 'category': 'PUNC'}

    root, pos = get_root_and_pos(clean_word)

    return {
        'text': word,
        'root': root,
        'category': pos
    }

In [177]:
def tag_text(text):
    words = re.findall(r'[\u0900-\u097F]+[।॥]?|[।॥]', text)
    tagged_words = []

    for word in words:
        analysis = process_word(word)
        tagged_words.append(
            f"{analysis['text']}<root={analysis['root']}|pos={analysis['category']}>"
        )

    return ' '.join(tagged_words)

In [178]:
def process_file(input_file, output_file):
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        with open(output_file, 'w', encoding='utf-8') as f:
            for line in lines:
                if line.strip():
                    tagged_line = tag_text(line.strip())
                    f.write(f"{tagged_line}\n")
                else:
                    f.write('\n')

    except Exception as e:
        print(f"Error: {e}")

In [179]:
process_file('data.txt', 'sanskrit_output.txt')

In [180]:
# !wget https://www.sanskritlibrary.org/downloads/tagged_corpus.zip

# !git clone https://github.com/UniversalDependencies/UD_Sanskrit-UFAL

# !pip install sanskrit-data

In [181]:
# UoH Sanskrit Treebank example
def load_ud_sanskrit():
    """Load Universal Dependencies Sanskrit data"""
    try:
        with open('UD_Sanskrit-UFAL/sa_ufal-ud-train.conllu', 'r', encoding='utf-8') as f:
            return f.read()
    except:
        return "Dataset not found"

# Sample of what you'll get:
"""
1	रामः	राम	NOUN	_	Case=Nom|Gender=Masc|Number=Sing	0	_	_	_
2	वनम्	वन	NOUN	_	Case=Acc|Gender=Neut|Number=Sing	3	obj	_	_
3	गच्छति	गम्	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|Voice=Act	0	_	_	_
"""

'\n1\tरामः\tराम\tNOUN\t_\tCase=Nom|Gender=Masc|Number=Sing\t0\t_\t_\t_\n2\tवनम्\tवन\tNOUN\t_\tCase=Acc|Gender=Neut|Number=Sing\t3\tobj\t_\t_\n3\tगच्छति\tगम्\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|Voice=Act\t0\t_\t_\t_\n'

In [182]:
import requests
import json

def download_sanskrit_pos_data():
    """Download pre-tagged Sanskrit data"""
    urls = [
        "https://raw.githubusercontent.com/UniversalDependencies/UD_Sanskrit-UFAL/master/sa_ufal-ud-train.conllu",
        "https://raw.githubusercontent.com/UniversalDependencies/UD_Sanskrit-Vedic/master/sa_vedic-ud-test.conllu"
    ]

    for url in urls:
        try:
            response = requests.get(url)
            if response.status_code == 200:
                filename = url.split('/')[-1]
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f"Downloaded: {filename}")
        except:
            print(f"Failed: {url}")

download_sanskrit_pos_data()

Downloaded: sa_vedic-ud-test.conllu


In [183]:
import requests

def download_and_show_tagged_data():
    url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Sanskrit-UFAL/master/sa_ufal-ud-test.conllu"

    try:
        response = requests.get(url)
        if response.status_code == 200:
            lines = response.text.split('\n')
            tagged_examples = []

            for line in lines:
                if line.strip() and not line.startswith('#'):
                    parts = line.split('\t')
                    if len(parts) >= 4:
                        word = parts[1]
                        pos = parts[3]
                        lemma = parts[2]
                        tagged_examples.append(f"{word}<root={lemma}|pos={pos}>")

                        if len(tagged_examples) >= 20:
                            break

            return tagged_examples
        else:
            return ["Download failed"]
    except Exception as e:
        return [f"Error: {e}"]

tagged_data = download_and_show_tagged_data()
for item in tagged_data:
    print(item)

पञ्चतन्त्रम्<root=पञ्चतन्त्र|pos=PROPN>
कथामुखम्<root=कथामुख|pos=NOUN>
ओं<root=ओं|pos=INTJ>
नमः<root=नमस्|pos=NOUN>
श्रीशारदागणपतिगुरुभ्यः<root=_|pos=_>
श्री<root=श्री|pos=ADJ>
शारदा<root=शारदा|pos=PROPN>
गणपति<root=गणपति|pos=PROPN>
गुरुभ्यः<root=गुरु|pos=NOUN>
।<root=।|pos=PUNCT>
महाकविभ्यो<root=_|pos=_>
महा<root=महत्|pos=NOUN>
कविभ्यो<root=कवि|pos=NOUN>
नमः<root=नमस्|pos=NOUN>
।<root=।|pos=PUNCT>
ब्रह्मा<root=ब्रह्मन्|pos=PROPN>
रुद्रः<root=रुद्र|pos=PROPN>
कुमारो<root=कुमार|pos=PROPN>
हरिवरुणयमा<root=_|pos=_>
हरि<root=हरि|pos=PROPN>


In [184]:
import requests
import zipfile
import io
import os
import re

def download_and_extract_ud_dataset(lang_name, zip_url):
    print(f"\nDownloading {lang_name} dataset...")
    response = requests.get(zip_url, timeout=60)
    if response.status_code != 200:
        print(f" Failed to download {lang_name} ({response.status_code})")
        return None

    folder_name = lang_name.replace("-", "_") + "_data"
    os.makedirs(folder_name, exist_ok=True)

    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(folder_name)
    print(f" Extracted {lang_name} to {folder_name}/")
    return folder_name



In [185]:

def parse_conllu_to_tagged(conllu_text):
    sentences = []
    current_sentence = []
    for line in conllu_text.split('\n'):
        line = line.strip()
        if not line:
            if current_sentence:
                sentences.append(' '.join(current_sentence))
                current_sentence = []
            continue
        if line.startswith("#"):
            continue
        parts = line.split('\t')
        if len(parts) >= 4:
            word, lemma, pos = parts[1], parts[2], parts[3]
            if re.search(r'[\u0900-\u097F]', word):
                current_sentence.append(f"{word}<root={lemma}|pos={pos}>")
    if current_sentence:
        sentences.append(' '.join(current_sentence))
    return sentences



In [186]:


def load_conllu_from_folder(folder_path):
    sentences = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".conllu"):
                path = os.path.join(root, file)
                try:
                    with open(path, 'r', encoding='utf-8') as f:
                        text = f.read()
                    sents = parse_conllu_to_tagged(text)
                    sentences.extend(sents)
                    print(f"Parsed {len(sents)} from {file}")
                except Exception as e:
                    print(f" Error reading {file}: {e}")
    return sentences




In [187]:

def show_corpus_stats(sentences):
    total_words = 0
    pos_distribution = {}

    for sent in sentences:
        words = sent.split()
        total_words += len(words)
        for word in words:
            m = re.search(r'\|pos=([^>]+)', word)
            if m:
                pos = m.group(1)
                pos_distribution[pos] = pos_distribution.get(pos, 0) + 1

    print(f"\n CORPUS STATISTICS:")
    print(f"Total sentences: {len(sentences):,}")
    print(f"Total words: {total_words:,}")
    print(f"Unique POS tags: {len(pos_distribution)}")

    print("\n POS Tag Distribution:")
    for pos, count in sorted(pos_distribution.items(), key=lambda x: x[1], reverse=True)[:20]:
        print(f"  {pos}: {count:,}")


    # for i, sent in enumerate(sentences[:3], 1):
    #     print(f"{i}. {sent}\n")


In [188]:



def build_large_indic_corpus():
    datasets = {
        "UD_Sanskrit-UFAL": "https://github.com/UniversalDependencies/UD_Sanskrit-UFAL/archive/refs/tags/r2.12.zip",
        "UD_Hindi-HDTB": "https://github.com/UniversalDependencies/UD_Hindi-HDTB/archive/refs/tags/r2.12.zip",
    }

    all_sentences = []
    for name, url in datasets.items():
        folder = download_and_extract_ud_dataset(name, url)
        if folder:
            sentences = load_conllu_from_folder(folder)
            all_sentences.extend(sentences)

    if not all_sentences:
        print(" No sentences found. Exiting.")
        return

    out_file = "indic_large_tagged_corpus.txt"
    with open(out_file, "w", encoding="utf-8") as f:
        for sent in all_sentences:
            f.write(sent + "\n")

    print(f"\nSUCCESS! Combined corpus saved to: {os.path.abspath(out_file)}")
    print(f"Total pre-tagged sentences: {len(all_sentences):,}\n")
    show_corpus_stats(all_sentences)


if __name__ == "__main__":
    build_large_indic_corpus()


Downloading UD_Sanskrit-UFAL dataset...
 Extracted UD_Sanskrit-UFAL to UD_Sanskrit_UFAL_data/
Parsed 230 from sa_ufal-ud-test.conllu
Parsed 1999 from parsed.conllu
Parsed 1569 from combined.conllu

Downloading UD_Hindi-HDTB dataset...
 Extracted UD_Hindi-HDTB to UD_Hindi_HDTB_data/
Parsed 1659 from hi_hdtb-ud-dev.conllu
Parsed 13306 from hi_hdtb-ud-train.conllu
Parsed 1684 from hi_hdtb-ud-test.conllu

SUCCESS! Combined corpus saved to: /content/indic_large_tagged_corpus.txt
Total pre-tagged sentences: 20,447


 CORPUS STATISTICS:
Total sentences: 20,447
Total words: 383,765
Unique POS tags: 17

 POS Tag Distribution:
  NOUN: 90,479
  ADP: 73,221
  PROPN: 43,082
  VERB: 39,581
  ADJ: 25,426
  AUX: 23,772
  PUNCT: 19,224
  PRON: 17,672
  X: 9,008
  PART: 8,856
  DET: 7,995
  CCONJ: 7,133
  SCONJ: 6,819
  NUM: 6,354
  ADV: 4,876
  _: 259
  INTJ: 8


In [189]:
import requests
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

def download_vedic_corpus():
    """Download complete Vedic Sanskrit corpus"""
    base_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Sanskrit-Vedic/master/"

    files = [
        "sa_vedic-ud-train.conllu",
        "sa_vedic-ud-test.conllu",
        "sa_vedic-ud-dev.conllu"
    ]

    all_content = ""

    for file in files:
        try:
            url = base_url + file
            print(f"Downloading: {file}")
            response = requests.get(url)
            if response.status_code == 200:
                all_content += response.text + "\n"
                print(f"Downloaded: {len(response.text):,} characters")
            else:
                print(f"Failed: {file} - Status {response.status_code}")
        except Exception as e:
            print(f"Error: {e}")

    return all_content


In [190]:

def transliterate_to_devanagari(text):
    try:
        return transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    except Exception as e:
        print(f" Transliteration warning: {e}")
        return text


In [191]:

def parse_conllu_to_tagged_sentences(conllu_text):
    sentences = []
    current_sentence = []

    lines = conllu_text.split('\n')

    for line in lines:
        line = line.strip()

        if not line or line.startswith('#'):
            continue

        if line == '':
            if current_sentence:
                tagged_sentence = ' '.join(current_sentence)
                sentences.append(tagged_sentence)
                current_sentence = []
            continue

        parts = line.split('\t')
        if len(parts) >= 10:
            word_id = parts[0]
            form = parts[1]
            lemma = parts[2]
            upos = parts[3]
            xpos = parts[4]
            feats = parts[5]
            head = parts[6]
            deprel = parts[7]
            deps = parts[8]
            misc = parts[9]

            devanagari_form = transliterate_to_devanagari(form)

            tagged_word = (
                f"{devanagari_form}"
                f"<id={word_id}"
                f"|form={form}"
                f"|lemma={lemma}"
                f"|upos={upos}"
                f"|xpos={xpos}"
                f"|feats={feats}"
                f"|head={head}"
                f"|deprel={deprel}"
                f"|deps={deps}"
                f"|misc={misc}>"
            )
            current_sentence.append(tagged_word)

    if current_sentence:
        tagged_sentence = ' '.join(current_sentence)
        sentences.append(tagged_sentence)

    return sentences


In [192]:

def save_complete_corpus():


    conllu_text = download_vedic_corpus()

    if not conllu_text:
        print("No data downloaded")
        return

    print("Parsing and converting Sanskrit words to Devanagari...")
    sentences = parse_conllu_to_tagged_sentences(conllu_text)

    output_file = 'vedic_sanskrit_complete_annotated.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        for sent in sentences:
            f.write(sent + '\n')

    print(f"\nCOMPLETE ANNOTATED CORPUS SAVED!")
    print(f"Total sentences: {len(sentences):,}")
    print(f"File: {output_file}")

    show_corpus_statistics(sentences)


In [193]:

def show_corpus_statistics(sentences):
    """Show detailed corpus statistics"""
    total_words = 0
    pos_counts = {}

    for sent in sentences:
        words = sent.split()
        total_words += len(words)

        for word in words:
            if '|upos=' in word:
                pos_start = word.find('|upos=') + 6
                pos_end = word.find('|', pos_start)
                if pos_end == -1:
                    pos_end = word.find('>', pos_start)
                pos = word[pos_start:pos_end]
                pos_counts[pos] = pos_counts.get(pos, 0) + 1

    print(f"\nCORPUS STATISTICS:")
    print(f"Total sentences: {len(sentences):,}")
    print(f"Total words: {total_words:,}")
    print(f"Unique POS tags: {len(pos_counts)}")

    print(f"\nUniversal POS Tag Distribution:")
    for pos, count in sorted(pos_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {pos}: {count:,}")

    # print(f"\nSample sentences with complete annotations:")

    # for i, sent in enumerate(sentences[:3], 1):
    #     print(f"{i}. {sent}")


In [194]:

def check_dependencies():
    """Check if required packages are installed"""
    try:
        import indic_transliteration
        return True
    except ImportError:
        print("Required package 'indic-transliteration' not found.")
        print("Please install it using: pip install indic-transliteration")
        return False

if __name__ == "__main__":
    if check_dependencies():
        save_complete_corpus()
    else:
        print("Please install the required dependencies and run again.")

Downloading: sa_vedic-ud-train.conllu
Downloaded: 23,071,401 characters
Downloading: sa_vedic-ud-test.conllu
Downloaded: 2,932,660 characters
Downloading: sa_vedic-ud-dev.conllu
Downloaded: 3,368,814 characters
Parsing and converting Sanskrit words to Devanagari...

COMPLETE ANNOTATED CORPUS SAVED!
Total sentences: 1
File: vedic_sanskrit_complete_annotated.txt

CORPUS STATISTICS:
Total sentences: 1
Total words: 206,440
Unique POS tags: 13

Universal POS Tag Distribution:
  NOUN: 72,315
  VERB: 39,836
  PRON: 27,825
  PART: 21,136
  ADJ: 18,315
  ADV: 13,863
  CCONJ: 4,222
  NUM: 2,900
  AUX: 1,825
  SCONJ: 1,730
  ADP: 1,383
  DET: 565
  INTJ: 525
