In [2]:
# !pip install pyiwn heritage

Collecting pyiwn
  Downloading pyiwn-0.0.5-py3-none-any.whl.metadata (778 bytes)
Collecting heritage
  Downloading heritage-0.1.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading pyiwn-0.0.5-py3-none-any.whl (12 kB)
Downloading heritage-0.1.1-py2.py3-none-any.whl (15 kB)
Installing collected packages: heritage, pyiwn
Successfully installed heritage-0.1.1 pyiwn-0.0.5


In [3]:
import pyiwn
from heritage import HeritagePlatform
import re

# Sanskrit IndoWordNet handle; it is not referenced again in the cells shown here.
iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.SANSKRIT)
# Shared Heritage platform client: analyze_sanskrit_word() calls
# platform.get_analysis() on this module-level object.
platform = HeritagePlatform()


[██████████████████████████████████████████████████]




In [4]:

def analyze_sanskrit_word(word):
    """Look up one Sanskrit token through the shared ``platform`` client.

    Surrounding whitespace and the punctuation marks ``। ॥ . , ! ? ; :`` are
    removed from ``word`` before the lookup.

    Args:
        word: The raw token to analyse.

    Returns:
        A dict with the keys ``'word'`` (the cleaned token), ``'roots'``
        (every truthy root that contains no digit character) and
        ``'categories'`` (every truthy category), in the order the platform
        reported them. ``None`` is returned when the cleaned token is empty,
        or when any step raises; in that case the error is printed instead
        of being propagated.
    """
    try:
        token = re.sub(r'[।॥.,!?;:]', '', word.strip())
        if token == '':
            return None

        response = platform.get_analysis(token, sentence=False, meta=True)

        # Flatten entry -> 'words' -> candidate list into one list of records.
        records = []
        for entry in response.values():
            for candidates in entry['words']:
                for candidate in candidates:
                    records.append({
                        'Text': candidate['text'],
                        'Root': candidate.get('root'),
                        'Category': candidate.get('category'),
                    })

        roots = []
        categories = []
        for record in records:
            root = record['Root']
            # Roots carrying a digit are left out of the result.
            if root and not any(ch.isdigit() for ch in root):
                roots.append(root)
            category = record['Category']
            if category:
                categories.append(category)

        return {'word': token, 'roots': roots, 'categories': categories}

    except Exception as e:
        print(f"Error analyzing word '{word}': {e}")
        return None


In [5]:

def tag_sanskrit_text(text):
    """Append a ``<root=...|category=...>`` tag to each Devanagari token.

    Tokens are the matches of the pattern below: a run of characters in the
    U+0900-U+097F range, optionally followed by a danda, or a lone danda.
    Anything the pattern does not match is dropped from the output.

    Args:
        text: The text to tag.

    Returns:
        The tagged tokens joined by single spaces. A token gets the first
        root and first category reported by ``analyze_sanskrit_word``; the
        category falls back to ``unknown`` when none is reported, and both
        fields are ``unknown`` when the analysis is missing or has no roots.
    """
    pieces = []
    for token in re.findall(r'[\u0900-\u097F]+[।॥]?|[।॥]', text):
        result = analyze_sanskrit_word(token)

        # Start from the fallback and overwrite only what the analysis provides.
        root = 'unknown'
        category = 'unknown'
        if result and result['roots']:
            root = result['roots'][0]
            if result['categories']:
                category = result['categories'][0]

        pieces.append(f"{token}<root={root}|category={category}>")

    return ' '.join(pieces)


In [6]:

def process_sanskrit_file(input_file, output_file):
    """Tag a UTF-8 Sanskrit text file line by line and write the result.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to write (overwritten).

    Returns:
        None. Progress is reported with ``print``. Any exception raised while
        reading, tagging or writing is caught and printed rather than
        propagated.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        # Whitespace-only lines are written back as empty lines, so the output
        # has exactly as many lines as the input.
        tagged_lines = [
            tag_sanskrit_text(raw_line) if raw_line.strip() else ''
            for raw_line in raw_text.split('\n')
        ]

        with open(output_file, 'w', encoding='utf-8') as sink:
            sink.write('\n'.join(tagged_lines))

        print(f"Successfully processed {input_file}")
        print(f"Tagged output saved to {output_file}")

    except Exception as e:
        print(f"Error processing file: {e}")


In [7]:
def quick_tag_word(word):
    """Return ``word`` followed by its ``<root=...|category=...>`` tag.

    The first root and first category reported by ``analyze_sanskrit_word``
    are used. The category falls back to ``unknown`` when none is reported;
    both fields are ``unknown`` when the analysis is missing or has no roots.
    """
    result = analyze_sanskrit_word(word)
    if not result or not result['roots']:
        return f"{word}<root=unknown|category=unknown>"

    root = result['roots'][0]
    categories = result['categories']
    category = categories[0] if categories else 'unknown'
    return f"{word}<root={root}|category={category}>"


In [18]:
# if __name__ == "__main__":
#     process_sanskrit_file('cleaned_output.txt', 'sanskrit_tagged.txt')

In [15]:
# !wget https://www.sanskritlibrary.org/downloads/tagged_corpus.zip

# !git clone https://github.com/UniversalDependencies/UD_Sanskrit-UFAL

# !pip install sanskrit-data
# !pip install indic_transliteration

Collecting indic_transliteration
  Downloading indic_transliteration-2.3.75-py3-none-any.whl.metadata (1.4 kB)
Collecting backports.functools-lru-cache (from indic_transliteration)
  Downloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting roman (from indic_transliteration)
  Downloading roman-5.1-py3-none-any.whl.metadata (4.2 kB)
Collecting click>=8.0.0 (from typer->indic_transliteration)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Downloading indic_transliteration-2.3.75-py3-none-any.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.6/159.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl (6.7 kB)
Downloading roman-5.1-py3-none-any.whl (5.8 kB)
Downloading click-8.3.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstallin

In [16]:
import requests
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

def download_vedic_corpus(timeout=30):
    """Download the UD_Sanskrit-Vedic CoNLL-U files and return them as one string.

    The train, test and dev files are fetched, in that order, from the
    ``master`` branch of the UniversalDependencies/UD_Sanskrit-Vedic
    repository. A file that answers with a non-200 status, or whose request
    raises, is reported with ``print`` and skipped, so the function is
    best-effort and never raises for a failed download.

    Args:
        timeout: Seconds handed to ``requests.get`` for each request. Without
            a timeout a stalled connection would block the notebook forever.

    Returns:
        str: The text of every successfully downloaded file, each followed by
        a newline, concatenated in download order. Empty when nothing could
        be downloaded.
    """
    base_url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Sanskrit-Vedic/master/"

    filenames = (
        "sa_vedic-ud-train.conllu",
        "sa_vedic-ud-test.conllu",
        "sa_vedic-ud-dev.conllu",
    )

    # Collect the pieces and join once instead of growing a multi-megabyte
    # string with repeated "+=".
    chunks = []

    for filename in filenames:
        try:
            url = base_url + filename
            print(f"Downloading: {filename}")
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                chunks.append(response.text + "\n")
                print(f"Downloaded: {len(response.text):,} characters")
            else:
                print(f"Failed: {filename} - Status {response.status_code}")
        except Exception as e:
            # Deliberately broad: one failing file must not abort the others.
            print(f"Error: {e}")

    return "".join(chunks)


In [17]:

def transliterate_to_devanagari(text):
    """Convert ``text`` from the IAST scheme to the Devanagari scheme.

    Args:
        text: The string to transliterate.

    Returns:
        The result of ``transliterate(text, sanscript.IAST,
        sanscript.DEVANAGARI)``. If that call raises, a warning is printed
        and ``text`` is returned unchanged.
    """
    try:
        converted = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    except Exception as e:
        print(f" Transliteration warning: {e}")
        converted = text
    return converted


In [18]:

def parse_conllu_to_tagged_sentences(conllu_text):
    """Turn CoNLL-U text into one tagged string per sentence.

    Every token line (ten or more tab-separated fields) becomes
    ``<devanagari><id=..|form=..|lemma=..|upos=..|xpos=..|feats=..|head=..|deprel=..|deps=..|misc=..>``
    where ``<devanagari>`` is ``transliterate_to_devanagari(form)``. The
    tokens of a sentence are joined with single spaces.

    A blank line ends the current sentence. Lines starting with ``#`` are
    skipped, as are lines with fewer than ten fields. Only the first ten
    fields of a token line are used, and no filtering is done on the ID
    column.

    Args:
        conllu_text: The CoNLL-U content as a single string.

    Returns:
        list[str]: One tagged string per sentence, in input order. A final
        sentence that is not followed by a blank line is still included.
    """
    sentences = []
    current_sentence = []

    for raw_line in conllu_text.split('\n'):
        line = raw_line.strip()

        # The blank-line check must come before the comment check: a combined
        # "blank or comment -> continue" test would swallow the sentence
        # separators and merge the whole corpus into a single sentence.
        if not line:
            if current_sentence:
                sentences.append(' '.join(current_sentence))
                current_sentence = []
            continue

        if line.startswith('#'):
            continue

        parts = line.split('\t')
        if len(parts) < 10:
            continue

        (word_id, form, lemma, upos, xpos,
         feats, head, deprel, deps, misc) = parts[:10]

        devanagari_form = transliterate_to_devanagari(form)

        current_sentence.append(
            f"{devanagari_form}"
            f"<id={word_id}"
            f"|form={form}"
            f"|lemma={lemma}"
            f"|upos={upos}"
            f"|xpos={xpos}"
            f"|feats={feats}"
            f"|head={head}"
            f"|deprel={deprel}"
            f"|deps={deps}"
            f"|misc={misc}>"
        )

    # Flush the last sentence when the input does not end with a blank line.
    if current_sentence:
        sentences.append(' '.join(current_sentence))

    return sentences


In [20]:

def save_complete_corpus():
    """Download, convert and save the annotated Vedic Sanskrit corpus.

    The text returned by ``download_vedic_corpus`` is converted with
    ``parse_conllu_to_tagged_sentences`` and written, one sentence per line,
    to ``vedic_sanskrit_complete_annotated.txt`` in the current working
    directory. ``show_corpus_statistics`` is then called on the sentences.

    Returns:
        None. When the download yields nothing, a message is printed and no
        file is written.
    """
    print("Downloading complete Vedic Sanskrit corpus...")
    raw_conllu = download_vedic_corpus()

    # Nothing came back (every download failed): stop before touching disk.
    if not raw_conllu:
        print("No data downloaded")
        return

    print("Parsing and converting Sanskrit words to Devanagari...")
    tagged_sentences = parse_conllu_to_tagged_sentences(raw_conllu)

    output_file = 'vedic_sanskrit_complete_annotated.txt'
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(sentence + '\n' for sentence in tagged_sentences)

    print(f"\nCOMPLETE ANNOTATED CORPUS SAVED!")
    print(f"Total sentences: {len(tagged_sentences):,}")
    print(f"File: {output_file}")

    show_corpus_statistics(tagged_sentences)


In [21]:

def show_corpus_statistics(sentences):
    """Print word counts, the UPOS tag distribution and three sample sentences.

    Each sentence is split on whitespace; every resulting piece counts as one
    word. For pieces containing ``|upos=``, the tag is the text between that
    marker and the next ``|`` (or the next ``>`` when no ``|`` follows).

    Args:
        sentences: Tagged sentence strings, as produced by
            ``parse_conllu_to_tagged_sentences``.

    Returns:
        None. Everything is reported with ``print``. Tags are listed by
        descending count; tags with equal counts keep first-seen order.
    """
    marker = '|upos='
    total_words = 0
    pos_counts = {}

    for sentence in sentences:
        tokens = sentence.split()
        total_words += len(tokens)

        for token in tokens:
            _, found, rest = token.partition(marker)
            if not found:
                continue
            # `rest` starts right after the marker, so the tag ends at the
            # next field separator or, failing that, at the closing bracket.
            end = rest.find('|')
            if end == -1:
                end = rest.find('>')
            tag = rest[:end]
            pos_counts[tag] = pos_counts.get(tag, 0) + 1

    print(f"\nCORPUS STATISTICS:")
    print(f"Total sentences: {len(sentences):,}")
    print(f"Total words: {total_words:,}")
    print(f"Unique POS tags: {len(pos_counts)}")

    print(f"\nUniversal POS Tag Distribution:")
    # sorted() is stable, so equally frequent tags stay in insertion order.
    for tag in sorted(pos_counts, key=pos_counts.get, reverse=True):
        print(f"  {tag}: {pos_counts[tag]:,}")

    print(f"\nSample sentences with complete annotations:")
    print("=" * 100)
    for number, sentence in enumerate(sentences[:3], 1):
        print(f"{number}. {sentence}")


In [None]:

def check_dependencies():
    """Report whether the ``indic_transliteration`` package can be imported.

    Returns:
        bool: ``True`` when the import succeeds. ``False`` when it raises
        ``ImportError``, after printing installation instructions.
    """
    try:
        # Imported only to probe availability; the name itself is not used.
        import indic_transliteration  # noqa: F401
    except ImportError:
        print("Required package 'indic-transliteration' not found.")
        print("Please install it using: pip install indic-transliteration")
        return False
    else:
        return True

# Entry point: build the corpus only when the dependency check passes.
if __name__ == "__main__":
    if not check_dependencies():
        print("Please install the required dependencies and run again.")
    else:
        save_complete_corpus()

Downloading complete Vedic Sanskrit corpus...
Downloading: sa_vedic-ud-train.conllu
Downloaded: 23,071,401 characters
Downloading: sa_vedic-ud-test.conllu
Downloaded: 2,932,660 characters
Downloading: sa_vedic-ud-dev.conllu
Downloaded: 3,368,814 characters
Parsing and converting Sanskrit words to Devanagari...
