In [3]:
from datasets import load_dataset
ds = load_dataset(path="ai4bharat/IndicCorpV2", split= "hin_Deva", streaming="True")

In [1]:
import re
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [4]:
def sentence_tokenizer(text):
    sentence_split = re.compile(r'(?<=[।!?])\s+|(?<=[.!?])\s+')
    sentences = sentence_split.split(text.strip())
    return [s.strip() for s in sentences if s.strip()]

def word_tokenizer(sentence):
    word_pattern = re.compile(
        r'[\u0900-\u097F]+|'
        r'\d+\.\d+|'
        r'\d+|'
        r'[\w\.-]+@[\w\.-]+|'
        r'\w+://\S+|'
        r'[^\s\w]',
        re.UNICODE
    )
    return word_pattern.findall(sentence)

def tokenize_paragraph(paragraph):
    sentences = sentence_tokenizer(paragraph)
    return [word_tokenizer(sent) for sent in sentences]

In [5]:
text = "मैं बाजार गया। फिर मैंने खाना खाया! What about you? Visit http://example.com"
tokenized = tokenize_paragraph(text)

for i, sent in enumerate(tokenized, 1):
    print(f"Sentence {i}: {sent}")


Sentence 1: ['मैं', 'बाजार', 'गया।']
Sentence 2: ['फिर', 'मैंने', 'खाना', 'खाया', '!']
Sentence 3: ['?']
Sentence 4: ['http://example.com']


In [None]:
import json

paragraphs = []
count_written = 0

for i, item in enumerate(ds):
    if count_written >= 1000:
        break

    text = item.get("text", "").strip()
    if not text:
        continue

    sentence_texts = sentence_tokenizer(text)
    paragraph = {
        "paragraph_id": count_written,
        "sentences": []
    }

    for sent_text in sentence_texts:
        tokens = word_tokenizer(sent_text)
        if tokens:
            paragraph["sentences"].append({
                "text": sent_text,
                "tokens": tokens
            })

    if paragraph["sentences"]:
        paragraphs.append(paragraph)
        count_written += 1

    if i % 100 == 0:
        print(f"Scanned {i} entries, written {count_written} paragraphs...")

# Save to JSON file
output_file = "tokenized_hi.json"
with open(output_file, "w", encoding="utf-8") as fout:
    json.dump(paragraphs, fout, ensure_ascii=False, indent=2)

print(f"\nDone. Written {count_written} paragraphs to {output_file}")


Scanned 0 entries, written 1 paragraphs...
Scanned 100 entries, written 51 paragraphs...
Scanned 200 entries, written 101 paragraphs...
Scanned 300 entries, written 151 paragraphs...
Scanned 400 entries, written 201 paragraphs...
Scanned 500 entries, written 251 paragraphs...
Scanned 600 entries, written 301 paragraphs...
Scanned 700 entries, written 351 paragraphs...
Scanned 800 entries, written 401 paragraphs...
Scanned 900 entries, written 451 paragraphs...
Scanned 1000 entries, written 501 paragraphs...
Scanned 1100 entries, written 551 paragraphs...
Scanned 1200 entries, written 601 paragraphs...
Scanned 1300 entries, written 651 paragraphs...
Scanned 1400 entries, written 701 paragraphs...
Scanned 1500 entries, written 751 paragraphs...
Scanned 1600 entries, written 801 paragraphs...
Scanned 1700 entries, written 851 paragraphs...
Scanned 1800 entries, written 901 paragraphs...
Scanned 1900 entries, written 951 paragraphs...

✅ Done. Written 1000 paragraphs to tokenized_hi.json
