In [2]:
from datasets import load_dataset
ds = load_dataset(path="ai4bharat/IndicCorpV2", split= "hin_Deva", streaming="True")

In [3]:
import re
import pandas as pd

In [5]:
def sentence_tokenizer(text):
    sentence_split = re.compile(r'(?<=[।!?])\s+|(?<=[.!?])\s+')
    sentences = sentence_split.split(text.strip())
    return [s.strip() for s in sentences if s.strip()]

def word_tokenizer(sentence):
    word_pattern = re.compile(
        r'[\u0900-\u097F]+|'
        r'\d+\.\d+|' #floating point
        r'\d+|' #integer
        r'[\w\.-]+@[\w\.-]+|' #email-address
        # r'[\w\.-]+@(\w+\.)+(com|in|org)|'
        r'\w+://\S+|' #URL
        r'[^\s\w]', #punctuation and special characters
        re.UNICODE #to match hindi chars
    )
    return word_pattern.findall(sentence)

def tokenize_paragraph(paragraph):
    sentences = sentence_tokenizer(paragraph)
    return [word_tokenizer(sent) for sent in sentences]

In [6]:
text = "मैं बाजार गया। फिर मैंने खाना खाया! What about you? Visit http://example.com"
tokenized = tokenize_paragraph(text)

for i, sent in enumerate(tokenized, 1):
    print(f"Sentence {i}: {sent}")


Sentence 1: ['मैं', 'बाजार', 'गया।']
Sentence 2: ['फिर', 'मैंने', 'खाना', 'खाया', '!']
Sentence 3: ['?']
Sentence 4: ['http://example.com']


In [13]:
import json

paragraphs = []
count = 0

for i, item in enumerate(ds):
    if count >= 1000000:
        break

    text = item.get("text", "").strip()
    if not text:
        continue

    sentence_texts = sentence_tokenizer(text)
    paragraph = {
        "paragraph_id": count,
        "sentences": []
    }

    for sent_text in sentence_texts:
        tokens = word_tokenizer(sent_text)
        if tokens:
            paragraph["sentences"].append({
                "text": sent_text,
                "tokens": tokens
            })

    if paragraph["sentences"]:
        paragraphs.append(paragraph)
        count += 1

    if i % 100 == 0:
        print(f"{count} paragraphs done")

output_file = "tokenized_hi.json"
with open(output_file, "w", encoding="utf-8") as fout:
    json.dump(paragraphs, fout, ensure_ascii=False, indent=2)

print(f"\nDone. Written {count} paragraphs to {output_file}")

1 paragraphs done
51 paragraphs done
101 paragraphs done
151 paragraphs done
201 paragraphs done
251 paragraphs done
301 paragraphs done
351 paragraphs done
401 paragraphs done
451 paragraphs done
501 paragraphs done
551 paragraphs done
601 paragraphs done
651 paragraphs done
701 paragraphs done
751 paragraphs done
801 paragraphs done
851 paragraphs done
901 paragraphs done
951 paragraphs done
1001 paragraphs done
1051 paragraphs done
1101 paragraphs done
1151 paragraphs done
1201 paragraphs done
1251 paragraphs done
1301 paragraphs done
1351 paragraphs done
1401 paragraphs done
1451 paragraphs done
1501 paragraphs done
1551 paragraphs done
1601 paragraphs done
1651 paragraphs done
1701 paragraphs done
1751 paragraphs done
1801 paragraphs done
1851 paragraphs done
1901 paragraphs done
1951 paragraphs done
2001 paragraphs done
2051 paragraphs done
2101 paragraphs done
2151 paragraphs done
2201 paragraphs done
2251 paragraphs done
2301 paragraphs done
2351 paragraphs done
2401 paragraphs

In [12]:
with open("tokenized_hi.json", "r", encoding="utf-8") as f:
    data = json.load(f)

total_sentences = 0
total_words = 0
total_characters = 0
unique_tokens = set()

for para in data:
    for sentence in para["sentences"]:
        tokens = sentence["tokens"]
        total_sentences += 1
        total_words += len(tokens)
        total_characters += sum(len(token) for token in tokens)
        unique_tokens.update(tokens)

avg_sentence_length = total_words / total_sentences if total_sentences else 0
avg_word_length = total_characters / total_words if total_words else 0
ttr = len(unique_tokens) / total_words if total_words else 0

print("Corpus Statistics:")
print(f"1)   Total number of sentences       : {total_sentences}")
print(f"2)  Total number of words           : {total_words}")
print(f"3) Total number of characters      : {total_characters}")
print(f"4)  Average sentence length         : {avg_sentence_length:.2f} words/sentence")
print(f"5)   Average word length             : {avg_word_length:.2f} characters/word")
print(f"6)  Type/Token Ratio (TTR)          : {ttr:.4f}")

Corpus Statistics:
1)   Total number of sentences       : 327915
2)  Total number of words           : 6128133
3) Total number of characters      : 23542288
4)  Average sentence length         : 18.69 words/sentence
5)   Average word length             : 3.84 characters/word
6)  Type/Token Ratio (TTR)          : 0.0236


In [None]:
import ijson
import pyarrow as pa
import pyarrow.parquet as pq

input_file = "tokenized_hi.json"
output_file = "tokenized_hi.parquet"

# Define schema (adjust if needed)
schema = pa.schema([
    ("paragraph_id", pa.int64()),
    ("sentences", pa.list_(pa.struct([
        ("text", pa.string()),
        ("tokens", pa.list_(pa.string()))
    ])))
])

# Create Parquet writer
writer = pq.ParquetWriter(output_file, schema)

batch_size = 1000
buffer = {"paragraph_id": [], "sentences": []}

with open(input_file, "r", encoding="utf-8") as f:
    objects = ijson.items(f, "item")  # "item" iterates over array elements

    for i, obj in enumerate(objects, 1):
        buffer["paragraph_id"].append(obj["paragraph_id"])
        buffer["sentences"].append(obj["sentences"])

        # Write in batches to save memory
        if i % batch_size == 0:
            table = pa.table(buffer, schema=schema)
            writer.write_table(table)
            buffer = {"paragraph_id": [], "sentences": []}
            print(f"{i} objects written...")

# Write leftovers
if buffer["paragraph_id"]:
    table = pa.table(buffer, schema=schema)
    writer.write_table(table)

writer.close()
print(f"Conversion complete → {output_file}")

1000 objects written...
2000 objects written...
3000 objects written...
4000 objects written...
5000 objects written...
6000 objects written...
7000 objects written...
8000 objects written...
9000 objects written...
10000 objects written...
11000 objects written...
12000 objects written...
13000 objects written...
14000 objects written...
15000 objects written...
16000 objects written...
17000 objects written...
18000 objects written...
19000 objects written...
20000 objects written...
21000 objects written...
22000 objects written...
23000 objects written...
24000 objects written...
25000 objects written...
26000 objects written...
27000 objects written...
28000 objects written...
29000 objects written...
30000 objects written...
31000 objects written...
32000 objects written...
33000 objects written...
34000 objects written...
35000 objects written...
36000 objects written...
37000 objects written...
38000 objects written...
39000 objects written...
40000 objects written...
41000 obj

In [None]:
import pyarrow.parquet as pq

table = pq.read_table("tokenized_hi.parquet")
df = table.to_pandas()

# Show first row’s sentences
print(df.iloc[0]["sentences"])

# Show the tokens of the first sentence in the first paragraph
print(df.iloc[0]["sentences"][0]["tokens"])

# Loop over first few paragraphs
for i in range(3):
    print(f"\nParagraph {df.iloc[i]['paragraph_id']}:")
    for sent in df.iloc[i]["sentences"]:
        print("  Text:", sent["text"])
        print("  Tokens:", sent["tokens"])

[{'text': 'लोगों को बिलों संबंधी सुविधा देना ही उनका काम', 'tokens': array(['लोगों', 'को', 'बिलों', 'संबंधी', 'सुविधा', 'देना', 'ही', 'उनका',
        'काम'], dtype=object)}                                                                                                               ]
['लोगों' 'को' 'बिलों' 'संबंधी' 'सुविधा' 'देना' 'ही' 'उनका' 'काम']

Paragraph 0:
  Text: लोगों को बिलों संबंधी सुविधा देना ही उनका काम
  Tokens: ['लोगों' 'को' 'बिलों' 'संबंधी' 'सुविधा' 'देना' 'ही' 'उनका' 'काम']

Paragraph 1:
  Text: इनेलो 1987 में उस वक्त ऐसे ही दोराहे पर खड़ी थी, जब पूर्व उपप्रधानमंत्री देवीलाल ने अपने पुत्र ओमप्रकाश चौटाला को अपना राजनीतिक उत्तराधिकारी घोषित किया था।
  Tokens: ['इनेलो' '1987' 'में' 'उस' 'वक्त' 'ऐसे' 'ही' 'दोराहे' 'पर' 'खड़ी' 'थी' ','
 'जब' 'पूर्व' 'उपप्रधानमंत्री' 'देवीलाल' 'ने' 'अपने' 'पुत्र' 'ओमप्रकाश'
 'चौटाला' 'को' 'अपना' 'राजनीतिक' 'उत्तराधिकारी' 'घोषित' 'किया' 'था।']
  Text: हालांकि तब पार्टी पर देवीलाल की मजबूत पकड़ के चलते पार्टी टूटने से बच गई थी।
  Tokens: ['हाल