In [1]:
import glob
import os
import tokenizers
from tokenizers import Tokenizer

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!mkdir corpus
!kaggle datasets download -d disisbig/hindi-wikipedia-articles-172k
!unzip /content/hindi-wikipedia-articles-172k.zip -d /content/corpus

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/corpus/valid/valid/77096.txt  
  inflating: /content/corpus/valid/valid/7710.txt  
  inflating: /content/corpus/valid/valid/77100.txt  
  inflating: /content/corpus/valid/valid/77102.txt  
  inflating: /content/corpus/valid/valid/77105.txt  
  inflating: /content/corpus/valid/valid/77118.txt  
  inflating: /content/corpus/valid/valid/77134.txt  
  inflating: /content/corpus/valid/valid/77135.txt  
  inflating: /content/corpus/valid/valid/7714.txt  
  inflating: /content/corpus/valid/valid/77142.txt  
  inflating: /content/corpus/valid/valid/77144.txt  
  inflating: /content/corpus/valid/valid/77145.txt  
  inflating: /content/corpus/valid/valid/77157.txt  
  inflating: /content/corpus/valid/valid/77164.txt  
  inflating: /content/corpus/valid/valid/77167.txt  
  inflating: /content/corpus/valid/valid/77169.txt  
  inflating: /content/corpus/valid/valid/77175.txt  
  inflating: /content/corpus/valid/v

In [3]:
# glob.glob returns matches in arbitrary order; sorting makes the order of
# articles in the concatenated corpus the same on every run.
train_files = sorted(glob.glob("/content/corpus/train/train/*.txt"))
valid_files = sorted(glob.glob("/content/corpus/valid/valid/*.txt"))

def concatenate_files(file_list, output_path):
    """Concatenate files into a single output file, separated by blank lines.

    Every input is copied as raw bytes and followed by ``b"\\n\\n"``. Inputs
    that cannot be read are reported on stdout and skipped. Failures to open
    or write the output are reported on stdout and stop the concatenation.

    Args:
        file_list: Iterable of input file paths, written in the order given.
        output_path: Path of the combined file; it is created or overwritten.

    Returns:
        int: Number of input files actually written to ``output_path``
        (0 if the output could not be opened). Compare it with the number of
        inputs to detect skipped files.
    """
    written = 0
    skipped = 0
    try:
        with open(output_path, "wb") as outfile:
            for file_path in file_list:
                try:
                    # The output was just truncated by open(..., "wb"); reading
                    # it back as an input would only duplicate partial content.
                    if os.path.samefile(file_path, output_path):
                        print(f"Skipping {file_path}: it is the output file")
                        skipped += 1
                        continue
                    with open(file_path, "rb") as infile:
                        data = infile.read()
                except FileNotFoundError:
                    print(f"File not found: {file_path}")
                    skipped += 1
                    continue
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
                    skipped += 1
                    continue
                # Writes happen outside the read handler so that a write
                # failure is reported as a write error, not a read error.
                outfile.write(data)
                outfile.write(b"\n\n")
                written += 1
        if skipped:
            print(f"Concatenated {written} files into {output_path} ({skipped} skipped)")
        else:
            print(f"Successfully concatenated files into {output_path}")
    except Exception as e:
        print(f"Error writing to {output_path}: {e}")
    return written


# Build one combined text file per split: training first, then validation.
for split_files, combined_path in (
    (train_files, "/content/corpus/train/full.txt"),
    (valid_files, "/content/corpus/valid/full_val.txt"),
):
    concatenate_files(split_files, combined_path)

Successfully concatenated files into /content/corpus/train/full.txt
Successfully concatenated files into /content/corpus/valid/full_val.txt


In [4]:
# Train a SentencePiece-style BPE tokenizer on the concatenated training text
# and persist it as JSON. No training options are passed, so the library's
# defaults apply.
corpus_file = "/content/corpus/train/full.txt"
tokenizer = tokenizers.SentencePieceBPETokenizer()
tokenizer.train([corpus_file])
tokenizer.save("bpe_tokenizer.json")

In [5]:
# Load the tokenizer back from its serialized JSON file and rebind `tokenizer`
# to the loaded instance, so the checks below exercise the saved artifact.
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

def compression_ratio(text, tokenizer):
    """Return the average number of characters represented by one token.

    Args:
        text: String to measure. Its length is taken with ``len``, i.e. in
            Python characters (code points), not bytes.
        tokenizer: Object whose ``encode(text)`` result exposes an ``ids``
            sequence.

    Returns:
        float: ``len(text)`` divided by the number of token ids, or ``0.0``
        when the encoding contains no ids (for example, empty text), where
        the ratio is undefined.
    """
    token_count = len(tokenizer.encode(text).ids)
    # Guard against ZeroDivisionError when nothing was encoded.
    if token_count == 0:
        return 0.0
    return len(text) / token_count

# Sanity-check the loaded tokenizer on one Hindi sentence: show the tokens,
# the vocabulary size, the characters-per-token ratio, and the round trip
# through decode.
sample_text = "बच्चे पार्क में खेल रहे हैं और बहुत खुश हैं।"
encoded_text = tokenizer.encode(sample_text)
compression = compression_ratio(sample_text, tokenizer)

vocab_size = len(tokenizer.get_vocab())
decoded_text = tokenizer.decode(encoded_text.ids)

print(f"Encoded text: {encoded_text.tokens}")
print(f"Vocabulary size: {vocab_size}")
print(f"Compression ratio: {compression}")
print(f"Decoded text: {decoded_text}")

Encoded text: ['▁बच्चे', '▁पार्क', '▁में', '▁खेल', '▁रहे', '▁हैं', '▁और', '▁बहुत', '▁खुश', '▁हैं।']
Vocabulary size: 30000
Compression ratio: 4.4
Decoded text: बच्चे पार्क में खेल रहे हैं और बहुत खुश हैं।
