In [1]:
import sentencepiece as spm
import random
import re

In [2]:
def remove_non_punjabi_chars(text, keep_english=False):
    """Replace every character outside the allowed set with a single space.

    The allowed set is the Gurmukhi block (U+0A01-U+0A7F), ASCII digits,
    the comma, the danda, the question mark, the colon, the left single
    quotation mark, the space and the newline. Runs of spaces and runs of
    newlines are then collapsed to one character each.

    The previous implementation tried to union two bracket expressions by
    nesting them inside ``[^ ... ]`` with a ``|`` between them. Python's
    ``re`` does not support that: the pattern was parsed as "one character
    outside a class that happens to contain a literal ``[``" OR "a letter
    followed by ``]``". As a result Latin letters were stripped anyway and
    ``[`` leaked through. The listed range also ran into U+0A80-U+0A8F,
    which belongs to the Gujarati block rather than Gurmukhi.

    Args:
        text: The string to clean.
        keep_english: When True, ASCII letters ``a-z`` / ``A-Z`` are kept
            as well. Defaults to False, which matches what the earlier
            pattern effectively did (Latin letters were removed).

    Returns:
        The cleaned string.
    """
    # Contents of a single character class (no enclosing brackets).
    allowed = r"\u0A01-\u0A7F,।0-9? \n‘:"
    if keep_english:
        allowed += "a-zA-Z"

    # One negated class; the trailing + swallows a whole run of unwanted
    # characters into one replacement space.
    text = re.sub("[^" + allowed + "]+", " ", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"\n+", "\n", text)
    return text

In [3]:
# Read the whole raw corpus into memory. The encoding is given explicitly so
# that decoding does not depend on the platform's default locale.
# NOTE(review): assumes pa.txt is stored as UTF-8 - confirm against the data source.
with open('../data/pa.txt', encoding='utf-8') as f:
    data = f.read()

In [4]:
# Run the character filter over the full raw corpus; `data` itself is left unchanged.
cleaned_data = remove_non_punjabi_chars(text=data)

In [5]:
# Persist the cleaned corpus; this file is the `input` of the tokenizer
# training cell. Write it as UTF-8 explicitly so that Gurmukhi text is encoded
# the same way on every platform instead of using the locale default.
with open('../data/clean_pa.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_data)

In [7]:
%%time
# Options for the SentencePiece trainer, gathered in one place so they are
# easy to scan and tweak before the (timed) training call below.
trainer_options = {
    'input': '../data/clean_pa.txt',
    'model_prefix': 'pure_punjabi_tokenizer',
    'model_type': 'unigram',
    'vocab_size': 16000,  # Adjust based on your corpus size and language complexity
    'character_coverage': 0.9995,
    'input_sentence_size': 1000000,
    'shuffle_input_sentence': True,
    'normalization_rule_name': 'nmt_nfkc_cf',
}

# Train the model on the cleaned corpus with the options above.
spm.SentencePieceTrainer.train(**trainer_options)

CPU times: user 1min 5s, sys: 1.31 s, total: 1min 7s
Wall time: 36.5 s


sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/clean_pa.txt
  input_format: 
  model_prefix: pure_punjabi_tokenizer
  model_type: UNIGRAM
  vocab_size: 16000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 1000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0

In [8]:
# Load the trained tokenizer model from the working directory.
sp = spm.SentencePieceProcessor()
sp.load('pure_punjabi_tokenizer.model')

# Report how many pieces the loaded model contains.
print(f"Vocabulary size: {sp.get_piece_size()}")

# List the pieces with ids 0 through 99, one per line.
print("Sample tokens:")
print(*(sp.id_to_piece(piece_id) for piece_id in range(100)), sep="\n")

# Encode an example Punjabi sentence and show the resulting pieces.
punjabi_sentence = "ਪੰਜਾਬੀ ਇੱਕ ਬਹੁਤ ਸੁੰਦਰ ਭਾਸ਼ਾ ਹੈ"
tokens = sp.encode_as_pieces(punjabi_sentence)
print(f"Tokenized: {tokens}")

Vocabulary size: 16000
Sample tokens:
<unk>
<s>
</s>
,
▁ਦੇ
।
▁ਦੀ
▁ਹੈ
▁ਨੂੰ
▁ਦਾ
▁ਨੇ
▁ਅਤੇ
▁ਤੇ
▁ਵਿੱਚ
▁ਲਈ
▁ਨਾਲ
▁ਤੋਂ
▁‘
▁ਸਿੰਘ
▁ਇਸ
▁ਹਨ
▁ਚ
▁ਕਿ
▁ਵਿਚ
▁ਕੇ
▁ਵੀ
▁ਸੀ
ੀ
▁ਨਹੀਂ
▁ਕਰਨ
▁ਇਹ
▁
ਚ
▁ਹੋ
ਾਂ
:
▁ਹੀ
▁ਇੱਕ
▁ਕਰ
▁:
▁ਪੰਜਾਬ
▁ਕੀਤਾ
▁ਉਹ
▁ਉਸ
ਾ
▁ਤਾਂ
▁ਗਿਆ
▁ਜੀ
▁ਆਪਣੇ
ੇ
ਤੇ
▁ਇਕ
▁ਵੱਲੋਂ
▁ਦੀਆਂ
▁ਕੀਤੀ
▁ਜੋ
▁,
▁ਉਨ੍ਹਾਂ
▁ਕਰੋ
▁ਪਰ
▁ਗੁਰੂ
▁ਨਾ
▁ਹੋਰ
▁ਸਾਹਿਬ
▁ਜਾ
▁ਵਾਲੇ
?
▁ਕੋਈ
▁ਰਹੇ
▁ਸਰਕਾਰ
▁ਮੰਤਰੀ
▁ਵੀਡੀਓ
▁ਮੈਂ
▁ਕੀ
▁ਗਈ
▁ਕਿਹਾ
▁ਹੋਏ
▁।
▁ਅੱਜ
▁ਗਏ
▁ਵਲੋਂ
▁ਜਿਸ
ਨ
▁ਪੰਜਾਬੀ
▁ਦਿੱਤਾ
▁ਬਾਰੇ
▁ਭਾਰਤ
▁ਲੋਕਾਂ
▁ਸਿੱਖ
▁ਵਿਖੇ
▁ਰਿਹਾ
▁ਤੱਕ
▁ਹੋਣ
▁ਹਾਂ
▁ਹੋਈ
ਸ
▁ਦਿਨ
▁ਕਿਸੇ
▁ਸਭ
▁ਹੁਣ
Tokenized: ['▁ਪੰਜਾਬੀ', '▁ਇੱਕ', '▁ਬਹੁਤ', '▁ਸੁੰਦਰ', '▁ਭਾਸ਼ਾ', '▁ਹੈ']


In [10]:
# Tokenize a random window of the raw (uncleaned) text. The window is cut at
# arbitrary character offsets, so it can begin or end in the middle of a word.
sample_len = 100

# Clamp the upper bound at 0: random.randint raises ValueError when the upper
# bound is below the lower one, which happened for texts shorter than the window.
ind = random.randint(0, max(0, len(data) - sample_len))
punj_text = data[ind:ind + sample_len]

print(f'Punjabi text: {punj_text}')

tokens = sp.encode_as_pieces(punj_text)
print(f"Tokenized: {tokens}")


Punjabi text: ਾਲ ਮੁੱਦੇ ‘ਤੇ ਪਹਿਲੀ ਵਾਰ ਸਾਹਮਣੇ ਆਇਆ ਪੀ. ਐੱਮ. ਟਰੂਡੋ ਦੀ ਪਤਨੀ ਸੋਫੀ ਦਾ ਬਿਆਨ
Comments Off on ਅਟਵਾਲ ਮੁੱਦੇ ‘ਤ
Tokenized: ['▁', 'ਾਲ', '▁ਮੁੱਦੇ', '▁‘', 'ਤੇ', '▁ਪਹਿਲੀ', '▁ਵਾਰ', '▁ਸਾਹਮਣੇ', '▁ਆਇਆ', '▁ਪੀ', '.', '▁ਐੱਮ', '.', '▁ਟਰੂਡੋ', '▁ਦੀ', '▁ਪਤਨੀ', '▁ਸੋ', 'ਫੀ', '▁ਦਾ', '▁ਬਿਆਨ', '▁', 'comments', '▁', 'off', '▁', 'on', '▁ਅਟਵਾਲ', '▁ਮੁੱਦੇ', '▁‘', 'ਤ']
