In [54]:
import torch
import re
import sentencepiece as spm
import random

from collections import Counter

In [51]:
config = {
    'train_en_path': './dataset/consolidated/train.en',
    'train_ne_path': './dataset/consolidated/train.ne',
    'default_vocab_size': 15000
}

In [48]:
with open(config['train_en_path'], 'r', encoding='utf-8') as f:
    en_lines = f.readlines()
    
with open(config['train_ne_path'], 'r', encoding='utf-8') as f:
    ne_lines = f.readlines()
    
assert len(en_lines) == len(ne_lines)

In [50]:
def count_unique_words(sentences):
    unique_words = set()
    counter_ = Counter()
    for sentence in sentences:
        words = re.findall(r'\b\w+\b', sentence.lower())
        unique_words.update(words)
        counter_.update(words)
    
    return len(unique_words), unique_words, counter_

count, unique_words, counter = count_unique_words(en_lines)
print(f"Number of unique ENGLISH words: {count}")
count, unique_words, counter = count_unique_words(ne_lines)
print(f"Number of unique NEPALI words: {count}")

Number of unique ENGLISH words: 52751
Number of unique NEPALI words: 20423


In [53]:
# training BPE sentencepiece tokenizer for both EN and NE corpus
spm.SentencePieceTrainer.train(input = config['train_en_path'], model_prefix = 'en_bpe_model', vocab_size = config['default_vocab_size'], model_type = 'bpe')

spm.SentencePieceTrainer.train(input = config['train_ne_path'], model_prefix = 'ne_bpe_model', vocab_size = config['default_vocab_size'], model_type = 'bpe')

In [61]:
# test
sp_en = spm.SentencePieceProcessor(model_file = 'en_bpe_model.model')
sp_ne = spm.SentencePieceProcessor(model_file = 'ne_bpe_model.model')

idx = random.randint(0, len(en_lines))

en_translation = en_lines[idx]
np_translation = ne_lines[idx]

print(en_translation, '\n', np_translation)
print(sp_en.encode(en_translation, out_type = str))
print(sp_ne.encode(np_translation, out_type = str))

print(sp_en.encode(en_translation, out_type = int))
print(sp_ne.encode(np_translation, out_type = int))


sp_en.piece_to_id('<pad>'), sp_ne.piece_to_id('<pad>'), sp_en.piece_to_id('<s>'), sp_ne.piece_to_id('<s>'), sp_en.piece_to_id('</s>'), sp_ne.piece_to_id('</s>')


Open the selected item in this window
 
 चयन गरिएको वस्तु यस सञ्झ्यालमा खोल्नुहोस्

['▁Open', '▁the', '▁selected', '▁item', '▁in', '▁this', '▁window']
['▁चयन', '▁गरिएको', '▁वस्तु', '▁यस', '▁सञ्झ्यालमा', '▁खोल्नुहोस्']
[1489, 7, 675, 2195, 32, 142, 686]
[347, 246, 523, 92, 3223, 2419]


(0, 0, 1, 1, 2, 2)

In [None]:
encoded_pieces = sp.encode("<pad> This is a test sentence.", out_type = str)
encoded_ids = sp.encode("<pad> This is a test sentence.", out_type = int)
print(encoded_pieces, encoded_ids)


decoded_sentence = sp.decode(encoded_pieces)
print(decoded_sentence)

sp.piece_to_id('<pad>')