In [1]:
import torch
import re
import sentencepiece as spm
import random

from collections import Counter

In [2]:
config = {
    'train_en_path': './dataset/consolidated/train.en',
    'train_ne_path': './dataset/consolidated/train.ne',
    'default_vocab_size': 15000
}

In [3]:
with open(config['train_en_path'], 'r', encoding='utf-8') as f:
    en_lines = f.readlines()
    
with open(config['train_ne_path'], 'r', encoding='utf-8') as f:
    ne_lines = f.readlines()
    
assert len(en_lines) == len(ne_lines)

In [4]:
def count_unique_words(sentences):
    unique_words = set()
    counter_ = Counter()
    for sentence in sentences:
        words = re.findall(r'\b\w+\b', sentence.lower())
        unique_words.update(words)
        counter_.update(words)
    
    return len(unique_words), unique_words, counter_

count, unique_words, counter = count_unique_words(en_lines)
print(f"Number of unique ENGLISH words: {count}")
count, unique_words, counter = count_unique_words(ne_lines)
print(f"Number of unique NEPALI words: {count}")

Number of unique ENGLISH words: 52751
Number of unique NEPALI words: 20423


In [5]:
# training BPE sentencepiece tokenizer for both EN and NE corpus
spm.SentencePieceTrainer.train(input = config['train_en_path'], model_prefix = 'en_bpe_model', vocab_size = config['default_vocab_size'], model_type = 'bpe', user_defined_symbols = ['<pad>'])

spm.SentencePieceTrainer.train(input = config['train_ne_path'], model_prefix = 'ne_bpe_model', vocab_size = config['default_vocab_size'], model_type = 'bpe', user_defined_symbols = ['<pad>'])

In [6]:
# test
sp_en = spm.SentencePieceProcessor(model_file = 'en_bpe_model.model')
sp_ne = spm.SentencePieceProcessor(model_file = 'ne_bpe_model.model')

idx = random.randint(0, len(en_lines))

en_translation = en_lines[idx]
np_translation = ne_lines[idx]

print(en_translation, '\n', np_translation)
print(sp_en.encode(en_translation, out_type = str))
print(sp_ne.encode(np_translation, out_type = str))

print(sp_en.encode(en_translation, out_type = int))
print(sp_ne.encode(np_translation, out_type = int))


sp_en.piece_to_id('<pad>'), sp_ne.piece_to_id('<pad>'), sp_en.piece_to_id('<s>'), sp_ne.piece_to_id('<s>'), sp_en.piece_to_id('</s>'), sp_ne.piece_to_id('</s>'), sp_en.piece_to_id('<unk>'), sp_en.piece_to_id('<unk>')


Only preliminary works could be undertaken towards the establishment of the proposed textile plant at Butwal.
 
 वुटवलमा सूती धागो उद्योग स्थापना गर्ने तर्फ प्रारम्भिक कार्य मात्र हुन सकेको छ

['▁Only', '▁preliminary', '▁works', '▁could', '▁be', '▁undertaken', '▁towards', '▁the', '▁establishment', '▁of', '▁the', '▁proposed', '▁textile', '▁plant', '▁at', '▁Butwal', '.']
['▁वु', 'ट', 'व', 'लमा', '▁सू', 'ती', '▁धागो', '▁उद्योग', '▁स्थापना', '▁गर्ने', '▁तर्फ', '▁प्रारम्भिक', '▁कार्य', '▁मात्र', '▁हुन', '▁सकेको', '▁छ']
[2344, 6972, 1165, 726, 48, 3143, 3020, 8, 3623, 20, 8, 3656, 12594, 2068, 197, 9881, 14938]
[10929, 14882, 14874, 817, 2110, 317, 6875, 1050, 829, 188, 1961, 4177, 164, 306, 345, 2179, 37]


(3, 3, 1, 1, 2, 2, 0, 0)

In [None]:
encoded_pieces = sp.encode("<pad> This is a test sentence.", out_type = str)
encoded_ids = sp.encode("<pad> This is a test sentence.", out_type = int)
print(encoded_pieces, encoded_ids)


decoded_sentence = sp.decode(encoded_pieces)
print(decoded_sentence)

sp.piece_to_id('<pad>')