In [1]:
import sentencepiece as spm
from collections import Counter

sp = spm.SentencePieceProcessor(model_file='hy_bpe.model')
vocabs = [sp.id_to_piece(id) for id in range(sp.get_piece_size())]

single_chars = [v for v in vocabs if len(v.replace(' ', '')) == 1]
subwords = [v for v in vocabs if 2 <= len(v.replace(' ', '')) <= 4]
full_words = [v for v in vocabs if len(v.replace(' ', '')) >= 5]

print(f"Single Armenian characters: {len(single_chars)}")
print(f"Subword fragments (2-4):    {len(subwords)}")
print(f"Full words (5+):            {len(full_words)}")

with open('corpus.txt', 'r', encoding='utf-8') as f:
    full_text = f.read()

all_pieces = sp.encode_as_pieces(full_text)
most_common = Counter(all_pieces).most_common(10)

print("\n10 Most Frequent Token Pieces:")
for piece, freq in most_common:
    print(f"{piece}: {freq}")

Single Armenian characters: 66
Subword fragments (2-4):    185
Full words (5+):            49

10 Most Frequent Token Pieces:
։: 93
▁է: 53
▁: 41
ն: 35
ան: 33
ի: 33
▁են: 27
ը: 25
ր: 22
ում: 20
