In [3]:
from transformers import AutoTokenizer

# Checkpoint identifier passed to AutoTokenizer.from_pretrained below.
model_name = "HiTZ/latxa-7b-v1.1"
# Tokenizer for that checkpoint; the cells below use it through the global name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
# Size of the tokenizer vocabulary as reported by the tokenizer itself
print("Vocabulary size:", tokenizer.vocab_size)

# Mapping from special-token role (e.g. bos_token, eos_token, unk_token) to its string form
special_tokens = tokenizer.special_tokens_map
print("Special tokens:", special_tokens)

# Same mapping, with each token string replaced by its integer id
special_token_ids = {
    role: tokenizer.convert_tokens_to_ids(tok) for role, tok in special_tokens.items()
}
print("Special token IDs:", special_token_ids)

Vocabulary size: 32000
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
Special token IDs: {'bos_token': 1, 'eos_token': 2, 'unk_token': 0}


In [5]:
text = "Euskara adimen arttifizialera iritsi da!"

# Round trip: text → token IDs → text.
# The recorded output shows that encode() prepends the BOS token (<s>, id 1),
# which is why the decoded string starts with "<s>".
token_ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(token_ids)

print("Token IDs:", token_ids)
print("Decoded text:", decoded_text)

Token IDs: [1, 382, 17400, 2518, 594, 19933, 564, 698, 21722, 616, 1572, 3805, 277, 1039, 1146, 29991]
Decoded text: <s> Euskara adimen arttifizialera iritsi da!


In [6]:
# Dump the tokenizer vocabulary to a two-column TSV (token, id), ordered by id.
OUTPUT_FILE = "latxa-7b-v1.1-vocab.tsv"

vocab = tokenizer.get_vocab()  # mapping: token string -> integer id
print(f"Vocabulary loaded, {len(vocab)} tokens found.")
vocab_sorted = sorted(vocab.items(), key=lambda entry: entry[1])

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    # Header row first, then one "token<TAB>id" line per entry.
    # Tokens are written verbatim, with no quoting or escaping.
    f.write("token\tid\n")
    f.writelines(f"{token}\t{idx}\n" for token, idx in vocab_sorted)

print(f"Vocabulary saved to {OUTPUT_FILE}")

Vocabulary loaded, 32000 tokens found.
Vocabulary saved to latxa-7b-v1.1-vocab.tsv


Tokens per word

In [7]:
import re
from collections import Counter
import pandas as pd

def avg_tokens_per_word(sentences, tokenize=None):
    """Compute average token counts per word and per sentence.

    Words are the whitespace-separated pieces returned by ``str.split()``;
    tokens are whatever ``tokenize`` returns for the sentence.

    Args:
        sentences: Iterable of strings. Any iterable works, including
            generators, because the sentences are counted while iterating.
        tokenize: Optional callable mapping a sentence to a sequence of
            tokens. When omitted, the notebook-level ``tokenizer.tokenize``
            is used, so existing one-argument calls behave as before.

    Returns:
        A tuple ``(avg_tokens_word, avg_tokens_sentence)``:
        total tokens / total words, and total tokens / number of sentences.
        A ratio whose denominator is zero (no sentences, or no words at
        all) is reported as ``0.0`` instead of raising ZeroDivisionError.
    """
    if tokenize is None:
        # Looked up at call time so the global tokenizer is only required
        # when the caller does not supply their own tokenize function.
        tokenize = tokenizer.tokenize

    total_tokens = 0
    total_words = 0
    n_sentences = 0

    # Running totals are enough; the per-sentence counts are never needed.
    for sent in sentences:
        total_tokens += len(tokenize(sent))
        total_words += len(sent.split())
        n_sentences += 1

    avg_tokens_word = total_tokens / total_words if total_words else 0.0
    avg_tokens_sentence = total_tokens / n_sentences if n_sentences else 0.0

    return avg_tokens_word, avg_tokens_sentence

In [10]:
# Despite its name, data_dir points at a single CSV file, not a directory.
data_dir = '../data/eus_latn_data.csv'
df = pd.read_csv(data_dir)

# The "sentence" column supplies the texts to measure.
sentences = df["sentence"].to_list()

# Tokens-per-word and tokens-per-sentence over the whole column.
(avg_word, avg_sentence) = avg_tokens_per_word(sentences)

print(f"Average tokens per word: {avg_word:.3f}")
print(f"Average tokens per sentence: {avg_sentence:.2f}")

Average tokens per word: 3.086
Average tokens per sentence: 43.36


Latxa tokenizer vs. Llama2 tokenizer