In [1]:
import datasets
import transformers
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [2]:
# Language codes; each one is passed as the config name when the
# AfriSenti dataset is loaded in the next cell.
lang = [
    'amh', 'hau', 'ibo', 'arq', 'ary', 'yor', 'por',
    'twi', 'tso', 'tir', 'orm', 'pcm', 'kin', 'swa',
]

In [3]:
# Load one AfriSenti dataset per language code; configs that fail to load
# are reported and skipped so the remaining languages are still collected.
ds_list = []

for l in lang:
    try:
        ds = datasets.load_dataset('shmuhammad/AfriSenti-twitter-sentiment', l)
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop the cell rather than being reported as a
        # failed language.
        print(f'error at {l}')
    else:
        # Only reached when the load succeeded.
        ds_list.append(ds)

error at tir
error at orm


In [4]:
# Number of language datasets that were loaded successfully into ds_list.
len(ds_list)

12

In [5]:
# Total row count over every split that exists in each loaded dataset.
count = 0

for ds in ds_list:
    count += sum(
        ds[split].num_rows
        for split in ['train', 'test', 'validation']
        if split in ds
    )

count

106828

In [6]:
# Flatten the 'tweet' column of every available split of every loaded
# dataset into a single list of strings.
full_text = [
    tweet
    for dataset in ds_list
    for split in ['train', 'test', 'validation']
    if split in dataset
    for tweet in dataset[split]['tweet']
]

len(full_text)

106828

In [7]:
# Dump the corpus to disk, one tweet per line.
# The encoding is pinned to UTF-8: without it, open() uses the platform's
# default encoding, which may be unable to represent the multilingual tweets
# and would make the file's bytes depend on the machine that wrote it.
with open('text_dump.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in full_text)

In [8]:
import sentencepiece as spm

# Train a unigram SentencePiece model on the dumped tweets. Output files are
# written under the 'afrisenti' prefix (the next cell loads 'afrisenti.model').
spm_train_args = {
    'input': 'text_dump.txt',
    'model_prefix': 'afrisenti',
    'vocab_size': 110000,
    'model_type': 'unigram',
    'character_coverage': 0.9995,
    # Extra symbols registered with the trainer alongside the learned pieces.
    'user_defined_symbols': ['<pad>', '<mask>', '<sep>', '<cls>', '<s>', '</s>'],
}

spm.SentencePieceTrainer.train(**spm_train_args)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: text_dump.txt
  input_format: 
  model_prefix: afrisenti
  model_type: UNIGRAM
  vocab_size: 110000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <pad>
  user_defined_symbols: <mask>
  user_defined_symbols: <sep>
  user_defined_symbols: <cls>
  user_defined_symbols: <s>
  user_defined_symbols: </s>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0

In [9]:
# Load the SentencePiece model trained in the previous cell.
sp = spm.SentencePieceProcessor(model_file='afrisenti.model')

def analyze_coverage(text):
    """Return the fraction of tokens in `text` that are known to the model.

    The text is encoded with the module-level `sp` processor into token ids,
    and ids equal to `sp.unk_id()` are counted as unknown. Comparing ids
    avoids depending on how the unknown piece is spelled: the model is
    trained without a custom unknown piece, so counting the literal string
    '[UNK]' among the string pieces would never match anything and would
    always report full coverage.

    Args:
        text: The string to tokenize.

    Returns:
        A float in [0, 1]: 1 minus (unknown tokens / total tokens). Text that
        encodes to no tokens at all is reported as 1.0, since it contains
        nothing unknown (and dividing by zero must be avoided).
    """
    token_ids = sp.encode(text, out_type=int)
    if not token_ids:
        return 1.0
    unk_count = token_ids.count(sp.unk_id())
    return 1 - (unk_count / len(token_ids))

# Average the per-line coverage over the whole dumped corpus.
# The file is read explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open('text_dump.txt', 'r', encoding='utf-8') as f:
    texts = f.readlines()

coverage = sum(analyze_coverage(text) for text in texts) / len(texts)
print(f"Token coverage: {coverage:.2%}")

Token coverage: 100.00%


In [10]:
# sentencepiece doesn't work with ELECTRA - kill me

In [11]:
from tokenizers import ByteLevelBPETokenizer
from transformers import XLMRobertaTokenizerFast
from datasets import load_dataset
import os

def get_training_corpus(dataset, batch_size=1000, column="text"):
    """Yield the values of one column of `dataset` in consecutive batches.

    Args:
        dataset: A sized object whose slices (`dataset[a:b]`) return a mapping
            from column name to the list of values in that slice.
        batch_size: Maximum number of rows per yielded batch; the last batch
            may be shorter.
        column: Name of the column to read from each slice. Defaults to
            "text", the column this function previously always read.

    Yields:
        The `column` values of each successive slice of `dataset`.
    """
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples[column]

# Re-read the dumped tweets through the generic "text" loader and take the
# train split directly; the result exposes a single 'text' column.
dataset = load_dataset(
    "text",
    data_files={"train": "text_dump.txt"},
    split="train",
)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 106828
})

In [None]:
# Trying the SentencePiece-style BPE tokenizer from the `tokenizers` library

In [23]:
from tokenizers import SentencePieceBPETokenizer

# Train a SentencePiece-style BPE tokenizer on the text dataset, fed in
# batches by get_training_corpus.
tokenizer = SentencePieceBPETokenizer()

tokenizer.train_from_iterator(
    get_training_corpus(dataset),
    vocab_size=120000,
    min_frequency=2,
    # Reserve the special tokens used by the XLM-R tokenizer wrapper while the
    # vocabulary is built. Without this argument only "<unk>" is reserved, so
    # the remaining ones would have to be added after training.
    # NOTE(review): confirm the downstream model expects this token order.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)






In [24]:
# Display the trained tokenizer's summary (vocabulary size, model type, unk token).
tokenizer

Tokenizer(vocabulary_size=120000, model=SentencePieceBPE, unk_token=<unk>, replacement=▁, add_prefix_space=True, dropout=None)

In [25]:
# Wrap the trained tokenizer in the XLM-R fast-tokenizer class so it can be
# saved with `save_pretrained`.
xlmr_tokenizer = XLMRobertaTokenizerFast(tokenizer_object=tokenizer)

In [27]:
# Make sure the output directory exists, then write the tokenizer files
# into it; the returned tuple of written paths is shown as the cell output.
os.makedirs(name='xlmr-tokenizer', exist_ok=True)
xlmr_tokenizer.save_pretrained(save_directory='xlmr-tokenizer')

('xlmr-tokenizer/tokenizer_config.json',
 'xlmr-tokenizer/special_tokens_map.json',
 'xlmr-tokenizer/tokenizer.json')