In [15]:
from datasets import load_dataset
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, Seq2SeqTrainingArguments, WhisperForConditionalGeneration, Seq2SeqTrainer

import torch
import time
import json

from dataclasses import dataclass
from typing import Any, Dict, List, Union

import evaluate

In [18]:
# Rewrite the checkpoint's prediction file so that every output line holds only
# the first element of the JSON value found on the matching input line.
# The input encoding is pinned to UTF-8 so decoding of non-ASCII Hausa text does
# not depend on the machine's locale.
with open("whisper-large-finetuned_experiment-1_checkpoint-2000_fleurs-test.jsonl", "r", encoding="utf-8") as f:
    with open("whisper-large-finetuned_experiment-1_checkpoint-2000_fleurs-hausa-test.jsonl", "w", encoding="utf-8") as out:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at end of file) instead of
            # letting json.loads fail on an empty document.
            if not line.strip():
                continue
            obj = json.loads(line)
            json.dump(obj[0], out)
            out.write("\n")

In [16]:
# Load the FLEURS Hausa ("ha_ng") test split, reusing the local cache directory,
# and write one JSON-encoded reference transcription per line.
cache_dir = "/data/users/kashrest/asr-experiments"
data = load_dataset("google/fleurs", "ha_ng", split="test", cache_dir=cache_dir)
labels = []  # initialised here but not filled in by this cell
with open("fleurs_hausa_test.jsonl", "w") as transcript_file:
    for example in data:
        transcript = example["transcription"]
        transcript_file.write(json.dumps(transcript) + "\n")

Found cached dataset fleurs (/data/users/kashrest/asr-experiments/google___fleurs/ha_ng/2.0.0/af82dbec419a815084fa63ebd5d5a9f24a6e9acdf9887b9e3b8c6bbd64e0b7ac)


In [None]:
# Build a Whisper tokenizer configured for Igbo transcription.
# NOTE(review): Igbo may not be among the languages Whisper was trained on — confirm
# against the model's language list; if it is unsupported, building the language
# prefix during encode() is presumably where this would fail.
tokenizer_igbo = WhisperTokenizer.from_pretrained("openai/whisper-small", language="igbo", task="transcribe")

# Encode a short string and decode it straight back to inspect the round trip.
tokenizer_igbo.decode(tokenizer_igbo.encode("hello"))

# Hausa 

The Whisper tokenizer's vocabulary contains 169 distinct characters, drawn from the data of the 96 non-English languages it was pretrained on: https://cdn.openai.com/papers/whisper.pdf

In [11]:
# Whisper tokenizer configured for Hausa transcription; later cells reuse it.
tokenizer_hausa = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Hausa", task="transcribe")

# Lower-case every vocabulary token, space-separate its characters, and glue the
# pieces together back to back; only the set of characters is used below.
char_vocab_whisper = "".join(
    " ".join(token.lower()) for token in tokenizer_hausa.get_vocab()
)

# Sorted inventory of the distinct characters found across all tokens.
print(sorted(set(char_vocab_whisper)))
"""from collections import Counter
char_vocab_whisper = Counter(list(char_vocab_whisper))
print(f"In Whisper, the tokenizer vocabulary has {len(char_vocab_whisper.keys())} characters.\n{char_vocab_whisper.most_common()}")"""

[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', '×', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'ā', 'ă', 'ą', 'ć', 'ĉ', 'ċ', 'č', 'ď', 'đ', 'ē', 'ĕ', 'ė', 'ę', 'ě', 'ĝ', 'ğ', 'ġ', 'ģ', 'ĥ', 'ħ', 'ĩ', 'ī', 'ĭ', 'į', 'ı', 'ĳ', 'ĵ', 'ķ', 'ĸ', 'ĺ', 'ļ', 'ľ', 'ŀ', 'ł', 'ń', '̇']


'from collections import Counter\nchar_vocab_whisper = Counter(list(char_vocab_whisper))\nprint(f"In Whisper, the tokenizer vocabulary has {len(char_vocab_whisper.keys())} characters.\n{char_vocab_whisper.most_common()}")'

For Hausa, setting the language ensures that the input to the model is prefixed with the language token (i.e. `<|ha|>`), and setting the task adds the task token (i.e. `<|transcribe|>`).

In [12]:
# Round-trip a Hausa sentence through encode() and decode() to see which special
# tokens are wrapped around the text and whether the Hausa letters survive.
tokenizer_hausa.decode(tokenizer_hausa.encode("Haƙiƙa bincikenka zai haifar da ɗa mai ido."))

'<|startoftranscript|><|ha|><|transcribe|><|notimestamps|>Haƙiƙa bincikenka zai haifar da ɗa mai ido.<|endoftext|>'

In [13]:
# Hausa-specific letters whose handling by the tokenizer is examined in this notebook.
special_hausa_vocab = ["ɓ", "ƙ", "ɗ", "ƴ"]

In [14]:
# Call bpe() directly on the raw sentence, i.e. without first splitting it into
# words or mapping its UTF-8 bytes to the tokenizer's byte-level alphabet.
tokenizer_hausa.bpe("Haƙiƙa bincikenka zai haifar da ɗa mai ido.")

'Ha ƙ i ƙ a   bin ci ken ka   z ai   ha if ar   da   ɗ a   m ai   ido .'

For some reason, the special Hausa characters are not included when tokenizing, but they are included when using bpe.

In this article: https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt, it says that GPT-2 does byte-level BPE, which encodes the byte values, so all possible characters will be accounted for. So which one is used by the model?

Seeing as how the encode->decode returns the correct, original, text containing the special Hausa characters, I think the bpe is correct.

In [None]:
print(tokenizer_hausa.tokenize("Haƙiƙa bincikenka zai haifar da ɗa mai ido."))

"Ġ" is space in byte-level BPE gpt2 tokenizer (which is the same tokenizer for Whisper for English)

In [None]:
tokenizer_hausa.bpe("ƙaleen")

In [None]:
tokenizer_hausa.tokenize("ƙaleen")

Make sure normalization is done with basic normalizer: https://github.com/huggingface/transformers/issues/20703

This might be an issue later for languages like Yoruba. Edit: Might not be an issue? It looks like by default, text is not normalized.

In [None]:
# Probe the tokenizer's private _normalize() helper with accented input.
# NOTE(review): this is an underscore-prefixed method, so it may change between
# library versions — confirm it still exists in the installed release.
tokenizer_hausa._normalize("Ça va?")

Tokenizer vocab

In [None]:
# Report the vocabulary size, followed by the full token -> id mapping.
# NOTE(review): this prints the entire vocabulary; consider showing only the
# size or a small sample to keep the cell output readable.
print(f"Whisper tokenizer vocab has {len(tokenizer_hausa.get_vocab().keys())} tokens\n{tokenizer_hausa.get_vocab()}")

In [None]:
tokenizer_hausa.get_added_vocab()

In [None]:
special_hausa_vocab

Look into adding new tokens (and possibly finetuning the model to learn contextual embeddings for these tokens?): https://medium.com/@pierre_guillou/nlp-how-to-add-a-domain-specific-vocabulary-new-tokens-to-a-subword-tokenizer-already-trained-33ab15613a41

In [None]:
tokenizer_hausa.bpe("Ina zama a wani ƙaramin ƙauye kilo mita hamsin tsakaninsu da birni.")

In [None]:
print(tokenizer_hausa.tokenize("Ina zama a wani ƙaramin ƙauye kilo mita hamsin tsakaninsu da birni."))

In [None]:
tokenizer_hausa.bpe('Æ')

In [None]:
tokenizer_hausa.tokenize('Æ')

In [None]:
tokenizer_hausa.unk_token

In [None]:
'Æ'.encode("utf-8")

In [None]:
'ƙ'.encode("utf-8")

In [None]:
# Register the pieces of the bpe() output as added tokens. This mutates
# tokenizer_hausa, so every cell run afterwards sees the enlarged vocabulary.
# NOTE(review): bpe() on a raw sentence separates symbols with " " and also emits
# the space character itself as a symbol, so split(" ") presumably yields empty
# strings that are passed to add_tokens — confirm and filter them if so.
tokenizer_hausa.add_tokens(tokenizer_hausa.bpe("Ina zama a wani ƙaramin ƙauye kilo mita hamsin tsakaninsu da birni.").split(" "))

In [None]:
tokenizer_hausa.get_added_vocab()

In [None]:
tokenizer_hausa.tokenize("Ina zama a wani ƙaramin ƙauye kilo mita hamsin tsakaninsu da birni.")

In [None]:
tokenizer_hausa.get_vocab()

In [None]:
tokenizer_hausa.get_vocab()["ƙ"]

# Trying to understand tokenizer (byte-level BPE) done in Whisper model

In [None]:
import re

def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by byte-level BPE.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
    the same code point. Every other byte value (whitespace, control characters
    and the like) is assigned, in ascending byte order, the next unused code
    point starting at 256, so no byte is ever rendered as whitespace or a
    control character.

    Returns:
        dict mapping each of the 256 byte values (int) to a one-character str.
        The self-mapped bytes come first in insertion order, followed by the
        remapped ones.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in self_mapped}

    # Bytes left over get consecutive code points just above the single-byte range.
    leftover = [byte for byte in range(2**8) if byte not in table]
    for offset, byte in enumerate(leftover):
        table[byte] = chr(2**8 + offset)
    return table

def _tokenize(text):
    """Tokenize a string with byte-level BPE, step by step.

    The text is split into chunks with the tokenizer's pre-tokenization pattern,
    each chunk's UTF-8 bytes are mapped to the printable alphabet produced by
    bytes_to_unicode(), and the mapped chunk is handed to the tokenizer's bpe().

    Args:
        text: the string to tokenize.

    Returns:
        A flat list of BPE token strings, in order of appearance.
    """
    # Build the byte -> character table once per call. The table must be obtained
    # by calling bytes_to_unicode(); the function object itself is not subscriptable.
    byte_encoder = bytes_to_unicode()

    bpe_tokens = []
    # Use the pattern object's own findall() so the split works whichever regex
    # module compiled it (assumes tokenizer_hausa.pat is a compiled pattern — confirm).
    for token in tokenizer_hausa.pat.findall(text):
        # Map every byte to its printable stand-in so bpe() never sees raw
        # whitespace or control characters.
        token = "".join(byte_encoder[b] for b in token.encode("utf-8"))
        bpe_tokens.extend(tokenizer_hausa.bpe(token).split(" "))
    return bpe_tokens

In [None]:
tokenizer_hausa.pat

In [None]:
_tokenize("Ina zama a wani ƙaramin ƙauye kilo mita hamsin tsakaninsu da birni.")

In [None]:
tokenizer_hausa.tokenize("ƙ")

In [None]:
ord('ƙ')

In [None]:
list(range(ord("!"), ord("~") + 1))

In [None]:
'ƙ'.encode("utf-8")

In [None]:
len(bytes_to_unicode())

In [None]:
bytes_to_unicode()