In [2]:
import whisper

In [3]:
# Load the Whisper "large-v3" checkpoint; `model` is reused by every later cell.
model = whisper.load_model("large-v3")

In [4]:
# Audio file used for every experiment below.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this value.
TESTF = "/home/joregan/Listening_test/00_4.wav"

In [5]:
# Read the test file and pad/trim it with Whisper's helper in one step, so the
# name `audio` is only ever bound to the final, fixed-length waveform.
audio = whisper.pad_or_trim(whisper.load_audio(TESTF))

In [6]:
# Build the log-Mel spectrogram with the number of Mel bins the loaded
# checkpoint declares (model.dims.n_mels) rather than a hard-coded 128, so this
# cell keeps working if a different checkpoint is loaded. The result is moved
# to the same device as the model.
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

In [7]:
# Display the spectrogram's shape (recorded output: torch.Size([128, 3000])).
mel.shape

torch.Size([128, 3000])

In [9]:
# Transcribe the file end-to-end. With verbose=True the recorded output shows
# the detected language and each segment with its timestamps.
res = model.transcribe(TESTF, verbose=True)

Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: Swedish
[00:00.000 --> 00:03.100]  Tror du att jag är engelsman eller amerikan?


In [10]:
# Show the full result dict: 'text', per-segment details under 'segments', and 'language'.
res

{'text': ' Tror du att jag är engelsman eller amerikan?',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 3.1,
   'text': ' Tror du att jag är engelsman eller amerikan?',
   'tokens': [50365,
    1765,
    284,
    1581,
    951,
    6368,
    3775,
    1741,
    1625,
    1601,
    12519,
    16116,
    16048,
    30,
    50520],
   'temperature': 0.0,
   'avg_logprob': -0.04304603487253189,
   'compression_ratio': 0.8490566037735849,
   'no_speech_prob': 0.1451009064912796}],
 'language': 'sv'}

In [13]:
# Number of token ids stored for the first (and here only) segment.
len(res["segments"][0]["tokens"])

15

In [8]:
# Run the model's own language detection on the spectrogram; the call returns a
# pair, unpacked here as `language_tokens` and `probs`.
language_tokens, probs = model.detect_language(mel)

In [43]:
# Show the mapping of language code -> probability returned by detect_language.
probs

{'ka': 1.1760527485193961e-07,
 'uk': 0.00011050760076614097,
 'ps': 4.126469832499424e-07,
 'ba': 4.256304642069608e-09,
 'af': 1.924401431097067e-06,
 'kn': 5.272009957479895e-07,
 'so': 5.345439646475825e-09,
 'ne': 5.848129376317956e-07,
 'ja': 0.0008843151153996587,
 'mk': 3.5319104085829167e-07,
 'lo': 4.839035909753875e-07,
 'hi': 0.00012629431148525327,
 'zh': 0.0011262395419180393,
 'lv': 1.6683612557244487e-05,
 'et': 1.52700231410563e-05,
 'az': 8.513211469107773e-06,
 'sd': 7.044382641652192e-07,
 'ml': 0.00019973318558186293,
 'no': 0.02231231890618801,
 'is': 0.00010205624130321667,
 'ln': 9.586413796114357e-08,
 'yi': 2.436002887407085e-06,
 'ar': 0.0004886251408606768,
 'ur': 1.1246702342759818e-05,
 'tl': 0.0005676936125382781,
 'bn': 3.0907585824024864e-06,
 'am': 6.384588857599738e-08,
 'mt': 1.5055169342303998e-06,
 'yo': 6.955768867555889e-07,
 'id': 0.00020396412583068013,
 'tt': 8.788865635267484e-09,
 'la': 0.0005797904450446367,
 'mr': 6.714851110700693e-07,
 '

In [46]:
# Compare the probabilities assigned to Swedish and English.
print(f"Swedish {probs['sv']} English {probs['en']}")

Swedish 0.9029163122177124 English 0.038634639233350754


In [19]:
from whisper.decoding import detect_language

In [40]:
from whisper.tokenizer import LANGUAGES

In [42]:
# How many languages Whisper's tokenizer module lists (recorded output: 100).
len(LANGUAGES)

100

In [None]:
# https://discuss.huggingface.co/t/language-detection-with-whisper/26003/2
from typing import Collection, Dict, List, Optional


def detect_language(model: "WhisperForConditionalGeneration", tokenizer: "WhisperTokenizer", input_features,
                    possible_languages: Optional[Collection[str]] = None) -> List[Dict[str, float]]:
    """Estimate per-language probabilities with a Hugging Face Whisper model.

    NOTE: this shares its name with ``whisper.decoding.detect_language``;
    whichever definition is executed last is the one in scope.

    Parameters
    ----------
    model
        Callable as ``model(input_features, decoder_input_ids=...)`` and
        returning an object with a ``.logits`` tensor of shape
        (batch, sequence, vocab).
    tokenizer
        Provides ``additional_special_tokens`` and ``convert_tokens_to_ids``.
    input_features
        Batched input tensor; only ``shape[0]`` and ``device`` are read here.
    possible_languages
        Optional collection of language codes (e.g. ``"en"``) that restricts
        the candidates. Probabilities are renormalised over the candidates.

    Returns
    -------
    One dict per batch item, mapping each candidate language *token*
    (e.g. ``"<|en|>"``) to its probability.

    Raises
    ------
    RuntimeError
        If a requested language has no matching language token.
    """
    # Language tokens look like "<|xx|>" or "<|xxx|>" (e.g. "<|haw|>"): two or
    # three lowercase letters between the markers. Checking only for length 6
    # would silently drop the three-letter codes.
    language_tokens = [
        t for t in tokenizer.additional_special_tokens
        if len(t) in (6, 7) and t.startswith("<|") and t.endswith("|>")
        and t[2:-2].isalpha() and t[2:-2].islower()
    ]
    if possible_languages is not None:
        # Compare against the set of requested codes so duplicates in the
        # request do not trigger a spurious error.
        wanted = set(possible_languages)
        language_tokens = [t for t in language_tokens if t[2:-2] in wanted]
        if len(language_tokens) < len(wanted):
            raise RuntimeError(f'Some languages in {possible_languages} did not have associated language tokens')

    language_token_ids = tokenizer.convert_tokens_to_ids(language_tokens)

    # The decoder is primed with the start-of-transcript token only; the
    # position right after it is where the language token is predicted.
    # Look the id up instead of hard-coding 50258.
    sot_id = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
    batch_size = input_features.shape[0]
    decoder_input_ids = torch.full(
        (batch_size, 1), sot_id, dtype=torch.long, device=input_features.device
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_features, decoder_input_ids=decoder_input_ids).logits

    # Keep only the candidate-language logits at the first decoder position and
    # normalise over them (equivalent to masking every other token to -inf,
    # without mutating the model output).
    candidate_index = torch.tensor(language_token_ids, dtype=torch.long, device=logits.device)
    output_probs = logits[:, 0].index_select(-1, candidate_index).softmax(dim=-1).cpu()

    return [
        {
            lang: output_probs[input_idx, column].item()
            for column, lang in enumerate(language_tokens)
        }
        for input_idx in range(logits.shape[0])
    ]

In [18]:
from typing import List, Optional, Tuple

import numpy as np
import torch
from torch import Tensor
from whisper.model import Whisper
from whisper.tokenizer import Tokenizer, get_tokenizer

@torch.no_grad()
def detect_language(
    model: "Whisper", mel: Tensor, tokenizer: Optional["Tokenizer"] = None
) -> Tuple[Tensor, List[dict]]:
    """
    Detect the spoken language of one or more audio inputs.

    The decoder is run for a single step on the start-of-transcript token and
    the resulting logits are restricted to the language tokens. This is done
    outside the main decode loop in order to not interfere with kv-caching.

    Parameters
    ----------
    model : Whisper
        Model providing `encoder`, `logits`, `dims`, `is_multilingual` and
        `num_languages`.
    mel : Tensor
        Either a single input (2-D) or a batch (3-D). If its last two
        dimensions equal (n_audio_ctx, n_audio_state) it is treated as
        already-encoded audio features and the encoder pass is skipped.
    tokenizer : Tokenizer, optional
        Built from the model's settings via `get_tokenizer` when omitted.

    Returns
    -------
    language_tokens : Tensor, shape = (n_audio,)
        ids of the most probable language tokens, which appear after the
        startoftranscript token. A 0-d tensor when a single input was given.
    language_probs : List[Dict[str, float]], length = n_audio
        list of dictionaries containing the probability distribution over all
        languages. A single dictionary when a single input was given.

    Raises
    ------
    ValueError
        If the tokenizer has no language token in its start-of-transcript
        sequence.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer(
            model.is_multilingual, num_languages=model.num_languages
        )
    if (
        tokenizer.language is None
        or tokenizer.language_token not in tokenizer.sot_sequence
    ):
        raise ValueError(
            "This model doesn't have language tokens so it can't perform lang id"
        )

    # Treat an unbatched input as a batch of one and unwrap again at the end.
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    # skip encoder forward pass if already-encoded audio features were given
    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
        mel = model.encoder(mel)

    # forward pass using a single token, startoftranscript
    n_audio = mel.shape[0]
    sot_batch = torch.full(
        (n_audio, 1), tokenizer.sot, dtype=torch.long, device=mel.device
    )  # [n_audio, 1]
    logits = model.logits(sot_batch, mel)[:, 0]

    # collect detected languages; suppress all non-language tokens.
    # The mask lives on the same device as the logits, and masked_fill returns
    # a new tensor so the model's output is not modified in place.
    language_token_ids = list(tokenizer.all_language_tokens)
    non_language = torch.ones(
        logits.shape[-1], dtype=torch.bool, device=logits.device
    )
    non_language[language_token_ids] = False
    logits = logits.masked_fill(non_language, -np.inf)

    language_tokens = logits.argmax(dim=-1)
    language_token_probs = logits.softmax(dim=-1).cpu()
    language_probs = [
        {
            code: language_token_probs[i, token_id].item()
            for token_id, code in zip(language_token_ids, tokenizer.all_language_codes)
        }
        for i in range(n_audio)
    ]

    if single:
        language_tokens = language_tokens[0]
        language_probs = language_probs[0]

    return language_tokens, language_probs


In [19]:
# Run detect_language on the spectrogram; the cell displays the returned pair
# (most probable language token id, per-language probability dict).
detect_language(model, mel=mel)

range(0, 1)


(tensor(50273, device='cuda:0'),
 {'uz': 1.7174780486683972e-09,
  'ht': 6.1525997807621025e-06,
  'fr': 0.0012591310078278184,
  'yi': 2.4405899239354767e-06,
  'mk': 3.538264650160272e-07,
  'cs': 8.614607941126451e-05,
  'mg': 3.969078399279624e-09,
  'ta': 3.23603417200502e-05,
  'yo': 6.964402814446657e-07,
  'zh': 0.0011274180142208934,
  'mi': 2.076449163723737e-05,
  'pa': 5.417733063950436e-06,
  'nn': 0.004780774936079979,
  'id': 0.00020417656924109906,
  'ru': 0.003940355498343706,
  'sq': 4.905341484118253e-06,
  'tl': 0.0005684708594344556,
  'ln': 9.602927519836157e-08,
  'be': 4.160949174547568e-06,
  'lt': 4.7240453568520024e-05,
  'sw': 6.595484137505991e-06,
  'br': 2.3454422262148e-05,
  'fo': 5.391081231209682e-06,
  'et': 1.5284851542674005e-05,
  'as': 4.777654893928229e-08,
  'sa': 1.0819299859576859e-05,
  'th': 0.00018940492009278387,
  'mt': 1.5074173234097543e-06,
  'jw': 0.00011716379958670586,
  'ba': 4.263531305781498e-09,
  'sr': 2.2764468212699285e-06,
