In [1]:
from transformers import pipeline
from datasets import load_dataset

In [6]:
# Build a speech-recognition pipeline backed by the SpeechT5 ASR checkpoint.
generator = pipeline(
    model="microsoft/speecht5_asr",
    task="automatic-speech-recognition",
)

In [3]:
# Load the "clean" validation split of the LibriSpeech demo set and sort it by
# id before indexing, so the selected example does not depend on the stored
# row order.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
).sort("id")

# Single utterance used throughout the rest of the notebook.
example = dataset[40]

Found cached dataset librispeech_asr_demo (/export/home/lium/bdos/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [7]:
# Hand the pipeline the waveform together with its sampling rate. A bare array
# carries no rate information, so the pipeline has to assume it already matches
# the rate the model expects; the dict form lets it resample when they differ.
transcription = generator(
    {
        "raw": example["audio"]["array"],
        "sampling_rate": example["audio"]["sampling_rate"],
    }
)



In [8]:
# Show the pipeline result (a dict with a 'text' entry).
transcription

{'text': 'a man said to the universe sir i exist'}

## Alternative: calling the processor and model directly (without `pipeline`)

In [23]:
from transformers import SpeechT5Processor, SpeechT5ForSpeechToText, SpeechT5ForTextToSpeech
import torch
# The processor and the speech-to-text model come from the same checkpoint,
# so load both from a single identifier (processor first, then model).
processor, model = (
    loader.from_pretrained("microsoft/speecht5_asr")
    for loader in (SpeechT5Processor, SpeechT5ForSpeechToText)
)

In [9]:
# Sampling rate declared by the dataset's audio column.
sampling_rate = dataset.features["audio"].sampling_rate

# Turn the raw waveform into PyTorch tensors for the model.
inputs = processor(
    audio=example["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
)

In [10]:
# Generate token ids from the audio features (max_length=100 bounds the
# generated sequence), then decode them to text without special tokens.
predicted_ids = model.generate(max_length=100, **inputs)
transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)

In [11]:
# Show the decoded text; batch_decode returns a list, one string per sequence.
transcription

['a man said to the universe sir i exist']

In [21]:
# Run only the speech encoder of the ASR model on the processed audio.
# Gradient tracking is switched off for the forward pass.
speech_encoder = model.speecht5.encoder
with torch.no_grad():
    out = speech_encoder(**inputs)

In [22]:
# Check whether the encoder output tracks gradients; it was computed inside
# torch.no_grad(), and the recorded output is False.
out.last_hidden_state.requires_grad

False

In [24]:
# Load the text-to-speech variant of SpeechT5 to get access to its text encoder.
tts_checkpoint = "microsoft/speecht5_tts"
model_text = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)

In [25]:
# Reference transcript stored with the selected example.
example["text"]

'A MAN SAID TO THE UNIVERSE SIR I EXIST'

In [26]:
# Tokenize the reference transcript into PyTorch tensors.
# NOTE(review): `processor` was loaded from the ASR checkpoint while the text
# encoder belongs to the TTS checkpoint -- presumably both share the same
# tokenizer; confirm.
reference_text = example["text"]
inputs_text = processor(text=reference_text, return_tensors="pt")

In [31]:
# Run only the text encoder of the TTS model on the tokenized transcript.
# The attention mask produced by the processor is forwarded as well: it is all
# ones for this single unpadded sentence, but without it padded positions would
# be attended to as soon as the input is a padded batch.
with torch.no_grad():
    out_text = model_text.speecht5.encoder(
        inputs_text.input_ids,
        attention_mask=inputs_text.attention_mask,
    )

In [32]:
# Shape of the text-encoder output; the recorded output is [1, 40, 768],
# where 40 matches the number of input token ids shown for `inputs_text`.
out_text.last_hidden_state.shape

torch.Size([1, 40, 768])

In [28]:
# Inspect the tokenized transcript (input ids and attention mask).
inputs_text

{'input_ids': tensor([[ 4, 33,  4, 43, 33, 48,  4, 34, 33, 30, 51,  4, 32, 50,  4, 32, 35, 47,
          4, 61, 48, 30, 62, 47, 54, 34, 47,  4, 34, 30, 54,  4, 30,  4, 47, 69,
         30, 34, 32,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [29]:
# Print the module structure of the TTS model's encoder (text prenet + wrapped encoder).
model_text.speecht5.encoder

SpeechT5EncoderWithTextPrenet(
  (prenet): SpeechT5TextEncoderPrenet(
    (embed_tokens): Embedding(81, 768, padding_idx=1)
    (encode_positions): SpeechT5ScaledPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (wrapped_encoder): SpeechT5Encoder(
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x SpeechT5EncoderLayer(
        (attention): SpeechT5Attention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (feed_forward): SpeechT5FeedForward(
          (intermediate_d

In [30]:
# Print the module structure of the ASR model's encoder (speech prenet), for
# comparison with the text encoder printed in the previous cell.
model.speecht5.encoder

SpeechT5EncoderWithSpeechPrenet(
  (prenet): SpeechT5SpeechEncoderPrenet(
    (feature_encoder): SpeechT5FeatureEncoder(
      (conv_layers): ModuleList(
        (0): SpeechT5GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x SpeechT5NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x SpeechT5NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): SpeechT5FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace