In [1]:
!pip install torch transformers accelerate



In [1]:
import torch
import accelerate
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [2]:
# Use the first CUDA device with float16 weights when torch can see a GPU;
# otherwise fall back to the CPU with float32 weights.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint identifier handed to from_pretrained (here and for the processor).
model_id = "openai/whisper-base"

# Load the speech seq2seq model in the selected dtype, requesting safetensors weights.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    torch_dtype=torch_dtype,
)
# Move the model to the selected device. Being the last expression of the
# cell, the value returned by .to() is also echoed as the cell output.
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 512)
      (layers): ModuleList(
        (0-5): 6 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=False)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          

In [3]:
# Processor for the same checkpoint as the model; it supplies the tokenizer
# and the feature extractor used by the pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline wrapped around the already-loaded model.
pipe = pipeline(
    task="automatic-speech-recognition",
    # --- components ---
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # --- placement / precision (same values the model was loaded with) ---
    device=device,
    torch_dtype=torch_dtype,
    # --- inference settings ---
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
import sounddevice as sd
from scipy.io.wavfile import write
import ffmpeg

In [5]:
def recordaudio(filename, duration=3, fs=44100):
    """Record audio, save it as a WAV file and transcribe it.

    Captures ``duration`` seconds of single-channel audio at ``fs`` Hz via
    ``sounddevice``, writes the samples to ``filename`` and passes that file
    to the notebook-level ``pipe`` with the language set to ``'en'``.
    Progress messages, the raw pipeline result and its ``'text'`` entry are
    printed along the way.

    Args:
        filename: Destination of the WAV file (``str`` or ``os.PathLike``).
            Missing parent directories are created.
        duration: Recording length in seconds.
        fs: Sampling rate in Hz.

    Returns:
        The object returned by ``pipe`` for the recording (previously the
        result was only printed and then discarded).

    Raises:
        ValueError: If ``duration * fs`` does not amount to at least one frame.
    """
    import os  # local import so the cell does not rely on another cell for it

    frames = int(duration * fs)
    if frames <= 0:
        raise ValueError(
            f'nothing to record: duration={duration!r} and fs={fs!r} give {frames} frames'
        )

    # Normalise path-like objects to a plain path, and make sure the target
    # directory exists so the write below cannot fail on a missing folder
    # (e.g. 'recordings/' on a fresh checkout).
    path = os.fspath(filename)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    print('recording...')
    recording = sd.rec(frames, samplerate=fs, channels=1)
    sd.wait()  # block until the recording has finished
    print('done waiting')
    write(path, fs, recording)  # save as WAV file
    print('written to file')
    result = pipe(path, generate_kwargs={'language': 'en'})
    print(f'finished recording, file saved as {filename}')
    print(result)
    print(result['text'])
    return result

In [7]:
# Record with the function's default duration and sampling rate, then transcribe.
recordaudio(filename='recordings/test.wav')

recording...
done waiting
written to file
finished recording, file saved as recordings/test.wav
{'text': " Hello, I'm Ying Yu.", 'chunks': [{'timestamp': (0.0, 2.0), 'text': " Hello, I'm Ying Yu."}]}
 Hello, I'm Ying Yu.


In [13]:
# Backup cells below: earlier experiments whose saved outputs may predate the current recordaudio().

In [31]:
# Transcribe the saved recording through a project-relative path instead of a
# machine-specific absolute one, so the cell also works on other machines.
# NOTE(review): assumes the kernel's working directory is the project root, so
# that this is the same file recordaudio('recordings/test.wav') writes — confirm.
result = pipe('recordings/test.wav', generate_kwargs={'language': 'en'})

In [32]:
# Show the raw pipeline output currently held in `result`.
print(result)

{'text': ' it', 'chunks': [{'timestamp': (0.0, 3.0), 'text': ' it'}]}


In [34]:
# Same record-and-transcribe flow, saving to a bare file name with an explicit 3-second duration.
recordaudio(filename="test.wav", duration=3)

recording...
done waiting
done writing
 I'm trying something now.
finished recording, file saved as test.wav
