# 1. Installing the Libraries

In [None]:
pip install transformers==4.28.1 soundfile sentencepiece torchaudio pydub

# 2. Importing

In [None]:
from transformers import *  # Used to generate features
import torch                # Used to create deep neural networks
import soundfile as sf      # Can read & write audio files
import os                   # Functions to interact with the OS of the device
import torchaudio           # Used for Audio signal processing with PyTorch

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# 3. Creating the wav2vec2 model

In [26]:
# Checkpoint: the large pretrained wav2vec2 model (~1.26 GB).
wav2vec2_model_name = "facebook/wav2vec2-large-960h-lv60-self"

# The processor turns raw audio into model inputs and decodes predicted IDs back to text.
wav2vec2_processor = Wav2Vec2Processor.from_pretrained(wav2vec2_model_name)

# Load the CTC model, then move it to the selected device.
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(wav2vec2_model_name)
wav2vec2_model = wav2vec2_model.to(device)

loading configuration file preprocessor_config.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--facebook--wav2vec2-large-960h-lv60-self\snapshots\54074b1c16f4de6a5ad59affb4caa8f2ea03a119\preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

loading configuration file config.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--facebook--wav2vec2-large-960h-lv60-self\snapshots\54074b1c16f4de6a5ad59affb4caa8f2ea03a119\config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-large-960h-lv60-self",
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token

All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Preparing the audio file
# Load the audio file and resample it to 16,000 Hz (the sampling rate the models expect)

def load_audio(audio_path, target_sr=16000):
  """Load an audio file as a mono, 1-D waveform sampled at ``target_sr`` Hz.

  Args:
    audio_path: Path of the audio file; passed straight to ``torchaudio.load``.
    target_sr: Sampling rate to resample to. Defaults to 16000, the rate the
      processors in this notebook are called with.

  Returns:
    A 1-D ``torch.Tensor`` holding the (possibly resampled) samples.
  """
  speech, sr = torchaudio.load(audio_path)
  # torchaudio returns a (channels, samples) tensor. Averaging over the channel
  # axis downmixes stereo to mono; squeeze() alone would leave multi-channel
  # audio 2-D and the downstream processors would not see a single waveform.
  if speech.dim() > 1:
    speech = speech.mean(dim=0)
  # Only build a resampler when the file is not already at the target rate.
  if sr != target_sr:
    speech = torchaudio.transforms.Resample(sr, target_sr)(speech)
  return speech

# Make a single transcription function that calls the model & its processor to transcribe

def get_transcription_wav2vec2(audio_path, model, processor):
  """Transcribe an audio file with a wav2vec2 CTC model.

  Args:
    audio_path: Path of the audio file to transcribe.
    model: The wav2vec2 CTC model used for inference.
    processor: The matching processor; builds the model inputs and decodes
      the predicted IDs.

  Returns:
    The lower-cased transcription of the audio as a string.
  """
  speech = load_audio(audio_path)
  inputs = processor(speech, return_tensors="pt", sampling_rate=16000)
  # Send the inputs to wherever the model actually lives instead of relying
  # on a notebook-global device variable.
  input_values = inputs["input_values"].to(model.device)
  # Inference only: disabling autograd avoids keeping intermediate activations
  # for a backward pass that never happens, which cuts memory use on long clips.
  with torch.no_grad():
    logits = model(input_values)["logits"]
  # Pick the most likely token ID for every frame.
  predicted_ids = torch.argmax(logits, dim=-1)
  transcription = processor.batch_decode(predicted_ids)[0]
  return transcription.lower()

In [33]:
# Testing the function is working correctly

print(
    "Wav2vec2:",
    get_transcription_wav2vec2(
        audio_path="sample_audios/short_audio3.wav",
        model=wav2vec2_model,
        processor=wav2vec2_processor,
    ),
)

Wav2vec2: and missus goddard three ladies almost always at the service of an invitation from hartfield and who were fetched and carried home so often that mister woodhouse thought it no hardship for either james or the horses had it taken place only once a year it would have been a grievance


# 4. Creating the Whisper model

In [15]:
# We've selected the medium-sized model for the Whisper toolkit. The larger model is 6 GB+

# Checkpoint: the multilingual medium-sized Whisper model (~3.06 GB).
whisper_model_name = "openai/whisper-medium"

# The processor builds the model inputs and decodes generated IDs back to text.
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)

# Load the Whisper model, then move it to the selected device.
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
whisper_model = whisper_model.to(device)

loading configuration file preprocessor_config.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--openai--whisper-medium\snapshots\8b6593e88fc558a10e78c6e5fa65311eddaa7c2a\preprocessor_config.json
Feature extractor WhisperFeatureExtractor {
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "WhisperProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

loading file vocab.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--openai--whisper-medium\snapshots\8b6593e88fc558a10e78c6e5fa65311eddaa7c2a\vocab.json
loading file tokenizer.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--openai--whisper-medium\snapshots\8b6593e88fc558a10e78c6e5fa65311eddaa7c2a\tokenizer.json
loading file merges.txt from cache at C:\Users\hiakh/.cache\h

All model checkpoint weights were used when initializing WhisperForConditionalGeneration.

All the weights of WhisperForConditionalGeneration were initialized from the model checkpoint at openai/whisper-medium.
If your task is similar to the task the model of the checkpoint was trained on, you can already use WhisperForConditionalGeneration for predictions without further training.
loading configuration file generation_config.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--openai--whisper-medium\snapshots\8b6593e88fc558a10e78c6e5fa65311eddaa7c2a\generation_config.json
Generate config GenerationConfig {
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "decoder_start_token_id": 50258,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      null
    ],
    [
      2,
      50359
    ]
  ],
  "is_multilingual": true,
  "lang_to_id": {
    "<|af|>": 50327,
    "<|am|>": 50334,
    "<|ar|>": 50272,
    "<|as|>": 50350,
    "<|az

In [16]:
# Transcription function for the Whisper Model

def get_transcription_whisper(audio_path, model, processor, language="english", skip_special_tokens=True):
  """Transcribe an audio file with a Whisper model.

  Args:
    audio_path: Path of the audio file to transcribe.
    model: The Whisper model whose ``generate`` method produces the token IDs.
    processor: The matching processor; builds the input features, supplies the
      decoder prompt IDs and decodes the generated IDs.
    language: Language passed to ``processor.get_decoder_prompt_ids``.
    skip_special_tokens: Forwarded to ``processor.batch_decode``.

  Returns:
    The transcription of the audio as a string.
  """
  # load_audio resamples whatever the file's sampling rate is to 16000
  speech = load_audio(audio_path)
  features = processor(speech, return_tensors="pt", sampling_rate=16000).input_features
  # Send the features to wherever the model actually lives instead of relying
  # on a notebook-global device variable.
  features = features.to(model.device)
  # Special decoder tokens that force the requested language and the transcribe task.
  forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
  predicted_ids = model.generate(features, forced_decoder_ids=forced_decoder_ids)
  # Decode the generated IDs to text; only the first item of the batch is returned.
  return processor.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)[0]

In [46]:
# Testing the Whisper model
# Testing the Urdu Support for Whisper Library

print(
    "Whisper_Urdu_Test_audio1:",
    get_transcription_whisper(
        audio_path="sample_audios/urdu_test_1.mp3",
        model=whisper_model,
        processor=whisper_processor,
        language="urdu",
    ),
    "\n",
)
print(
    "Whisper_Urdu_Test_audio2:",
    get_transcription_whisper(
        audio_path="sample_audios/UrduRecording.wav",
        model=whisper_model,
        processor=whisper_processor,
        language="urdu",
    ),
    "\n",
)
# No language argument here: the function's default language is used.
print(
    "Whisper_English_Test_audio1:",
    get_transcription_whisper(
        audio_path="sample_audios/short_audio3.wav",
        model=whisper_model,
        processor=whisper_processor,
    ),
    "\n",
)

Whisper_Urdu_Test_audio1:  انگریزی پاکستان کی سرکاری زبان ہے تمام معاہدے اور سرکاری کام انگریزی زبان میں ہی تیکیے جاتے ہیں جبکہ اردو پاکستان کی قومی زبان ہے 

Whisper_Urdu_Test_audio2:  میرا نام حسن امران ہے اور میں نیڈ یونوورسٹی کا طالب علم 

Whisper_English_Test_audio1:  and Mrs. Goddard, three ladies almost always at the service of an invitation from Hartfield, and who were fetched and carried home so often that Mr. Woodhouse sought it no hardship for either James or the horses. Had it taken place only once a year it would have been a grievance. 



In [37]:
# Testing the Whisper model

# Running an audio clip longer than about 20 seconds through Wav2vec2 will crash the browser tab.
# Running a long audio clip through Whisper only returns the transcription of the first 30 seconds.

print(
    "Whisper:",
    get_transcription_whisper(
        audio_path="sample_audios/long_audio_1.wav",
        model=whisper_model,
        processor=whisper_processor,
    ),
)
# print("Wav2vec2:", get_transcription_wav2vec2("sample_audios/long_audio_1.wav", wav2vec2_model, wav2vec2_processor))

Whisper:  The horse trotted around the field at a brisk pace. Find the twin who stole the pearl necklace. Cut the cord that binds the box tightly. The red tape bound the smuggled food. Look in the corner to find the tan shirt. The cold drizzle will halt.


# 5. Comparison of Wav2vec2 & Whisper model

In [45]:
# Comparison of both the models

# Run both models over the same clips so the transcripts can be read side by side.
comparison_clips = (
    "sample_audios/lathe.wav",
    "sample_audios/short_audio1.wav",
    "sample_audios/short_audio2.wav",
)

for clip_index, clip_path in enumerate(comparison_clips):
    # A separator line goes between clips, not before the first one.
    if clip_index > 0:
        print("---------------------------------------------------------------")
    print("Wav2vec2:", get_transcription_wav2vec2(clip_path, wav2vec2_model, wav2vec2_processor), "\n")
    print("Whisper2:", get_transcription_whisper(clip_path, whisper_model, whisper_processor), "\n")

Wav2vec2: a late is a big tool grab every dish of sugar 

Whisper2:  A lathe is a big tool. Grab every dish of sugar. 

---------------------------------------------------------------
Wav2vec2: his abode which he had fixed at a bowery or country seat at a short distance from the city just at what is now called dutch street soon abounded with proofs of his ingenuity patent smoke jacks that required a horse to work them dutch ovens that roasted meat without fire carts that went before the horses weathercocks that turned against the wind and other wrong headed contrivances that astonished and confounded all beholders 

Whisper2:  His abode, which he had fixed at a bowery, or country seat, at a short distance from the city, just at what is now called Dutch Street, soon abounded with proofs of his ingenuity. Patent smoke-jacks that required a horse to work them, Dutch ovens that roasted meat without fire, carts that went before the horses, weather-cocks that turned against the wind, and oth

# 6. Recording From within the Notebook using PyAudio

In [53]:
# To record audios directly from Jupyter notebook

import pyaudio
import wave

chunk = 1024  # Record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 1
fs = 16000  # Record at 16000 samples per second
seconds = 5  # Length of the recording in seconds
filename = "output.wav"

p = pyaudio.PyAudio()  # Create an interface to PortAudio

frames = []  # Raw byte buffers read from the input stream

try:
    # Query the sample width while the PortAudio interface is still open,
    # instead of after p.terminate().
    sample_width = p.get_sample_size(sample_format)

    print('Recording')

    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)
    try:
        # Store data in chunks for `seconds` seconds
        for _ in range(int(fs / chunk * seconds)):
            frames.append(stream.read(chunk))
    finally:
        # Stop and close the stream even if a read fails, so the input
        # device is always released.
        stream.stop_stream()
        stream.close()
finally:
    # Terminate the PortAudio interface
    p.terminate()

print('Finished recording')

# Save the recorded data as a WAV file; the context manager closes the file
# even if one of the writes raises.
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(sample_width)
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))

Recording
Finished recording


## 6.1. Reading the recorded file 

In [54]:
# To read the audio file after recording

print(
    "Whisper:",
    get_transcription_whisper(
        audio_path="output.wav",
        model=whisper_model,
        processor=whisper_processor,
    ),
)
print(
    "Wav2vec2:",
    get_transcription_wav2vec2(
        audio_path="output.wav",
        model=wav2vec2_model,
        processor=wav2vec2_processor,
    ),
)

Whisper:  1, 2, 3 testing 1, 2, 3 this is a recorded file
Wav2vec2: two three testing one two three this is a recorded file


# 7. Transcribing longer audios (>30 seconds) with timestamps 

In [46]:
# Transcribing long audios with TimeStamps

# Initialize the pipeline API (provided by the transformers library) to break the file into chunks

# Build the speech-recognition pipeline from the Whisper checkpoint name.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=whisper_model_name,
    device=device,
)

def get_long_transcription_whisper(audio_path, pipe, return_timestamps=True, chunk_length_s=10, stride_length_s=2):
    """Transcribe an audio file through a speech-recognition pipeline.

    Args:
        audio_path: Path of the audio file; loaded with ``load_audio``.
        pipe: The pipeline callable that performs the transcription.
        return_timestamps: Forwarded to ``pipe``.
        chunk_length_s: Forwarded to ``pipe``.
        stride_length_s: Forwarded to ``pipe``.

    Returns:
        Whatever ``pipe`` returns for the given waveform and options.
    """
    # The pipeline is given the waveform as a NumPy array rather than a tensor.
    waveform = load_audio(audio_path).numpy()
    return pipe(
        waveform,
        return_timestamps=return_timestamps,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_length_s,
    )

loading configuration file config.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--openai--whisper-medium\snapshots\8b6593e88fc558a10e78c6e5fa65311eddaa7c2a\config.json
Model config WhisperConfig {
  "_name_or_path": "openai/whisper-medium",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 24,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 24,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]


All model checkpoint weights were used when initializing WhisperForConditionalGeneration.

All the weights of WhisperForConditionalGeneration were initialized from the model checkpoint at openai/whisper-medium.
If your task is similar to the task the model of the checkpoint was trained on, you can already use WhisperForConditionalGeneration for predictions without further training.
loading configuration file generation_config.json from cache at C:\Users\hiakh/.cache\huggingface\hub\models--openai--whisper-medium\snapshots\8b6593e88fc558a10e78c6e5fa65311eddaa7c2a\generation_config.json
Generate config GenerationConfig {
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "decoder_start_token_id": 50258,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      null
    ],
    [
      2,
      50359
    ]
  ],
  "is_multilingual": true,
  "lang_to_id": {
    "<|af|>": 50327,
    "<|am|>": 50334,
    "<|ar|>": 50272,
    "<|as|>": 50350,
    "<|az

In [50]:
# get the transcription of a sample long audio file
output = get_long_transcription_whisper(
    "sample_audios/long_audio_1.wav",
    pipe,
    chunk_length_s=10,
    stride_length_s=1,
)

# One line per chunk: its timestamp followed by the transcribed text.
for chunk in output["chunks"]:
  print(f'{chunk["timestamp"]} : {chunk["text"]}')

(0.0, 6.0) :  The horse trotted around the field at a brisk pace.
(6.0, 12.8) :  Find the twin who stole the pearl necklace.
(12.8, 21.0) :  Cut the cord that binds the box tightly. The The red tape bound the smuggled food.
(21.0, 38.0) :  Look in the corner to find the tan shirt. The cold drizzle will halt the bond drive. Nine men were hired to dig the ruins.
(38.0, 58.0) :  The junkyard had a moldy smell. The flint sputtered and lit a pine torch. Soak the cloth and drown the sharp odor..
