In [4]:
import torch
from transformers import pipeline
import base64
from scipy.io import wavfile
import io
import numpy as np
from transformers.utils import is_flash_attn_2_available

class Transcriber:
    """Speech-to-text helper wrapping a Hugging Face ASR pipeline.

    Besides transcription, the class offers two small helpers for moving WAV
    audio around as base64: ``encode_audio`` (file -> base64 bytes) and
    ``decode_audio_to_np_array`` (base64 -> sample rate + float32 samples).
    """

    def __init__(self, model_name="openai/whisper-tiny", device=None):
        """Create the underlying ``automatic-speech-recognition`` pipeline.

        Args:
            model_name: Model identifier (or object) handed to ``pipeline``.
                Surrounding whitespace in a string identifier is stripped.
            device: Device passed to ``pipeline``. When falsy, ``"cuda:0"`` is
                used if CUDA is available, otherwise ``"cpu"``.
        """
        # A repo id with stray whitespace cannot be resolved, so normalise it.
        if isinstance(model_name, str):
            model_name = model_name.strip()

        self.device = device if device else "cuda:0" if torch.cuda.is_available() else "cpu"

        # Use FlashAttention-2 when the package is installed, else PyTorch SDPA.
        attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,
            device=self.device,
            model_kwargs={"attn_implementation": attn_implementation},
        )

    def transcribe(self, audio_array, sampling_rate, batch_size=8):
        """Transcribe raw audio samples and return the recognised text.

        Args:
            audio_array: Audio samples, forwarded to the pipeline under the
                ``"array"`` key.
            sampling_rate: Sample rate of ``audio_array``, forwarded under the
                ``"sampling_rate"`` key.
            batch_size: Batch size forwarded to the pipeline call.

        Returns:
            The ``"text"`` entry of the pipeline result.
        """
        # The pipeline accepts raw audio as a dict with these two keys.
        audio_dict = {"array": audio_array, "sampling_rate": sampling_rate}
        return self.pipe(audio_dict, batch_size=batch_size)["text"]

    def encode_audio(self, audio_file):
        """Read the file at ``audio_file`` and return its bytes base64-encoded.

        Args:
            audio_file: Path of the file to read (opened in binary mode).

        Returns:
            ``bytes`` containing the base64 encoding of the file content.
        """
        # Distinct handle name: do not rebind the path parameter to the file object.
        with open(audio_file, 'rb') as audio_handle:
            encoded_audio = base64.b64encode(audio_handle.read())
        return encoded_audio

    def decode_audio_to_np_array(self, encoded_audio):
        """Decode base64 WAV data into a peak-normalised float32 array.

        Args:
            encoded_audio: Base64-encoded content of a WAV file.

        Returns:
            Tuple ``(sr, audio)``: the sample rate reported by
            ``scipy.io.wavfile.read`` and the samples as ``float32`` divided by
            their absolute peak. Silent or empty audio is returned unscaled
            instead of becoming NaN (0/0) or raising on an empty reduction.
        """
        decoded_audio = base64.b64decode(encoded_audio)
        sr, audio = wavfile.read(io.BytesIO(decoded_audio))
        audio = audio.astype(np.float32)

        # Guard the normalisation: np.max fails on empty input, and a zero
        # peak would turn every sample into NaN.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak
        return sr, audio

# Usage example:
# transcriber = Transcriber()
# encoded_audio = transcriber.encode_audio("audio.wav")  # base64-encoded WAV bytes
# sr, audio_array = transcriber.decode_audio_to_np_array(encoded_audio)
# transcription = transcriber.transcribe(audio_array, sr)


In [5]:
# Initialize the Transcriber with its default arguments. Constructing it builds
# the ASR pipeline, so this lives in its own cell and the instance is reused by
# the cells below instead of being re-created on every run.
transcriber = Transcriber()


In [6]:
import time

# Measure the whole round trip (encode -> decode -> transcribe) with a
# monotonic clock; time.time() follows the wall clock and can jump if the
# system time is adjusted mid-run.
start = time.perf_counter()

# Path to the WAV file
wav_file = "medium.wav"

# Convert wav file to base64
encoded_audio = transcriber.encode_audio(wav_file)

# Convert base64 to numpy array and get sampling rate
sampling_rate, audio_np_array = transcriber.decode_audio_to_np_array(encoded_audio)

# Transcribe the audio
text = transcriber.transcribe(audio_np_array, sampling_rate)

end = time.perf_counter()

print("Time taken: ", end - start)
print(text)

# Save text to file. The encoding is explicit so non-ASCII characters in the
# transcript are written the same way regardless of the platform's locale.
# NOTE(review): the output name "small.txt" does not match the "medium.wav"
# input — confirm this is the intended destination.
with open("small.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)

Time taken:  9.825674533843994
 Henry F. Phillips from Wikipedia, the free encyclopedia at EN. Wikipedia.org Henry F. Phillips from Wikipedia, the free encyclopedia Henry F. Phillips, 1890 to 1958, a US businessman from Portland, Oregon, has the honor of having the Phillips-head screw and screwdriver named after him. The importance of the cross-head screw design lies in its self itself centering property, useful on automated production lines that use powered screw drivers. Phillips major contribution was in driving the crosshead concept forward to the point where it was adopted by screw makers and automobile companies. Although he received patents for the design in 1936, US patent number 2,046,343, US patents 2,046,837 to 2,046,837 to $2,046,840. It was so widely copied that by 1949, Phillips lost his patent. The American screw company was responsible for devising a means of manufacturing the screw and successfully patented and licensed their method. Other screw makers of the 1930s dis