In [None]:
%pip install datasets huggingface-hub datasets torch transformers accelerate librosa soundfile

Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
Collecting transformers
  Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting accelerate
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Collecting regex!=2019.12.17
  Using cached regex-2024.4.16-cp37-cp37m-macosx_10_9_x86_64.whl (297 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl (4.0 MB)
Collecting safetensors>=0.3.1
  Using cached safetensors-0.5.3.tar.gz (67 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting torch>=1.6.0
  Using cached torch-1.13.1-cp37-none-macosx_10_9_x86_64.whl (135.3 MB)
Using legacy setup.py install for pytorch, since package 'wheel' is not installed.
Building wheels for collected packages: safetensors
  Building wheel for safetensors (PEP 517) ... [?25lerror
[31m  ERROR: Comm

In [1]:
# Import required libraries
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import librosa
import soundfile

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first GPU in half precision when CUDA is available,
# otherwise on the CPU in single precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

PRIMARY_MODEL_ID = "openai/whisper-large-v3"
FALLBACK_MODEL_ID = "openai/whisper-base"  # tried only if the primary checkpoint fails


def _load_processor_parts(model_id):
    """Load the tokenizer and feature extractor for ``model_id``.

    ``AutoProcessor`` is tried first. If it raises, the error is printed and
    the Whisper tokenizer and feature extractor are loaded individually.

    Args:
        model_id: Hub identifier of the checkpoint.

    Returns:
        ``(processor, tokenizer, feature_extractor)``. ``processor`` is
        ``None`` when the two parts had to be loaded individually.

    Raises:
        Exception: whatever the individual loaders raise when the fallback
            path fails as well.
    """
    try:
        processor = AutoProcessor.from_pretrained(model_id)
        return processor, processor.tokenizer, processor.feature_extractor
    except Exception as e:
        print(f"AutoProcessor failed: {e}")
        print("Loading tokenizer and feature extractor separately...")
        # Imported lazily: only needed on this recovery path.
        from transformers import WhisperTokenizer, WhisperFeatureExtractor
        tokenizer = WhisperTokenizer.from_pretrained(model_id)
        feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)
        return None, tokenizer, feature_extractor


def _build_asr_pipeline(model_id, **model_kwargs):
    """Build the speech-recognition pipeline for ``model_id``.

    The tokenizer and feature extractor are loaded *before* the model weights,
    so a checkpoint whose tokenizer cannot be read fails before the much
    larger weights are loaded.

    Args:
        model_id: Hub identifier of the checkpoint.
        **model_kwargs: Extra keyword arguments forwarded to
            ``AutoModelForSpeechSeq2Seq.from_pretrained``
            (for example ``use_safetensors=True``).

    Returns:
        ``(model, processor, tokenizer, feature_extractor, pipe)``.
    """
    processor, tokenizer, feature_extractor = _load_processor_parts(model_id)

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        **model_kwargs,
    )
    model.to(device)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return model, processor, tokenizer, feature_extractor, pipe


model_id = PRIMARY_MODEL_ID

# Load model and processor with error handling
try:
    model, processor, tokenizer, feature_extractor, pipe = _build_asr_pipeline(
        model_id, use_safetensors=True
    )
    print("Model and pipeline loaded successfully!")

except Exception as e:
    # NOTE(review): in the recorded run this branch was reached because the
    # large-v3 tokenizer could not be loaded ("Wrong index found for
    # <|0.02|>"), presumably due to the old transformers version in the
    # install log - confirm. Everything after this point then runs on the
    # much smaller base model.
    print(f"Error loading model: {e}")
    print("Trying with a different Whisper model...")

    # Fallback to a more stable model. This call is deliberately unguarded:
    # if the fallback fails too, the error should surface.
    model_id = FALLBACK_MODEL_ID
    model, processor, tokenizer, feature_extractor, pipe = _build_asr_pipeline(model_id)
    print("Fallback model loaded successfully!")


AutoProcessor failed: Wrong index found for <|0.02|>: should be None but found 50366.
Loading tokenizer and feature extractor separately...
Error loading model: Wrong index found for <|0.02|>: should be None but found 50366.
Trying with a different Whisper model...
Fallback model loaded successfully!


In [8]:
# Test the pipeline with sample data
try:
    # Load a small dataset for testing
    dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation[:1]")
    sample = dataset[0]["audio"]

    print(sample)
    
    print("Processing audio sample...")
    result = pipe(sample)
    print(f"Transcription: {result['text']}")
except Exception as e:
    print(f"Error processing audio: {e}")


Found cached dataset parquet (/Users/henry/.cache/huggingface/datasets/distil-whisper___parquet/clean-8d6dc0c0993e781e/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


{'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
       0.0005188 ]), 'sampling_rate': 16000}
Processing audio sample...




Transcription:  Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly he's drawn from eating and its results occur most readily to the mind. He has graved doubts whether Sir Frederick Layton's work is really Greek after all, and can discover


In [7]:
# Inspect the audio sample loaded by the test cell.
# NOTE(review): the recorded output is `{}` - presumably an earlier `pipe(sample)`
# call removed the dict's keys in place; confirm. This cell's execution count (7)
# also precedes the test cell's (8), so the output is not reproducible with
# Restart & Run All as saved.
sample

{}

In [None]:
# NOTE(review): leftover debugging cell. In the cells shown here `e` is only bound
# by `except ... as e` clauses, and Python unbinds that name when the clause ends,
# so this likely raises NameError on a fresh Run All - consider removing the cell.
e