<a href="https://colab.research.google.com/github/dodorlee1210/deepfake_audio_detection/blob/main/WhisperIntro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download

In [3]:
# https://github.com/piotrkawa/deepfake-whisper-features

In [4]:
# install whisper model
!pip install git+https://github.com/openai/whisper.git

# jiwer is python package used for ASR evaluation
!pip install jiwer

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-u77jflp8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-u77jflp8
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20240930)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-

In [5]:
import os
import numpy as np
try:
    import tensorflow
except ImportError:
    pass

import torch
import pandas as pd
import whisper

# torchaudio provides easy access to common, publicly accessible datasets
import torchaudio

from tqdm.notebook import tqdm

# Device string used for tensors in this notebook: the GPU when CUDA is
# available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Torchaudio Dataset

In [6]:
# The OpenAI example notebook uses LibriSpeech; this notebook uses LibriLightLimited instead,
# a more recent dataset (2020).
# https://pytorch.org/audio/2.5.0/generated/torchaudio.datasets.LibriLightLimited.html#torchaudio.datasets.LibriLightLimited

class LibriLightLimited(torch.utils.data.Dataset):
    """
    Thin wrapper over ``torchaudio.datasets.LibriLightLimited`` that yields
    ``(log-mel spectrogram, transcript)`` pairs.

    Every waveform is flattened and padded/trimmed to 30 seconds with
    ``whisper.pad_or_trim`` before its spectrogram is computed.
    """

    def __init__(self, subset="10min", device=DEVICE):
        # The data is downloaded into the user's cache directory if needed.
        cache_root = os.path.expanduser("~/.cache")
        self.dataset = torchaudio.datasets.LibriLightLimited(
            root=cache_root, subset=subset, download=True
        )
        # Device the padded audio is moved to before the spectrogram is built.
        self.device = device

    def __len__(self):
        # Same number of items as the wrapped torchaudio dataset.
        return len(self.dataset)

    def __getitem__(self, item):
        # Only the first three fields of the 6-tuple are used here.
        waveform, sample_rate, transcript, _, _, _ = self.dataset[item]
        assert sample_rate == 16000

        padded = whisper.pad_or_trim(waveform.flatten()).to(self.device)
        return whisper.log_mel_spectrogram(padded), transcript

In [7]:
# Wrap the "10min" subset and serve it in batches of 16 items.
dataset = LibriLightLimited(subset="10min")
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=16)

100%|██████████| 570M/570M [00:09<00:00, 65.2MB/s]


# Running Inference on Data
### Note: it takes about 7 minutes to get through 1 of the 3 batches (1/3 on the progress bar)

In [8]:
# Tiny Whisper model, English-only checkpoint
# https://huggingface.co/openai/whisper-tiny.en
# https://github.com/openai/whisper/blob/main/whisper/model.py

# Load the model onto DEVICE explicitly: the dataset moves its audio to DEVICE,
# so the model and the spectrograms it decodes must live on the same device.
model = whisper.load_model("tiny.en", device=DEVICE)

100%|█████████████████████████████████████| 72.1M/72.1M [00:03<00:00, 24.6MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [27]:
# DecodingOptions requests half precision (fp16=True) unless told otherwise.
# Only ask for fp16 when decoding on the GPU, so the CPU fallback selected by
# DEVICE decodes in fp32 instead.
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=False,
    fp16=(DEVICE == "cuda"),
)

hypotheses = []
references = []

# model.decode returns one result per item in the batch. Use extend() rather
# than append() so both lists stay flat: one entry per utterance (48 in the
# recorded run) instead of one nested list per batch (3).
for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses.extend(result.text for result in results)
    references.extend(texts)

  0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
# One row per decoded item: the model's transcription next to the dataset's transcript.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,permission to accept such additional employmen...,PERMISSION TO ACCEPT SUCH ADDITIONAL EMPLOYMEN...
1,"When evening came, I prepared to step over in ...",WHEN EVENING CAME I PREPARED TO STEP OVER IN O...
2,I remember very well that before quitting my c...,I REMEMBER VERY WELL THAT BEFORE QUITTING MY C...
3,She is some stiff old maid. For though the dau...,SHE IS SOME STIFF OLD MAID FOR THOUGH THE DAUG...
4,"and no dressing can make me so, therefore I'll...",AND NO DRESSING CAN MAKE ME SO THEREFORE I'LL ...
5,With sunk dark eyes and a large square forehea...,WITH SUNK DARK EYES UNDER A LARGE SQUARE FOREH...
6,In a moment I had pulled the bow. In another m...,IN A MOMENT I HAD PULLED THE BELL IN ANOTHER M...
7,"through which I saw shrubs and a grass plot, l...",THROUGH WHICH I SAW SHRUBS AND A GRASS PLAT LO...
8,"The portraits, after having answered in the af...",THE PORTRESS AFTER HAVING ANSWERED IN THE AFFI...
9,"with a very well-painted, highly-varnished flo...",WITH A VERY WELL PAINTED HIGHLY VARNISHED FLOO...


# Word Error Rate

In [32]:
import jiwer
# Whisper's English text normalizer (source code, then a write-up about it):
# https://github.com/openai/whisper/blob/main/whisper/normalizers/english.py
# https://www.restack.io/p/transformer-models-whisper-answer-normalizer-cat-ai
from whisper.normalizers import EnglishTextNormalizer

# Single normalizer instance, reused for every hypothesis and reference string.
normalizer = EnglishTextNormalizer()

In [35]:
# Run both text columns through the same normalizer so the hypothesis and the
# reference are compared in one canonical form.
data["cleaned_hypothesis"] = data["hypothesis"].map(normalizer)
data["cleaned_reference"] = data["reference"].map(normalizer)

data

Unnamed: 0,hypothesis,reference,cleaned_hypothesis,cleaned_reference
0,permission to accept such additional employmen...,PERMISSION TO ACCEPT SUCH ADDITIONAL EMPLOYMEN...,permission to accept such additional employmen...,permission to accept such additional employmen...
1,"When evening came, I prepared to step over in ...",WHEN EVENING CAME I PREPARED TO STEP OVER IN O...,when evening came i prepared to step over in o...,when evening came i prepared to step over in o...
2,I remember very well that before quitting my c...,I REMEMBER VERY WELL THAT BEFORE QUITTING MY C...,i remember very well that before quitting my c...,i remember very well that before quitting my c...
3,She is some stiff old maid. For though the dau...,SHE IS SOME STIFF OLD MAID FOR THOUGH THE DAUG...,she is some stiff old maid for though the daug...,she is some stiff old maid for though the daug...
4,"and no dressing can make me so, therefore I'll...",AND NO DRESSING CAN MAKE ME SO THEREFORE I'LL ...,and no dressing can make me so therefore i wil...,and no dressing can make me so therefore i wil...
5,With sunk dark eyes and a large square forehea...,WITH SUNK DARK EYES UNDER A LARGE SQUARE FOREH...,with sunk dark eyes and a large square forehea...,with sunk dark eyes under a large square foreh...
6,In a moment I had pulled the bow. In another m...,IN A MOMENT I HAD PULLED THE BELL IN ANOTHER M...,in a moment i had pulled the bow in another mo...,in a moment i had pulled the bell in another m...
7,"through which I saw shrubs and a grass plot, l...",THROUGH WHICH I SAW SHRUBS AND A GRASS PLAT LO...,through which i saw shrubs and a grass plot lo...,through which i saw shrubs and a grass plat lo...
8,"The portraits, after having answered in the af...",THE PORTRESS AFTER HAVING ANSWERED IN THE AFFI...,the portraits after having answered in the aff...,the portress after having answered in the affi...
9,"with a very well-painted, highly-varnished flo...",WITH A VERY WELL PAINTED HIGHLY VARNISHED FLOO...,with a very well painted highly varnished floo...,with a very well painted highly varnished floo...


In [36]:
# jiwer.wer takes the reference transcripts first, then the hypotheses.
reference_texts = data["cleaned_reference"].tolist()
hypothesis_texts = data["cleaned_hypothesis"].tolist()

wer = jiwer.wer(reference_texts, hypothesis_texts)
print(f"WER: {wer * 100:.2f}%")

WER: 4.77%
