In [1]:
import os
import torch
import torchaudio
import pandas as pd
from evaluate import load
from tqdm.notebook import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor

In [2]:
DATASET_PATH = "../data/common_voice"

HOTWORDS = ["be careful", "destroy", "stranger"]

In [3]:
# load the finetuned model from task 3
model = Wav2Vec2ForCTC.from_pretrained('../asr-train/wav2vec2-large-960h-cv')
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')

In [4]:
df = pd.read_csv("../data/common_voice/cv-valid-dev.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

hotword_filenames = []

for filename in tqdm(df['filename']):
    filepath = os.path.join(DATASET_PATH, 'cv-valid-dev', filename)
    waveform, sample_rate = torchaudio.load(filepath)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    # Convert to 1D mono waveform from stereo if necessary
    # torchaudio returns shape [channels, time]
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0)  # convert to mono
    else:
        waveform = waveform.squeeze(0)

    # Extract features
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

    # Move inputs to GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode prediction to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    transcription = ''.join(transcription).lower()

    for hotword in HOTWORDS:
        if hotword in transcription.split(): # we assume that we want an exact match of the word
            print(f"Found hotword: {hotword} | {transcription} | {filename}")
            hotword_filenames.append(filename)    

  0%|          | 0/4076 [00:00<?, ?it/s]

Found hotword: stranger | be careful with your prognostications said the stranger | cv-valid-dev/sample-000000.mp3
Found hotword: stranger | the stranger seemed satisfied with the answer | cv-valid-dev/sample-000089.mp3
Found hotword: stranger | i had to test your courage the stranger said | cv-valid-dev/sample-000508.mp3
Found hotword: stranger | i had to test your courage the stranger said | cv-valid-dev/sample-000674.mp3
Found hotword: stranger | be careful with your prognostications said the stranger | cv-valid-dev/sample-001093.mp3
Found hotword: stranger | the stranger was speaking of things that very few people knew about | cv-valid-dev/sample-001101.mp3
Found hotword: stranger | the stranger was speaking of things that very few people knew about | cv-valid-dev/sample-001243.mp3
Found hotword: stranger | i had to test your courage the stranger said | cv-valid-dev/sample-001501.mp3
Found hotword: stranger | the stranger seemed satisfied with the answer | cv-valid-dev/sample-00193

In [5]:
with open('detected.txt', 'w') as f:
    for item in hotword_filenames:
        f.write(f"{item}\n")