## Settings


In [None]:
from pathlib import Path

# Root folder of the locally stored pretrained models; engine-specific
# sub-folders ('deepspeech', 'speechbrain') are resolved relative to it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
model_folder = Path("/Users/hudsonmendes/Models/pretrained")
# Folder that is scanned recursively for the .wav files to transcribe.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
audio_folder = Path("/Users/hudsonmendes/Workspaces/hudsonmendes-estudos/cm3065-isp/exercise-04/files/Ex4_audio_files")


## Dependencies


In [None]:
import os
from typing import Dict, List, Optional

import numpy as np


# Data

In [None]:
def collect_files_per_locale(folder: Path) -> Dict[str, List[Path]]:
    """Group every ``.wav`` file found under *folder* by its locale.

    The locale of a file is the name of the directory that directly contains
    it, lower-cased and stripped of surrounding whitespace.

    Args:
        folder: Root directory that is walked recursively.

    Returns:
        A mapping from locale to the list of ``.wav`` paths found for it.
        Directories and file names are visited in sorted order, so the lists
        (and therefore "the first file of a locale") are reproducible between
        runs and machines.
    """
    files_per_locale: Dict[str, List[Path]] = {}
    for root, dirnames, filenames in os.walk(folder):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        # Take the directory name through pathlib instead of splitting on "/",
        # so the lookup does not depend on the platform's path separator.
        locale = Path(root).name.lower().strip()
        for filename in sorted(filenames):
            if filename.endswith(".wav"):
                files_per_locale.setdefault(locale, []).append(Path(root) / filename)
    return files_per_locale


In [None]:
# Index the .wav files under audio_folder by locale (name of the containing directory).
files_per_locale = collect_files_per_locale(folder=audio_folder)
# Bare expression so the notebook cell displays the resulting mapping.
files_per_locale


# ASR


## Mozilla DeepSpeech


In [None]:
import deepspeech as ds
import librosa as lr


class DeepSpeechASR:
    """Speech-to-text wrapper around a Mozilla DeepSpeech model.

    The acoustic model is read from ``<folder>/<model_name>.pbmm``. An external
    scorer is enabled only when ``<folder>/<scorer_name>.scorer`` exists.
    """

    model: ds.Model

    def __init__(self, model_name: str, folder: Path, scorer_name: Optional[str] = None):
        """Load the model and, when available, its external scorer.

        Args:
            model_name: File stem of the ``.pbmm`` model inside *folder*.
            folder: Directory holding the model and scorer files.
            scorer_name: File stem of the ``.scorer`` file; when ``None`` or
                empty, *model_name* is used instead.
        """
        model_path = folder / f"{model_name}.pbmm"
        self.model = ds.Model(str(model_path))
        scorer_path = folder / f"{scorer_name or model_name}.scorer"
        # Best effort: a missing scorer file is not an error, the model is
        # simply used without an external scorer.
        if scorer_path.is_file():
            self.model.enableExternalScorer(str(scorer_path))

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at *filepath* and return the recognised text.

        The audio is loaded at the sample rate reported by the model and
        converted from floating point samples to 16-bit integers before being
        passed to ``Model.stt``.
        """
        samples, _ = lr.load(str(filepath), sr=self.model.sampleRate())
        # Clamp before scaling: samples outside [-1, 1] would overflow the
        # int16 range in the cast below and corrupt the signal.
        samples = np.clip(samples, -1.0, 1.0)
        pcm = (samples * 32767).astype(np.int16)
        return self.model.stt(pcm)


# One DeepSpeech recogniser per locale, all loaded from the 'deepspeech'
# sub-folder of model_folder. Only the Spanish one names its scorer explicitly;
# the others look for a scorer named after the model.
asr_deepspeech_en = DeepSpeechASR(model_name="deepspeech-0.9.3-models", folder=model_folder / 'deepspeech')
asr_deepspeech_it = DeepSpeechASR(model_name="output_graph_it", folder=model_folder / 'deepspeech')
asr_deepspeech_es = DeepSpeechASR(model_name="output_graph_es", scorer_name="kenlm_es", folder=model_folder / 'deepspeech')

# Quick check: transcribe the first collected file of each locale and let the
# notebook cell display the (locale, transcript) pairs.
(
    ("en", asr_deepspeech_en.transcribe(filepath=files_per_locale["en"][0])),
    ("it", asr_deepspeech_it.transcribe(filepath=files_per_locale["it"][0])),
    ("es", asr_deepspeech_es.transcribe(filepath=files_per_locale["es"][0]))
)


## SpeechBrain

In [None]:
from speechbrain.pretrained import EncoderDecoderASR

class SpeechBrainASR:
    """Speech-to-text wrapper around a SpeechBrain ``EncoderDecoderASR`` model."""

    def __init__(self, model_name: str, folder: Path):
        """Load the pretrained model ``speechbrain/<model_name>``.

        Args:
            model_name: Model name, appended to the ``speechbrain/`` prefix to
                form the ``source`` passed to ``from_hparams``.
            folder: Directory passed to ``from_hparams`` as ``savedir``.
        """
        self.model = EncoderDecoderASR.from_hparams(
            source=f"speechbrain/{model_name}",
            savedir=folder)

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at *filepath* and return the recognised text."""
        # The result must be returned explicitly; without ``return`` the
        # method would always yield None despite its ``-> str`` annotation.
        return self.model.transcribe_file(str(filepath))

# English SpeechBrain recogniser, using the 'speechbrain' sub-folder of
# model_folder as its save directory.
asr_speechbrain_en = SpeechBrainASR(model_name="asr-wav2vec2-commonvoice-en", folder=model_folder / 'speechbrain')
# Quick check on the first collected English file.
asr_speechbrain_en.transcribe(filepath=files_per_locale["en"][0])


## Facebook Wav2Vec

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import soundfile as sf

class FacebookWave2VecASR:
    """Speech-to-text wrapper around a Wav2Vec2 CTC model from ``transformers``."""

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """Load the processor and the CTC model for *model_name*.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``;
                defaults to the checkpoint this class originally hard-coded.
        """
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at *filepath* and return the recognised text.

        The sampling rate read from the file is forwarded to the processor
        unchanged; no resampling is done here.
        NOTE(review): the processor presumably expects the rate the checkpoint
        was trained with - confirm the input files match it.
        """
        # Read by path so no file handle stays open while the model runs.
        data, sr = sf.read(str(filepath))
        # Multi-channel files are read as a (frames, channels) array; average
        # the channels so the processor always receives a 1-D signal.
        if data.ndim > 1:
            data = data.mean(axis=1)
        inputs = self.processor([data], sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(**inputs).logits
        ids = torch.argmax(logits, dim=-1)
        # batch_decode yields one transcript per batch item; the batch holds a
        # single file, so return that one string as the annotation promises.
        return self.processor.batch_decode(ids)[0]

# Wav2Vec2 recogniser; quick check on the first collected English file.
asr_fbw2v_en = FacebookWave2VecASR()
asr_fbw2v_en.transcribe(filepath=files_per_locale["en"][0])

# Analysis
