## Settings


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

# Locations of the pretrained models and of the exercise audio files.
# Both can be overridden through environment variables so the notebook can run
# on another machine; the defaults are the original author's local paths.
model_folder = Path(os.environ.get("ASR_MODEL_FOLDER", "/Users/hudsonmendes/Models/pretrained"))
data_folder = Path(
    os.environ.get(
        "ASR_DATA_FOLDER",
        "/Users/hudsonmendes/Workspaces/hudsonmendes-estudos/cm3065-isp/exercise-04/files",
    )
)
# CSV with the reference transcriptions, stored next to the audio files.
transcriptions_filepath = data_folder / "Ex4_audio_files_transcriptions.csv"


## Dependencies


In [3]:
from typing import Dict, List, Tuple

import re
import numpy as np
import pandas as pd
from tqdm import tqdm


# Data


In [None]:
import unicodedata


def normalise_text(text: str) -> str:
    """Reduce a transcription to lowercase ASCII letters/digits separated by single spaces.

    Steps:
      1. Strip diacritics (NFD decomposition, then drop combining marks).
      2. Turn every whitespace run (tabs, newlines, ...) into a single space,
         so that words separated by non-space whitespace are not glued together.
      3. Remove every character that is not an ASCII letter, digit or space.
      4. Collapse the spaces left behind by removed punctuation, lowercase, trim.

    Args:
        text: the raw transcription.

    Returns:
        The normalised transcription (possibly an empty string).
    """
    decomposed = unicodedata.normalize("NFD", text)
    text = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
    # Whitespace must be normalised BEFORE the character filter, otherwise
    # tabs/newlines are deleted and the neighbouring words are merged.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^A-Za-z0-9 ]+", "", text)
    # Removing free-standing punctuation (e.g. "a - b") can leave double spaces.
    text = re.sub(r" +", " ", text)
    return text.lower().strip()


In [None]:
# Read the reference transcriptions and normalise the ground-truth column in
# a single chained expression, then display the resulting frame.
test_df = pd.read_csv(transcriptions_filepath, header=0).assign(
    y_true=lambda frame: frame["y_true"].map(normalise_text)
)
test_df


# ASR


In [None]:
from abc import ABC, abstractclassmethod, abstractmethod


class AbstractASR(ABC):
    """Common interface for the speech-recognition engines used in this notebook.

    Subclasses must implement ``transcribe``; because the class derives from
    ``ABC``, instantiating it (or a subclass that does not override
    ``transcribe``) raises ``TypeError``.
    """

    @abstractmethod
    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at ``filepath`` and return the recognised text.

        Raises:
            NotImplementedError: if a subclass delegates to this base implementation.
        """
        raise NotImplementedError("You must implement the transcribe() method")


## Mozilla DeepSpeech


In [None]:
import deepspeech as ds
import librosa as lr


class DeepSpeechASR(AbstractASR):
    """ASR engine backed by a Mozilla DeepSpeech model stored on disk.

    The acoustic model is read from ``<folder>/<model_name>.pbmm``. An external
    scorer is enabled only when ``<folder>/<scorer_name>.scorer`` exists;
    otherwise the model is used without one.
    """

    model: ds.Model

    def __init__(self, model_name: str, folder: Path, scorer_name: str = None):
        """Load the model and, when available, its external scorer.

        Args:
            model_name: file stem of the ``.pbmm`` model inside ``folder``.
            folder: directory holding the model (and scorer) files.
            scorer_name: file stem of the ``.scorer`` file; defaults to ``model_name``.
        """
        super().__init__()
        model_path = folder / f"{model_name}.pbmm"
        self.model = ds.Model(str(model_path))
        if not scorer_name:
            scorer_name = model_name
        scorer_path = folder / f"{scorer_name}.scorer"
        # A missing scorer file is tolerated: the model simply runs without it.
        if scorer_path.is_file():
            self.model.enableExternalScorer(str(scorer_path))

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at ``filepath`` with the loaded model.

        The audio is loaded at the model's sample rate, scaled to the 16-bit
        integer range and passed to ``Model.stt``.
        """
        audio, _ = lr.load(filepath, sr=self.model.sampleRate())
        # Clip before scaling: samples outside [-1, 1] (presumably possible
        # after resampling - TODO confirm) would otherwise overflow the int16 cast.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        return self.model.stt(pcm)


# One DeepSpeech engine per locale, all loaded from the same model sub-folder.
asr_deepspeech_en = DeepSpeechASR(folder=model_folder / "deepspeech", model_name="deepspeech-0.9.3-models")
asr_deepspeech_it = DeepSpeechASR(folder=model_folder / "deepspeech", model_name="output_graph_it")
# The Spanish scorer file is named differently from its model file.
asr_deepspeech_es = DeepSpeechASR(
    folder=model_folder / "deepspeech",
    model_name="output_graph_es",
    scorer_name="kenlm_es",
)

# Smoke test: transcribe the first file of each locale and show (locale, text) pairs.
tuple(
    (locale, engine.transcribe(filepath=data_folder / test_df.loc[test_df.locale == locale, "filename"].values[0]))
    for locale, engine in (
        ("en", asr_deepspeech_en),
        ("it", asr_deepspeech_it),
        ("es", asr_deepspeech_es),
    )
)


## SpeechBrain


In [None]:
from speechbrain.pretrained import EncoderDecoderASR


class SpeechBrainASR(AbstractASR):
    """ASR engine backed by a pretrained SpeechBrain encoder-decoder model.

    Derives from ``AbstractASR`` so it satisfies the same interface (and type
    hints) as the other engines in this notebook.
    """

    def __init__(self, model_name: str, folder: Path):
        """Load ``speechbrain/<model_name>`` using ``folder`` as the save directory.

        Args:
            model_name: model identifier appended to the ``speechbrain/`` source prefix.
            folder: directory passed to ``from_hparams`` as ``savedir``.
        """
        super().__init__()
        self.model = EncoderDecoderASR.from_hparams(source=f"speechbrain/{model_name}", savedir=folder)

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at ``filepath`` and return the recognised text."""
        return self.model.transcribe_file(str(filepath))


# English SpeechBrain engine, followed by a smoke test on the first English file.
asr_speechbrain_en = SpeechBrainASR(
    folder=model_folder / "speechbrain",
    model_name="asr-wav2vec2-commonvoice-en",
)
asr_speechbrain_en.transcribe(
    filepath=data_folder / test_df.loc[test_df.locale == "en", "filename"].values[0]
)


## Facebook Wav2Vec


In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import soundfile as sf


class FacebookWave2VecASR(AbstractASR):
    """ASR engine backed by a Hugging Face Wav2Vec2 CTC checkpoint.

    Derives from ``AbstractASR`` so it satisfies the same interface (and type
    hints) as the other engines in this notebook.
    """

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """Load the processor and the CTC model for ``model_name``.

        Args:
            model_name: checkpoint identifier passed to ``from_pretrained``;
                defaults to the checkpoint originally hard-coded here.
        """
        super().__init__()
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at ``filepath`` by greedy (argmax) decoding.

        NOTE(review): the file's own sampling rate is forwarded to the
        processor and the samples are passed through unchanged - confirm the
        audio files match what the checkpoint expects (rate, mono).
        """
        # Read the samples first so the file handle is closed before inference.
        with filepath.open("rb") as fh:
            data, sr = sf.read(fh)
        inputs = self.processor([data], sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(**inputs).logits
        ids = torch.argmax(logits, dim=-1)
        return self.processor.batch_decode(ids)[0]


# Wav2Vec2 engine with its default checkpoint, followed by a smoke test on the
# first English file.
asr_fbwav2vec_en = FacebookWave2VecASR()
asr_fbwav2vec_en.transcribe(
    filepath=data_folder / test_df.loc[test_df.locale == "en", "filename"].values[0]
)


# Transcription


## Deep Speech


In [None]:
def analyse(df: pd.DataFrame, folder: Path, asr_engine: AbstractASR) -> pd.DataFrame:
    """Transcribe every file listed in ``df`` with a single ASR engine.

    Args:
        df: frame with a ``filename`` column (file names relative to ``folder``).
        folder: directory containing the audio files.
        asr_engine: engine whose ``transcribe`` method is called for each file.

    Returns:
        A copy of ``df`` with an extra ``y_pred`` column holding the normalised
        transcription of each file. The input frame is left untouched.
    """
    df = df.copy()
    # ``Series.items`` replaces ``iteritems``, which pandas 2.0 removed.
    for ix, filename in df.filename.items():
        y_pred = asr_engine.transcribe(filepath=folder / filename)
        # Normalise predictions the same way as ``y_true`` (and as ``compare``
        # does), so case/accents/punctuation are not scored as word errors.
        df.at[ix, "y_pred"] = normalise_text(y_pred)
    return df


In [None]:
# DeepSpeech transcriptions for the English subset.
results_deepspeech_en_df = analyse(
    df=test_df.loc[test_df.locale == "en"],
    folder=data_folder,
    asr_engine=asr_deepspeech_en,
)
results_deepspeech_en_df


In [None]:
# DeepSpeech transcriptions for the Italian subset.
results_deepspeech_it_df = analyse(
    df=test_df.loc[test_df.locale == "it"],
    folder=data_folder,
    asr_engine=asr_deepspeech_it,
)
results_deepspeech_it_df


In [None]:
# DeepSpeech transcriptions for the Spanish subset.
results_deepspeech_es_df = analyse(
    df=test_df.loc[test_df.locale == "es"],
    folder=data_folder,
    asr_engine=asr_deepspeech_es,
)
results_deepspeech_es_df


## Comparative (DeepSpeech, SpeechBrain, FBWav2Vec)


In [None]:
def compare(df: pd.DataFrame, asr_engines: Dict[str, AbstractASR], folder: Path = None) -> pd.DataFrame:
    """Transcribe every file listed in ``df`` with several ASR engines.

    Args:
        df: frame with a ``filename`` column (file names relative to ``folder``).
        asr_engines: mapping from a short engine name to the engine instance.
        folder: directory containing the audio files; when omitted, the
            notebook-level ``data_folder`` is used (the original behaviour).

    Returns:
        A copy of ``df`` with one ``y_pred_<name>`` column per engine holding
        the normalised transcription. The input frame is left untouched.
    """
    if folder is None:
        folder = data_folder
    df = df.copy()
    # ``Series.items`` replaces ``iteritems``, which pandas 2.0 removed.
    for ix, filename in tqdm(df.filename.items(), total=len(df.filename)):
        for asr_name, asr_engine in asr_engines.items():
            y_pred = asr_engine.transcribe(filepath=folder / filename)
            df.at[ix, f"y_pred_{asr_name}"] = normalise_text(y_pred)
    return df


In [None]:
# Run the three English-capable engines over the English subset.
comparative_df = compare(
    asr_engines=dict(
        deepspeech=asr_deepspeech_en,
        speechbrain=asr_speechbrain_en,
        fbwav2vec=asr_fbwav2vec_en,
    ),
    df=test_df.loc[test_df.locale == "en"],
)
comparative_df


# Evaluation (WER, or "Word Error Rate")

Calculated using the following formula:

$WER = \frac{S + D + I}{N} \times 100$

where:
* $S$ is the number of substitutions
* $D$ is the number of deletions
* $I$ is the number of insertions
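* $N$ is the number of words in the reference transcription (`y_true`)

Note: the `Evaluator` below returns the ratio $\frac{S + D + I}{N}$ as a fraction; multiply it by 100 to read it as a percentage.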


In [None]:
class Evaluator:
    """Computes the Word Error Rate (WER) of predictions against references.

    The wrapped frame must provide a ``y_true`` column and a prediction column
    named ``y_pred`` or ``y_pred_<asr_name>``.
    """

    df: pd.DataFrame

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def evaluate(self, asr_name: str = None) -> float:
        """Return the corpus-level WER as a fraction: ``(S + D + I) / N``.

        Args:
            asr_name: when given, predictions are read from ``y_pred_<asr_name>``;
                otherwise from ``y_pred``.

        Returns:
            The word error rate as a fraction (multiply by 100 for a percentage).

        Raises:
            ZeroDivisionError: if the reference transcriptions contain no words.
        """
        y_pred_column = "y_pred"
        if asr_name:
            y_pred_column = f"{y_pred_column}_{asr_name}"
        s, d, i, n = Evaluator._calculate_sdi(self.df.y_true, self.df[y_pred_column])
        if n == 0:
            raise ZeroDivisionError("Cannot compute WER: the reference transcriptions contain no words")
        return (s + d + i) / n

    @staticmethod
    def _calculate_sdi(y_true_series: pd.Series, y_pred_series: pd.Series) -> Tuple[int, int, int, int]:
        """Accumulate substitutions, deletions, insertions and reference word count.

        Each (reference, prediction) pair is aligned with a word-level
        minimum-edit-distance alignment, so one wrong word counts as one
        substitution rather than as a substitution, a deletion and an insertion.

        Returns:
            ``(substitutions, deletions, insertions, n_reference_words)``.
        """
        substitutions, deletions, insertions, n_words = 0, 0, 0, 0
        for y_true, y_pred in zip(y_true_series, y_pred_series):
            reference = Evaluator._tokenise(y_true)
            hypothesis = Evaluator._tokenise(y_pred)
            s, d, i = Evaluator._align(reference, hypothesis)
            substitutions += s
            deletions += d
            insertions += i
            n_words += len(reference)
        return substitutions, deletions, insertions, n_words

    @staticmethod
    def _align(reference: List[str], hypothesis: List[str]) -> Tuple[int, int, int]:
        """Count the (S, D, I) operations of a minimum-cost word alignment."""
        rows, cols = len(reference) + 1, len(hypothesis) + 1
        # cost[r][c] = edit distance between reference[:r] and hypothesis[:c]
        cost = [[0] * cols for _ in range(rows)]
        for r in range(rows):
            cost[r][0] = r
        for c in range(cols):
            cost[0][c] = c
        for r in range(1, rows):
            for c in range(1, cols):
                if reference[r - 1] == hypothesis[c - 1]:
                    cost[r][c] = cost[r - 1][c - 1]
                else:
                    cost[r][c] = 1 + min(cost[r - 1][c - 1], cost[r - 1][c], cost[r][c - 1])

        # Walk back from the bottom-right corner, classifying each step.
        s = d = i = 0
        r, c = len(reference), len(hypothesis)
        while r > 0 or c > 0:
            if r > 0 and c > 0 and reference[r - 1] == hypothesis[c - 1]:
                r, c = r - 1, c - 1
            elif r > 0 and c > 0 and cost[r][c] == cost[r - 1][c - 1] + 1:
                s += 1
                r, c = r - 1, c - 1
            elif r > 0 and cost[r][c] == cost[r - 1][c] + 1:
                d += 1
                r -= 1
            else:
                i += 1
                c -= 1
        return s, d, i

    @staticmethod
    def _tokenise(s: str) -> List[str]:
        """Split a transcription into words.

        Empty strings and non-string values (e.g. a missing prediction stored
        as NaN) yield an empty list rather than ``[""]`` or an error.
        """
        if not isinstance(s, str):
            return []
        return s.split()


## Deep Speech


In [None]:
# WER of DeepSpeech on the English subset.
Evaluator(df=results_deepspeech_en_df).evaluate(asr_name=None)

In [None]:
# WER of DeepSpeech on the Italian subset.
Evaluator(df=results_deepspeech_it_df).evaluate(asr_name=None)

In [None]:
# WER of DeepSpeech on the Spanish subset.
Evaluator(df=results_deepspeech_es_df).evaluate(asr_name=None)

## Comparative (DeepSpeech, SpeechBrain, FBWav2Vec)

In [None]:
# WER of DeepSpeech in the English comparison.
Evaluator(df=comparative_df).evaluate("deepspeech")

In [None]:
# WER of SpeechBrain in the English comparison.
Evaluator(df=comparative_df).evaluate("speechbrain")

In [None]:
# WER of Facebook Wav2Vec2 in the English comparison.
Evaluator(df=comparative_df).evaluate("fbwav2vec")

# Conclusions

