## Settings


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

# Machine-specific locations. The defaults are the paths this notebook was authored with;
# set the ASR_MODEL_FOLDER / ASR_DATA_FOLDER environment variables to run it on another
# machine without editing this cell.
model_folder = Path(os.environ.get("ASR_MODEL_FOLDER", "/Users/hudsonmendes/Models/pretrained"))
data_folder = Path(
    os.environ.get(
        "ASR_DATA_FOLDER",
        "/Users/hudsonmendes/Workspaces/hudsonmendes-estudos/cm3065-isp/exercise-04/files",
    )
)
# CSV with one row per audio clip (locale, filename, reference transcription).
transcriptions_filepath = data_folder / "Ex4_audio_files_transcriptions.csv"


## Dependencies


In [3]:
import re
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm


# Data


In [4]:
import unicodedata


def normalise_text(text: str) -> str:
    """Reduce a transcription to lower-case ASCII letters, digits and single spaces.

    Accents are stripped, every character other than ``A-Z``, ``a-z``, ``0-9`` and the
    space is removed (punctuation, tabs, newlines), runs of spaces are collapsed, and the
    result is lower-cased and trimmed.

    Args:
        text: the raw transcription.

    Returns:
        The normalised transcription.
    """
    # NFD splits an accented letter into its base letter plus a combining mark
    # (category "Mn"); dropping the marks leaves the unaccented letter.
    decomposed = unicodedata.normalize("NFD", text)
    text = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
    # Raw strings: "\ " and "\s" are invalid escape sequences in ordinary literals.
    text = re.sub(r"[^A-Za-z0-9 ]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()


In [5]:
# Load the reference transcriptions and bring the "y_true" column to its normalised form.
test_df = pd.read_csv(transcriptions_filepath, header=0).assign(
    y_true=lambda frame: frame["y_true"].map(normalise_text)
)
test_df


Unnamed: 0,locale,filename,y_true
0,en,Ex4_audio_files/EN/checkin_norm.wav,where is the checkin desk
1,en,Ex4_audio_files/EN/parents_norm.wav,i have lost my parents
2,en,Ex4_audio_files/EN/suitcase_norm.wav,please i have lost my suitcase
3,en,Ex4_audio_files/EN/what_time_norm.wav,what time is my plane
4,en,Ex4_audio_files/EN/where_norm.wav,where are the restaurants and shops
5,en,Ex4_audio_files/EN/your_sentence1.wav,i am in the library
6,en,Ex4_audio_files/EN/your_sentence2.wav,he is studying for the exam
7,it,Ex4_audio_files/IT/checkin_it_norm.wav,dove e il bancone
8,it,Ex4_audio_files/IT/parents_it_norm.wav,ho perso i miei genitori
9,it,Ex4_audio_files/IT/suitcase_it_norm.wav,per favore ho perso la mia valigia


# ASR


In [6]:
from abc import ABC, abstractclassmethod


class AbstractASR(ABC):
    """Common interface of the speech-to-text engines compared in this notebook.

    Deriving from ``ABC`` makes the contract enforceable: the class itself, and any
    subclass that does not override ``transcribe``, cannot be instantiated.
    """

    @abstractmethod
    def transcribe(self, filepath: Path) -> str:
        """Transcribe one audio file.

        Args:
            filepath: location of the audio file.

        Returns:
            The recognised text.

        Raises:
            NotImplementedError: if a subclass delegates to this base implementation.
        """
        raise NotImplementedError("You must implement the transcribe() method")


## Mozilla DeepSpeech


In [7]:
import deepspeech as ds
import librosa as lr


class DeepSpeechASR(AbstractASR):
    """Speech-to-text engine backed by a Mozilla DeepSpeech model."""

    model: ds.Model

    def __init__(self, model_name: str, folder: Path, scorer_name: Optional[str] = None):
        """Load ``<folder>/<model_name>.pbmm`` and, when present, its external scorer.

        Args:
            model_name: file name (without extension) of the ``.pbmm`` model.
            folder: directory holding the model and scorer files.
            scorer_name: file name (without extension) of the ``.scorer`` file;
                defaults to ``model_name``.
        """
        super().__init__()
        model_path = folder / f"{model_name}.pbmm"
        self.model = ds.Model(str(model_path))
        scorer_path = folder / f"{scorer_name or model_name}.scorer"
        # The scorer is optional: when the file does not exist the model is used
        # without one and no error is raised.
        if scorer_path.is_file():
            self.model.enableExternalScorer(str(scorer_path))

    def transcribe(self, filepath: Path) -> str:
        """Transcribe one audio file.

        Args:
            filepath: location of the audio file.

        Returns:
            The text returned by the model's ``stt`` call.
        """
        # Load the clip resampled to the sample rate reported by the model.
        samples = lr.load(filepath, sr=self.model.sampleRate())[0]
        # Clip before scaling: a sample outside [-1, 1] would otherwise exceed the
        # int16 range and wrap around to the opposite sign.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        return self.model.stt(pcm)


# Every DeepSpeech model and scorer file is read from the same sub-folder.
deepspeech_folder = model_folder / "deepspeech"

asr_deepspeech_en = DeepSpeechASR(folder=deepspeech_folder, model_name="deepspeech-0.9.3-models")
asr_deepspeech_it = DeepSpeechASR(folder=deepspeech_folder, model_name="output_graph_it")
# The Spanish scorer file is not named after the model, so it is passed explicitly.
asr_deepspeech_es = DeepSpeechASR(folder=deepspeech_folder, model_name="output_graph_es", scorer_name="kenlm_es")

# Smoke test: transcribe the first clip of each locale with the matching engine.
tuple(
    (locale, engine.transcribe(filepath=data_folder / test_df[test_df.locale == locale].filename.values[0]))
    for locale, engine in (("en", asr_deepspeech_en), ("it", asr_deepspeech_it), ("es", asr_deepspeech_es))
)


TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
2022-06-26 17:44:21.969274: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858


(('en', 'where is the checking desk'),
 ('it', 'dove e il vancone'),
 ('es', 'adande estan los mostradores'))

## SpeechBrain


In [8]:
from speechbrain.pretrained import EncoderDecoderASR


class SpeechBrainASR(AbstractASR):
    """Speech-to-text engine backed by a pretrained SpeechBrain encoder-decoder model."""

    def __init__(self, model_name: str, folder: Path):
        """Load the pretrained model ``speechbrain/<model_name>``.

        Args:
            model_name: model identifier, appended to the ``speechbrain/`` source prefix.
            folder: directory passed to SpeechBrain as ``savedir``.
        """
        super().__init__()
        self.model = EncoderDecoderASR.from_hparams(source=f"speechbrain/{model_name}", savedir=folder)

    def transcribe(self, filepath: Path) -> str:
        """Transcribe one audio file.

        Args:
            filepath: location of the audio file.

        Returns:
            The text returned by the model's ``transcribe_file`` call.
        """
        return self.model.transcribe_file(str(filepath))


asr_speechbrain_en = SpeechBrainASR(model_name="asr-wav2vec2-commonvoice-en", folder=model_folder / "speechbrain")
asr_speechbrain_en.transcribe(filepath=data_folder / test_df[test_df.locale == "en"].filename.values[0])


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/wav2vec2-large-lv60 were not used when initializing Wav2Vec2Model: ['quantizer.weight_proj.bias', 'project_hid.weight', 'project_hid.bias', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'project_q.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 is frozen.


'WHERE IS THE CHECK IN DESK'

## Facebook Wav2Vec


In [9]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import soundfile as sf


class FacebookWave2VecASR(AbstractASR):
    """Speech-to-text engine backed by a Wav2Vec2 CTC checkpoint."""

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """Load the processor and the CTC model from the same checkpoint.

        Args:
            model_name: checkpoint identifier given to ``from_pretrained``.
        """
        super().__init__()
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)

    def transcribe(self, filepath: Path) -> str:
        """Transcribe one audio file.

        Args:
            filepath: location of the audio file.

        Returns:
            The decoded text of the first (only) item in the batch.
        """
        # Only reading needs the file handle; release it before running inference.
        with filepath.open("rb") as fh:
            data, sr = sf.read(fh)
        # NOTE(review): the clip is passed at its own sample rate, with no resampling —
        # presumably the clips already match what the checkpoint expects; confirm.
        inputs = self.processor(data, sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Greedy decoding: keep the highest-scoring token at every position.
        ids = torch.argmax(logits, dim=-1)
        return self.processor.batch_decode(ids)[0]


asr_fbwav2vec_en = FacebookWave2VecASR()
asr_fbwav2vec_en.transcribe(filepath=data_folder / test_df[test_df.locale == "en"].filename.values[0])


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'WHERE IS THE CHECKEN DESK'

# Transcription


## Deep Speech


In [10]:
def analyse(df: pd.DataFrame, folder: Path, asr_engine: "AbstractASR") -> pd.DataFrame:
    """Transcribe every clip listed in ``df`` with a single ASR engine.

    Args:
        df: frame with a ``filename`` column holding paths relative to ``folder``.
        folder: directory the file names are resolved against.
        asr_engine: engine whose ``transcribe(filepath=...)`` is called once per row.

    Returns:
        A copy of ``df`` with a ``y_pred`` column holding the raw engine output.
        The input frame is left untouched.
    """
    # Iterating the column directly avoids Series.iteritems(), which pandas 2.0 removed,
    # and produces the "y_pred" column even when the frame has no rows.
    predictions = [asr_engine.transcribe(filepath=folder / filename) for filename in df.filename]
    return df.assign(y_pred=predictions)


In [11]:
results_deepspeech_en_df = analyse(test_df[test_df.locale == "en"], data_folder, asr_deepspeech_en)
results_deepspeech_en_df


Unnamed: 0,locale,filename,y_true,y_pred
0,en,Ex4_audio_files/EN/checkin_norm.wav,where is the checkin desk,where is the checking desk
1,en,Ex4_audio_files/EN/parents_norm.wav,i have lost my parents,i had lost my parents
2,en,Ex4_audio_files/EN/suitcase_norm.wav,please i have lost my suitcase,please i have lost my suitcase
3,en,Ex4_audio_files/EN/what_time_norm.wav,what time is my plane,what time is my plain
4,en,Ex4_audio_files/EN/where_norm.wav,where are the restaurants and shops,where are the restaurants and shops
5,en,Ex4_audio_files/EN/your_sentence1.wav,i am in the library,in the library
6,en,Ex4_audio_files/EN/your_sentence2.wav,he is studying for the exam,he is starting for the


In [12]:
results_deepspeech_it_df = analyse(test_df[test_df.locale == "it"], data_folder, asr_deepspeech_it)
results_deepspeech_it_df


Unnamed: 0,locale,filename,y_true,y_pred
7,it,Ex4_audio_files/IT/checkin_it_norm.wav,dove e il bancone,dove e il vancone
8,it,Ex4_audio_files/IT/parents_it_norm.wav,ho perso i miei genitori,operso i miei genitori
9,it,Ex4_audio_files/IT/suitcase_it_norm.wav,per favore ho perso la mia valigia,per favore o perso la mia valigia
10,it,Ex4_audio_files/IT/what_time_it_norm.wav,a che ora e il mio aereo,aceora i nio ereo
11,it,Ex4_audio_files/IT/where_it_norm.wav,dove sono i ristoranti e i negozi,dove sone ristoranti e inedofzi


In [13]:
results_deepspeech_es_df = analyse(test_df[test_df.locale == "es"], data_folder, asr_deepspeech_es)
results_deepspeech_es_df


Unnamed: 0,locale,filename,y_true,y_pred
12,es,Ex4_audio_files/ES/checkin_es_norm.wav,donde estan los mostradores,adande estan los mostradores
13,es,Ex4_audio_files/ES/parents_es_norm.wav,he perdido a mis padres,he perdido a mis padres
14,es,Ex4_audio_files/ES/suitcase_es_norm.wav,por favor he perdido mi maleta,por favor he perdido mi maleta
15,es,Ex4_audio_files/ES/what_time_es_norm.wav,a que hora es mi avion,mora es miedo
16,es,Ex4_audio_files/ES/where_es_norm.wav,donde estan los restaurantes y las tiendas,adande estan los restaurantes en las tiendas


## Comparative (DeepSpeech, SpeechBrain, FBWav2Vec)


In [14]:
def compare(
    df: pd.DataFrame,
    asr_engines: Dict[str, "AbstractASR"],
    folder: Optional[Path] = None,
) -> pd.DataFrame:
    """Transcribe every clip listed in ``df`` with several ASR engines.

    Args:
        df: frame with a ``filename`` column holding paths relative to ``folder``.
        asr_engines: mapping from a short engine name to the engine itself.
        folder: directory the file names are resolved against; defaults to the
            notebook-level ``data_folder``.

    Returns:
        A copy of ``df`` with one ``y_pred_<name>`` column per engine, holding the
        normalised transcription. The input frame is left untouched.
    """
    if folder is None:
        folder = data_folder
    predictions = {f"y_pred_{asr_name}": [] for asr_name in asr_engines}
    # Iterating the column directly avoids Series.iteritems(), which pandas 2.0 removed.
    for filename in tqdm(df.filename, total=len(df)):
        for asr_name, asr_engine in asr_engines.items():
            y_pred = asr_engine.transcribe(filepath=folder / filename)
            predictions[f"y_pred_{asr_name}"].append(normalise_text(y_pred))
    return df.assign(**predictions)


In [15]:
comparative_df = compare(
    df=test_df[test_df.locale == "en"],
    asr_engines={
        "deepspeech": asr_deepspeech_en,
        "speechbrain": asr_speechbrain_en,
        "fbwav2vec": asr_fbwav2vec_en,
    },
)
comparative_df


100%|██████████| 7/7 [00:30<00:00,  4.38s/it]


Unnamed: 0,locale,filename,y_true,y_pred_deepspeech,y_pred_speechbrain,y_pred_fbwav2vec
0,en,Ex4_audio_files/EN/checkin_norm.wav,where is the checkin desk,where is the checking desk,where is the check in desk,where is the checken desk
1,en,Ex4_audio_files/EN/parents_norm.wav,i have lost my parents,i had lost my parents,i have lost my parents,i have lost my parenis
2,en,Ex4_audio_files/EN/suitcase_norm.wav,please i have lost my suitcase,please i have lost my suitcase,please i have lost my suitcase,please owi have lost my siccesse
3,en,Ex4_audio_files/EN/what_time_norm.wav,what time is my plane,what time is my plain,what time is my plane,what time is my playing
4,en,Ex4_audio_files/EN/where_norm.wav,where are the restaurants and shops,where are the restaurants and shops,where are the restaurants and shops,where are the restaurats and shops
5,en,Ex4_audio_files/EN/your_sentence1.wav,i am in the library,in the library,i am in the library,i am in the library
6,en,Ex4_audio_files/EN/your_sentence2.wav,he is studying for the exam,he is starting for the,he is studying for the exam,hereis ha studying for theixam


# Evaluation (WER, or "Word Error Rate")

Calculated using the following formula:

$WER = \frac{S + D+ I}{N} \times 100$

where:
* $S$ is the number of substitutions
* $D$ is the number of deletions
* $I$ is the number of insertions
* $N$ is the number of words in your "y_true"

For the purpose of this project, the [jiwer library](https://pypi.org/project/jiwer/) will be used<br />
and we are going to macro-average the **WER Score** across all data points.

In [16]:
import jiwer

class Evaluator:
    """Computes the macro-averaged Word Error Rate of a frame of transcriptions."""

    df: pd.DataFrame

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to the frame to evaluate.

        Args:
            df: frame with a ``y_true`` column and one or more prediction columns
                (``y_pred`` or ``y_pred_<asr_name>``).
        """
        self.df = df

    def evaluate(self, asr_name: Optional[str] = None) -> float:
        """Average the per-row WER of one prediction column.

        Args:
            asr_name: when given, the column ``y_pred_<asr_name>`` is scored;
                otherwise the column ``y_pred``.

        Returns:
            The mean of the per-row WER values multiplied by 100, or ``nan`` when
            the frame has no rows.
        """
        y_pred_column = f"y_pred_{asr_name}" if asr_name else "y_pred"
        # WER is computed per row and then averaged (macro-average), not pooled.
        wers = [jiwer.wer(y_true, y_pred) for y_true, y_pred in zip(self.df.y_true, self.df[y_pred_column])]
        if not wers:
            # No rows: the average is undefined; avoid dividing by zero.
            return float("nan")
        return 100.0 * sum(wers) / len(wers)


## Deep Speech


In [17]:
Evaluator(results_deepspeech_en_df).evaluate()

19.047619047619044

In [18]:
Evaluator(results_deepspeech_it_df).evaluate()

47.28571428571429

In [19]:
Evaluator(results_deepspeech_es_df).evaluate()

27.38095238095238

## Comparative (DeepSpeech, SpeechBrain, FBWav2Vec)

In [20]:
Evaluator(comparative_df).evaluate(asr_name='deepspeech')

19.047619047619044

In [21]:
Evaluator(comparative_df).evaluate(asr_name='speechbrain')

5.714285714285714

In [22]:
Evaluator(comparative_df).evaluate(asr_name='fbwav2vec')

25.238095238095237

# Conclusions

**DeepSpeech** has been evaluated in **English**, **Italian** & **Spanish** against the dataset<br />
of audio files provided with the course work. Additionally, for **English** we have also added<br />
two custom audioclips `your_sentence1.wav` and `your_sentence2.wav`.

**Note:** all audioclips were normalised prior to inference.

The resulting **Macro-averaged** WER (Word Error Rate) for each Language is:

| **locale** | **wer\_deepspeech** | **pass** |
|:----------:|--------------------:|:--------:|
| **en**     | `19.04%`            | yes      |
| **it**     | `47.28%`            | no       |
| **es**     | `27.38%`            | yes      |


Additionally, we have investigated alternative ASR systems for **English**, namely the<br />
**SpeechBrain** and **Facebook Wav2Vec**, and compared their results against **DeepSpeech**.<br />
Here are the results:

| **asr**           | **wer**  | **pass** |
|:-----------------:|---------:|:--------:|
| **deepspeech**    | `19.04%` | yes      |
| **speech\-brain** | ` 5.71%` | yes      |
| **fb\-wav2vec**   | `25.23%` | no       |

As a conclusion, **SpeechBrain** had the best performance in **English** with a `5.71%` WER Score.<br />
We also have acceptable results for **Spanish**. However, for **Italian**, even with the audio<br />
normalisation, the WER was above acceptable levels with `47.28%` WER Score.<br />
