## Settings


In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import os
from pathlib import Path

# Machine-specific locations. The defaults are the original hard-coded paths;
# set ASR_MODEL_FOLDER / ASR_DATA_FOLDER in the environment to run the notebook
# on another machine without editing this cell.
model_folder = Path(os.environ.get("ASR_MODEL_FOLDER", "/Users/hudsonmendes/Models/pretrained"))
data_folder = Path(os.environ.get("ASR_DATA_FOLDER", "/Users/hudsonmendes/Workspaces/hudsonmendes-estudos/cm3065-isp/exercise-04/files"))
# CSV listing each audio file (locale, filename) with its reference transcription.
transcriptions_filepath = data_folder / 'Ex4_audio_files_transcriptions.csv'


## Dependencies


In [20]:
import re
import numpy as np
import pandas as pd


# Data


In [25]:
import unicodedata

def normalise_text(text):
    """Reduce a transcription to lowercase ASCII letters, digits and single spaces.

    Steps:
      1. Strip diacritics (NFD decomposition, then drop combining marks).
      2. Convert every whitespace character (tab, newline, ...) to a space.
      3. Remove everything that is not an ASCII letter, digit or space.
      4. Collapse runs of spaces, lowercase, and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    # "è" decomposes into "e" + a combining grave accent (category "Mn"), which is dropped.
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    # Whitespace must become a plain space *before* the character filter runs;
    # otherwise tabs/newlines are deleted outright and the words around them merge.
    text = re.sub(r'\s', ' ', text)
    # Raw strings avoid the invalid "\ " / "\s" escape sequences of the non-raw form.
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    text = re.sub(r' +', ' ', text)
    return text.lower().strip()

In [34]:
# Read the transcription sheet (first row holds the column names) and replace
# the reference text with its normalised form in a single chained expression.
test_df = pd.read_csv(transcriptions_filepath, header=0).assign(
    y_true=lambda frame: frame["y_true"].map(normalise_text)
)
test_df

Unnamed: 0,locale,filename,y_true
0,en,Ex4_audio_files/EN/checkin.wav,where is the checkin desk
1,en,Ex4_audio_files/EN/parents.wav,i have lost my parents
2,en,Ex4_audio_files/EN/suitcase.wav,please i have lost my suitcase
3,en,Ex4_audio_files/EN/what_time.wav,what time is my plane
4,en,Ex4_audio_files/EN/where.wav,where are the restaurants and shops
5,it,Ex4_audio_files/IT/checkin_it.wav,dove e il bancone
6,it,Ex4_audio_files/IT/parents_it.wav,ho perso i miei genitori
7,it,Ex4_audio_files/IT/suitcase_it.wav,per favore ho perso la mia valigia
8,it,Ex4_audio_files/IT/what_time_it.wav,a che ora e il mio aereo
9,it,Ex4_audio_files/IT/where_it.wav,dove sono i ristoranti e i negozi


# ASR


In [35]:
# abstractclassmethod is no longer used below; the import is retained only so
# that any other cell still referencing the name keeps working.
from abc import ABC, abstractclassmethod, abstractmethod


class AbstractASR(ABC):
    """Common interface for the speech-recognition engines compared in this notebook.

    Deriving from ABC makes the abstract marker effective: a subclass that does
    not override transcribe() cannot be instantiated.
    """

    @abstractmethod
    def transcribe(self, filepath: Path) -> str:
        """Return the transcription of the audio file at `filepath`.

        Args:
            filepath: Location of the audio file to transcribe.

        Returns:
            The recognised text.

        Raises:
            NotImplementedError: If a subclass delegates to this base implementation.
        """
        raise NotImplementedError("You must implement the transcribe() method")

## Mozilla DeepSpeech


In [36]:
import deepspeech as ds
import librosa as lr


class DeepSpeechASR(AbstractASR):
    """Speech-to-text engine backed by a Mozilla DeepSpeech model.

    The acoustic model is read from `<folder>/<model_name>.pbmm`. An external
    scorer is enabled when `<folder>/<scorer_name>.scorer` exists; when no
    scorer name is given, the model name is tried instead.
    """
    model: ds.Model

    def __init__(self, model_name: str, folder: Path, scorer_name: str = None):
        """Load the model and, if its file is present, the external scorer.

        Args:
            model_name: File stem of the `.pbmm` model inside `folder`.
            folder: Directory holding the model and scorer files.
            scorer_name: File stem of the `.scorer` file; defaults to `model_name`.
        """
        super(DeepSpeechASR, self).__init__()
        model_path = folder / f"{model_name}.pbmm"
        self.model = ds.Model(str(model_path))
        # Remember whether the caller asked for a specific scorer: a missing
        # default scorer is fine, a missing requested one deserves a warning.
        scorer_requested = bool(scorer_name)
        if not scorer_requested:
            scorer_name = model_name
        scorer_path = folder / f"{scorer_name}.scorer"
        if scorer_path.is_file():
            self.model.enableExternalScorer(str(scorer_path))
        elif scorer_requested:
            import warnings
            warnings.warn(f"Scorer file not found, decoding without it: {scorer_path}")

    def transcribe(self, filepath: Path) -> str:
        """Transcribe one audio file.

        Args:
            filepath: Audio file to decode.

        Returns:
            The text produced by the model's `stt` call.
        """
        # Load at the sample rate the model reports; [0] is the sample array.
        samples = lr.load(filepath, sr=self.model.sampleRate())[0]
        # Clip before scaling so that samples outside [-1, 1] saturate instead
        # of overflowing when cast to int16.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        return self.model.stt(pcm)


# One DeepSpeech engine per language, all read from the shared "deepspeech"
# model folder. Only the Spanish model names a separate scorer file.
asr_deepspeech_en = DeepSpeechASR(
    folder=model_folder / 'deepspeech',
    model_name="deepspeech-0.9.3-models",
)
asr_deepspeech_it = DeepSpeechASR(
    folder=model_folder / 'deepspeech',
    model_name="output_graph_it",
)
asr_deepspeech_es = DeepSpeechASR(
    folder=model_folder / 'deepspeech',
    model_name="output_graph_es",
    scorer_name="kenlm_es",
)

# Smoke test: transcribe the first listed file of each locale with the
# matching engine and show the (locale, transcription) pairs.
tuple(
    (locale, engine.transcribe(filepath=data_folder / test_df.loc[test_df.locale == locale, "filename"].values[0]))
    for locale, engine in (
        ("en", asr_deepspeech_en),
        ("it", asr_deepspeech_it),
        ("es", asr_deepspeech_es),
    )
)


TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858


(('en', 'where is the checking desk'),
 ('it', 'dove er il bancone'),
 ('es', 'adande estan los mostradores'))

## SpeechBrain


In [37]:
from speechbrain.pretrained import EncoderDecoderASR

class SpeechBrainASR:
    """Speech-to-text engine wrapping a SpeechBrain `EncoderDecoderASR` model.

    Exposes the same `transcribe(filepath) -> str` method as the other engines
    in this notebook.
    """

    def __init__(self, model_name: str, folder: Path):
        """Fetch/load the pretrained model.

        Args:
            model_name: Model identifier, looked up under the "speechbrain/" source prefix.
            folder: Directory passed as `savedir` to `from_hparams`.
        """
        super().__init__()
        hub_source = f"speechbrain/{model_name}"
        self.model = EncoderDecoderASR.from_hparams(source=hub_source, savedir=folder)

    def transcribe(self, filepath: Path) -> str:
        """Return the model's transcription of the audio file at `filepath`."""
        audio_path = str(filepath)
        return self.model.transcribe_file(audio_path)

# English SpeechBrain engine, tried on the first English file of the test set.
asr_speechbrain_en = SpeechBrainASR(
    folder=model_folder / 'speechbrain',
    model_name="asr-wav2vec2-commonvoice-en",
)
asr_speechbrain_en.transcribe(
    filepath=data_folder / test_df.loc[test_df.locale == 'en', "filename"].values[0]
)


Some weights of the model checkpoint at facebook/wav2vec2-large-lv60 were not used when initializing Wav2Vec2Model: ['project_hid.bias', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_q.bias', 'project_hid.weight', 'quantizer.codevectors']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 is frozen.


'WHERE IS THE CHECK IN DESK'

## Facebook Wav2Vec


In [39]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import soundfile as sf

class FacebookWave2VecASR:
    """Speech-to-text engine wrapping a Hugging Face Wav2Vec2 CTC model.

    Exposes the same `transcribe(filepath) -> str` method as the other engines
    in this notebook.
    """

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """Load the processor and the CTC model from the same checkpoint.

        Args:
            model_name: Checkpoint identifier passed to `from_pretrained`;
                defaults to the checkpoint this notebook originally hard-coded.
        """
        super(FacebookWave2VecASR, self).__init__()
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)

    @staticmethod
    def _to_mono(data):
        """Average the channels of a 2-D (frames, channels) array; return 1-D audio unchanged."""
        return data.mean(axis=1) if data.ndim > 1 else data

    def transcribe(self, filepath: Path) -> str:
        """Transcribe one audio file.

        Args:
            filepath: Audio file to decode.

        Returns:
            The decoded text of the single-item batch.
        """
        # The handle is only needed while reading; close it before inference.
        with filepath.open('rb') as fh:
            data, sr = sf.read(fh)
        # Multi-channel files come back as a 2-D array; the processor is given
        # one 1-D waveform per batch item, so down-mix first.
        inputs = self.processor([self._to_mono(data)], sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Greedy decoding: pick the highest-scoring token at each position.
        ids = torch.argmax(logits, dim=-1)
        return self.processor.batch_decode(ids)[0]

# English Wav2Vec2 engine, tried on the first English file of the test set.
asr_fbw2v_en = FacebookWave2VecASR()
asr_fbw2v_en.transcribe(
    filepath=data_folder / test_df.loc[test_df.locale == 'en', "filename"].values[0]
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'WHERE IS THE CHECKEN DESK'

# Transcribe


In [44]:
def analyse(df: pd.DataFrame, folder: Path, asr_engine: "AbstractASR") -> pd.DataFrame:
    """Transcribe every file listed in `df` and return the predictions alongside it.

    Args:
        df: Frame with a `filename` column of paths relative to `folder`.
        folder: Base directory the filenames are resolved against.
        asr_engine: Any object exposing `transcribe(filepath=...)`.

    Returns:
        A copy of `df` with an added `y_pred` column holding one transcription
        per row (in row order). The input frame is left untouched.
    """
    # Iterating the column directly avoids Series.iteritems(), which pandas 2.0 removed.
    transcripts = [asr_engine.transcribe(filepath=folder / filename) for filename in df.filename]
    # assign() returns a new frame, so the caller's (possibly sliced) frame is
    # never written to, and an empty input still gets a `y_pred` column.
    return df.assign(y_pred=transcripts)

In [48]:
# DeepSpeech on the English rows of the test set.
results_deepspeech_df_en = analyse(
    df=test_df.loc[test_df.locale == 'en'],
    folder=data_folder,
    asr_engine=asr_deepspeech_en,
)
results_deepspeech_df_en

In [None]:
# DeepSpeech on the Italian rows of the test set.
results_deepspeech_it = analyse(
    df=test_df.loc[test_df.locale == 'it'],
    folder=data_folder,
    asr_engine=asr_deepspeech_it,
)
results_deepspeech_it

Unnamed: 0,locale,filename,y_true,y_pred
5,it,Ex4_audio_files/IT/checkin_it.wav,dove e il bancone,dove er il bancone
6,it,Ex4_audio_files/IT/parents_it.wav,ho perso i miei genitori,operso i miei genitori
7,it,Ex4_audio_files/IT/suitcase_it.wav,per favore ho perso la mia valigia,per favore o perso la mia valigia
8,it,Ex4_audio_files/IT/what_time_it.wav,a che ora e il mio aereo,ar nio ereo
9,it,Ex4_audio_files/IT/where_it.wav,dove sono i ristoranti e i negozi,dove sone ristorantie nedozi


In [None]:
# DeepSpeech on the Spanish rows of the test set.
results_deepspeech_es = analyse(
    df=test_df.loc[test_df.locale == 'es'],
    folder=data_folder,
    asr_engine=asr_deepspeech_es,
)
results_deepspeech_es

Unnamed: 0,locale,filename,y_true,y_pred
10,es,Ex4_audio_files/ES/checkin_es.wav,donde estan los mostradores,adande estan los mostradores
11,es,Ex4_audio_files/ES/parents_es.wav,he perdido a mis padres,he perdido a mis padres
12,es,Ex4_audio_files/ES/suitcase_es.wav,por favor he perdido mi maleta,por favor he perdido mi maleta
13,es,Ex4_audio_files/ES/what_time_es.wav,a que hora es mi avion,ahora es miedo
14,es,Ex4_audio_files/ES/where_es.wav,donde estan los restaurantes y las tiendas,adande estan los restaurantes en las tierras
