In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/MyDrive/IW06-07/

In [None]:
!pip3 install speechbrain
!pip3 install deepspeech-gpu
!pip3 install jiwer
%pip install torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [None]:
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer

In [None]:
import deepspeech
import wave
import numpy as np
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement
from jiwer import compute_measures
import os

DeepSpeech model

In [None]:
# File names of the DeepSpeech 0.9.3 model and of its external scorer.
model_file_path = 'deepspeech-0.9.3-models.pbmm'
scorer_file_path = 'deepspeech-0.9.3-models.scorer'

# Alpha/beta values handed to the scorer via setScorerAlphaBeta.
lm_alpha = 0.931289039105002
lm_beta = 1.1834137581510284

# Build the speech-to-text model, then attach and configure the scorer.
model = deepspeech.Model(model_file_path)
model.enableExternalScorer(scorer_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)

In [None]:
def convert(model, audio):
  """Transcribe a 16 kHz WAV file with a speech-to-text model.

  Args:
    model: object exposing ``stt(samples)``; in this notebook, the DeepSpeech model.
    audio: path (or file object) accepted by ``wave.open``.

  Returns:
    Whatever ``model.stt`` returns for the decoded samples.

  Raises:
    AssertionError: if the file's frame rate is not 16000.
  """
  # The context manager closes the file handle even when the rate check fails;
  # previously one handle was left open for every clip processed.
  with wave.open(audio, 'rb') as w:
    assert int(w.getframerate()) == 16000
    frames = w.readframes(w.getnframes())
  # Reinterpret the raw frame bytes as signed 16-bit samples.
  data = np.frombuffer(frames, dtype=np.int16)
  return model.stt(data)
def _safe_wer(reference, hypothesis):
  """Word error rate of `hypothesis` against `reference`, tolerating an empty reference.

  Returns 0 when both strings are empty. When only the reference is empty the two
  roles are swapped before calling jiwer, which is the workaround the original code
  used for an empty original transcript (presumably because jiwer refuses an empty
  ground truth -- confirm against the installed version).
  """
  if len(reference) == 0:
    if len(hypothesis) == 0:
      return 0
    return compute_measures(hypothesis, reference)['wer']
  return compute_measures(reference, hypothesis)['wer']

def diff(o, a, e, c):
  """Compute the three WER figures reported for one clip.

  Args:
    o: transcript of the original clip (may be empty).
    a: transcript of the adversarial clip.
    e: transcript of the enhanced clip.
    c: the command name the enhanced transcript is compared against.

  Returns:
    Tuple ``(adv-ori, enh-ori, enh-act)``: WER of `a` against `o`, WER of `e`
    against `o`, and WER of `e` against `c`.
  """
  # Each slot is computed independently. The earlier empty-original branch returned
  # the adversarial-vs-original value in the enh-ori position, hard-coded adv-ori to 0
  # and never looked at `e` for enh-ori.
  return _safe_wer(o, a), _safe_wer(o, e), compute_measures(c, e)['wer']

Speech Enhancement Model

In [None]:
# Load the pretrained SpeechBrain spectral-mask enhancement model. Use the GPU when one
# is visible and fall back to the CPU otherwise, so the cell does not fail on a
# runtime without CUDA.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/mtl-mimic-voicebank",
    savedir="pretrained_models/mtl-mimic-voicebank",
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

Adversarial Dataset B Parsing

In [None]:
from pathlib import Path

In [None]:
# Command names used as directory names in the dataset. Each name is listed once:
# a repeated entry ('right' used to appear twice) makes the loops further down process
# and count that command twice.
commands = ['down', 'go', 'left', 'right', 'no', 'off', 'on', 'stop', 'up', 'yes']
original = './adversarial_dataset-B/Adversarial-Examples/Original-Examples/'
adversarial = './adversarial_dataset-B/Adversarial-Examples/Adversarial-Examples/'
enhanced = './enhanced/baseline/'
# Map each command to the .wav file names found in its original-examples directory.
# Sorting makes the processing order (and therefore the report) reproducible, and the
# extension filter keeps stray non-audio entries out of the transcription loop.
audio_files = {}
for command in commands:
  files = sorted(f for f in os.listdir(original+command) if f.lower().endswith('.wav'))
  audio_files[command] = files

In [None]:
def _average(totals, count):
  """Divide each accumulated total by `count`; NaN entries when nothing was counted."""
  return [t / count if count else float('nan') for t in totals]

def call_enhance_wer(adv_command, editor):
  """Enhance the adversarial clips stored under `adv_command` and log WER statistics.

  For every other command, each file name in `audio_files[command]` is read from the
  adversarial tree at `adv_command/command/`, passed through `enhance_model`, and saved
  under the same sub-path of `enhanced`. The original, adversarial and enhanced clips
  are transcribed with `model`, the transcripts are written to `editor`, and the
  three WER values from `diff` are averaged per command and over all clips.

  Args:
    adv_command: name of the directory under `adversarial` to evaluate.
    editor: writable text file object that receives the report.
  """
  wer_stats = [0]*3
  tot = 0
  # dict.fromkeys drops repeated names while keeping their order, so a command that is
  # listed more than once in `commands` is not counted twice in the overall average.
  for command in dict.fromkeys(commands):
    if command == adv_command:
      continue
    temp_stats = [0]*3
    files = audio_files[command]
    # os.path.join with a trailing '' yields a path ending in exactly one separator,
    # whether or not the base directory string already ends with '/'.
    base_adv_path = os.path.join(adversarial, adv_command, command, '')
    base_ori_path = os.path.join(original, command, '')
    base_enh_path = os.path.join(enhanced, adv_command, command, '')
    Path(base_enh_path).mkdir(parents=True, exist_ok=True)

    editor.write(adv_command + ', ' + command + '\n')

    for audio in files:
      ori = convert(model, base_ori_path + audio)
      a_file = base_adv_path + audio
      # NOTE(review): some SpeechBrain releases appear to cache fetched audio by file
      # name only; the same file names recur under every adv_command directory, so
      # confirm that enhance_file is not returning a previously cached clip.
      enhanced_audio = enhance_model.enhance_file(a_file)
      enh_name = base_enh_path + audio
      torchaudio.save(enh_name, enhanced_audio.unsqueeze(0).cpu(), sample_rate=16000, bits_per_sample=16)

      enh = convert(model, enh_name)
      adv = convert(model, a_file)
      editor.write(ori + ', ' + adv + ', ' + enh + '\n')

      # adv-ori, enh-ori, enh-act
      stats = diff(ori, adv, enh, command)
      for i, value in enumerate(stats):
        temp_stats[i] += value
        wer_stats[i] += value
      tot += 1

    # Per-command averages; a command with no files reports NaN instead of raising
    # ZeroDivisionError.
    editor.write('\nadv-ori, enh-ori, enh-act: ')
    editor.write(str(_average(temp_stats, len(files))) + '\n\n')

  # Averages over every clip processed for this adv_command (NaN if there were none).
  editor.write('\n\nadv-ori, enh-ori, enh-act: ')
  editor.write(str(_average(wer_stats, tot)))


## Call models

In [None]:
# Run the evaluation once per distinct entry in `commands`, one report file each.
for command in dict.fromkeys(commands):
  # On a fresh run the report directory does not exist yet (call_enhance_wer only
  # creates its sub-directories after the file has been opened), so create it first.
  Path(enhanced + command).mkdir(parents=True, exist_ok=True)
  # `with` closes the report even if the evaluation raises part-way through.
  with open(enhanced + command + '/stats.txt', 'w+') as editor:
    call_enhance_wer(command, editor)