In [37]:
import glob
import os

import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchaudio



from art.attacks.evasion import ProjectedGradientDescent
from art.estimators.classification import PyTorchClassifier
from art import config
from art.defences.preprocessor import Mp3Compression
from art.utils import get_file

from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch

import scipy.io.wavfile as wav

import torchaudio

In [38]:
def display_waveform(waveform, title="", sr=16000):
    """Plot a waveform and show an inline audio player for it.

    Args:
        waveform: Samples to draw; the same object is handed unchanged to
            the audio widget.
        title: Text placed above the plot (empty by default).
        sr: Sampling rate forwarded to ``ipd.Audio`` as ``rate``.
    """
    # A fresh figure with a single Axes, so each call draws its own plot.
    _, axes = plt.subplots()
    axes.set_title(title)
    axes.plot(waveform)

    # Render the playback widget in the cell output.
    player = ipd.Audio(waveform, rate=sr)
    ipd.display(player)

In [39]:
# Single switch for mixed precision: the same value is passed as `use_amp`
# to both the DeepSpeech estimator and the attack constructed below.
use_amp = False

In [40]:
# ART estimator wrapping DeepSpeech with the "librispeech" pretrained model,
# placed on the CPU; mixed precision follows the notebook-wide `use_amp` flag.
speech_recognizer = PyTorchDeepSpeech(
    pretrained_model="librispeech",
    device_type="cpu",
    use_amp=use_amp,
)

In [41]:
# Imperceptible ASR attack bound to `speech_recognizer`.
# Parameter groupings below follow the ART documentation for
# ImperceptibleASRPyTorch (first / second optimization stage) — confirm
# against the installed ART version.
asr_attack = ImperceptibleASRPyTorch(
    estimator=speech_recognizer,
    # Perturbation bound.
    eps=1e-3,
    # Per-stage iteration counts (only 5 each here), learning rates and
    # optimizer classes.
    max_iter_1=5,
    max_iter_2=5,
    learning_rate_1=1e-5,
    learning_rate_2=1e-3,
    optimizer_1=torch.optim.Adam,
    optimizer_2=torch.optim.Adam,
    # Upper bound on the audio length accepted by the attack.
    global_max_length=100_000,
    # Rescale / eps schedule.
    initial_rescale=1.0,
    decrease_factor_eps=0.8,
    num_iter_decrease_eps=5,
    # Alpha coefficient and its increase / decrease schedule.
    alpha=0.01,
    increase_factor_alpha=1.2,
    num_iter_increase_alpha=5,
    decrease_factor_alpha=0.8,
    num_iter_decrease_alpha=5,
    # Window / FFT settings.
    win_length=2048,
    hop_length=512,
    n_fft=2048,
    batch_size=2,
    # Mixed-precision settings; `use_amp` matches the estimator's flag.
    use_amp=use_amp,
    opt_level="O1",
)

In [43]:
# Load one utterance from the TIMIT TEST split with torchaudio, then convert
# the loaded data to a NumPy array; `sound` is what later cells pass to the
# attack and to the recognizer.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs elsewhere.
sound, sample_rate = torchaudio.load("/mnt/d/course_project/ECE720/data/dataset_timit/data/TEST/DR1/MDAB0/SA1.WAV.wav")
sound = sound.numpy()

In [54]:
# Read the same WAV file a second time, this time through SciPy.
# NOTE(review): `fs` and `audio` are not used by any cell visible here.
fs, audio = wav.read("/mnt/d/course_project/ECE720/data/dataset_timit/data/TEST/DR1/MDAB0/SA1.WAV.wav")
# One-element array holding the upper-cased transcription that is passed to
# the attack as `y`.
# NOTE(review): presumably "hate" is the intended attack target wording and
# not a typo for the spoken sentence — confirm.
y = np.array(["She hate your dark suit in greasy wash water all year".upper()])

In [55]:
# Run the attack on `sound` with `y` as the label and keep the adversarial
# audio it returns. Presumably `y` acts as the target transcription (per the
# ART docs) — TODO confirm.
sound_adv = asr_attack.generate(sound, y)

In [None]:
# Inspect the dimensions of the adversarial audio array.
sound_adv.shape

(1, 45466)

In [56]:
# Write the adversarial audio to "imperceptible.wav" in the working directory.
# NOTE(review): the rate is hard-coded to 16000 instead of reusing the
# `sample_rate` returned when the input was loaded — confirm they agree.
torchaudio.save("imperceptible.wav", src=torch.Tensor(sound_adv), sample_rate=16000)

In [57]:
# Transcribe the adversarial audio with the recognizer, one sample per batch,
# asking for transcription output.
transcriptions_preprocessing = speech_recognizer.predict(sound_adv, batch_size=1, transcription_output=True)

In [58]:
# Display the recognizer's transcription of the adversarial audio.
transcriptions_preprocessing

array(['SHE HAV YOUR DARK SOOTNIN GREASY WASH WATER ALL YEAR'],
      dtype='<U52')

In [None]:
# Transcribe the unmodified input for comparison with the adversarial result.
# NOTE(review): the output saved for `org` in this notebook does not resemble
# the SA1 sentence, so it was probably produced from different audio in an
# earlier session — re-run on a fresh kernel to confirm.
org = speech_recognizer.predict(sound, batch_size=1, transcription_output=True)

  seq_len = ((seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) // m.stride[1] + 1)


In [None]:
# Display the recognizer's transcription of the original audio.
org

array(['HE PLAYED BASK OF ALL THERE WHILE WORKING TOWARD A LAW DEGREE'],
      dtype='<U61')

In [None]:
import numpy as np
# The window functions live in `scipy.signal.windows`; the old
# `scipy.signal.gaussian` alias was deprecated and is no longer provided by
# recent SciPy releases, so importing it from `scipy.signal` fails there.
from scipy.signal.windows import gaussian

# Build a small 1-D Gaussian smoothing kernel.
size = 5   # filter size (number of taps)
sigma = 1  # standard deviation of the Gaussian
filter_1d = gaussian(size, sigma)
# Normalize so the taps sum to 1.
filter_1d /= np.sum(filter_1d)
print(filter_1d.shape)
print(filter_1d)

(5,)
[0.05448868 0.24420134 0.40261995 0.24420134 0.05448868]


In [None]:
from scipy.signal import convolve

# Constant test signal of five ones.
audio_data = np.array([1] * 5)

# Smooth it with the normalized Gaussian kernel; mode="same" keeps the output
# the same length as `audio_data`.
filtered_audio = convolve(audio_data, filter_1d, mode="same")

In [None]:
# Show the test signal next to its smoothed version for comparison.
print(audio_data)
print(filtered_audio)

[1 1 1 1 1]
[0.70130997 0.94551132 1.         0.94551132 0.70130997]
