<a href="https://colab.research.google.com/github/hammaad2002/AdversarialAttack/blob/main/FGSM%20attack%20on%20CRDNN%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
#%%capture               
#!pip install speechbrain
#!git clone -b main https://github.com/hammaad2002/CRDNN_Model.git

In [74]:
import shutil
shutil.rmtree("/content/CRDNN_Model/result")

In [75]:
import pathlib
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml
import torch
import torchaudio
import sys
import numpy as np
import torch.nn as nn

In [76]:
class CTCBrain(sb.Brain):
    def compute_forward(self, batch, stage, n = 0):
        if n == 1:
          wavs = batch
          wavs = wavs.to("cpu")
          lens = torch.tensor([1.])
          feats = self.modules.compute_features(wavs) 
          feats = self.modules.mean_var_norm(feats, lens)
          x = self.modules.model(feats)
          x = self.modules.lin(x)
          predictions = {"ctc_softmax": self.hparams.softmax(x)}
          predictions["seq"] = self.hparams.decoder(
                  predictions["ctc_softmax"], lens, blank_id=0)
          return predictions, lens       
        else:
          batch = batch.to(self.device)
          wavs, lens = batch.sig
          feats = self.modules.compute_features(wavs)
          feats = self.modules.mean_var_norm(feats, lens)
          x = self.modules.model(feats)
          x = self.modules.lin(x)
          predictions = {"ctc_softmax": self.hparams.softmax(x)}
          predictions["seq"] = self.hparams.decoder(
                   predictions["ctc_softmax"], lens, blank_id=self.hparams.blank_index)
          return predictions, lens

    def compute_objectives(self, predictions, batch, stage):
        predictions, lens = predictions
        phns, phn_lens = batch.phn_encoded
        decoded_phonemes = batch.phn_decoded
        label = batch.label_encoder
        label_encoder = label[0]
        loss = self.hparams.compute_cost(predictions["ctc_softmax"], phns, lens, phn_lens)
        if stage != sb.Stage.TRAIN:
            output = predictions["seq"]
            seq = output
            output1 = torch.tensor(output) 
            output = label_encoder.decode_torch(output1)
            self.per_metrics.append(batch.id, seq, phns, target_len=phn_lens, ind2lab = lambda x: label_encoder.decode_torch(torch.tensor(x)) )
        return loss

    def transcribe_dataset(
            self,
            dataset, 
            min_key, 
            label
          ):
        data_waveform, rate_of_sample = torchaudio.load(dataset)
        samples = data_waveform
        self.on_evaluate_start(min_key=min_key)
        self.modules.eval() 
        with torch.no_grad():
                out = self.compute_forward(samples, stage=sb.Stage.TEST, n = 1) 
                p_seq, wav_lens = out
        output = p_seq["seq"]
        output = torch.tensor(output)
        #output = label.decode_torch(output)
        return output

    def on_stage_start(self, stage, epoch=None):
        "Gets called when a stage (either training, validation, test) starts."
        if stage != sb.Stage.TRAIN:
            self.per_metrics = self.hparams.per_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["PER"] = self.per_metrics.summarize("error_rate")
        if stage == sb.Stage.VALID and epoch is not None:
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"PER": stage_stats["PER"]}, min_keys=["PER"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )
            with open(self.hparams.per_file, "w") as f:
              self.per_metrics.write_stats(f)
  
    def fast_gradient_sign_method(self, audio, phns, epsilon=0.02, device = "cpu"):
        # Freeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = False
        phn_lens = torch.tensor([1.])
        data_waveform, rate_of_sample = torchaudio.load(audio)
        samples = data_waveform.to(device)
        # Determine prediction of the model
        inp_audio = samples.clone().requires_grad_()
        preds, len = self.compute_forward(inp_audio, stage=sb.Stage.TEST, n = 1)
        # Calculate loss by CTC
        loss = self.hparams.compute_cost(preds["ctc_softmax"], phns, len, phn_lens)
        loss.backward()
        # Update audio to adversarial example as written above
        noise_grad = torch.sign(inp_audio.grad.to(device))
        fake_audio = samples + epsilon * noise_grad
        fake_audio.detach_()
        # Unfreeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = True
        return fake_audio, epsilon * noise_grad

    def random_attack(self, audio, eps=0.3, order=np.inf, clip_min=None, clip_max=None, device = "cpu"):
        def rand_assign(delta, order, eps):
            """Randomly set the data of parameter delta with uniform sampling"""
            delta.data.uniform_(-1, 1)
            if isinstance(eps, torch.Tensor):
                eps = eps.view(-1, 1)
            if order == np.inf:
                delta.data = eps * delta.data
        # Freeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = False
        data_waveform, rate_of_sample = torchaudio.load(audio)
        samples = data_waveform.to(device)
        # Clone the original audio
        wav_init = torch.clone(samples)
        delta = torch.zeros_like(wav_init)
        delta = nn.Parameter(delta)
        clip_min = clip_min if clip_min is not None else -10
        clip_max = clip_max if clip_max is not None else 10
        rand_assign(delta, order, eps)
        delta.data = (
            torch.clamp(wav_init + delta.data, min=clip_min,
                        max=clip_max) - wav_init
        )
        wav_adv = wav_init + delta.data
        # Unfreeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = True
        return wav_adv, delta.data

    def mix_fast_gradient_sign_method(self, audio, phns, epsilon=0.02, device = "cpu", random_noise=False, noise_eps=0.3, noise_order=np.inf, noise_clip_min=None, noise_clip_max=None):
        def rand_assign(delta, order, eps):
            """Randomly set the data of parameter delta with uniform sampling"""
            delta.data.uniform_(-1, 1)
            if isinstance(eps, torch.Tensor):
                eps = eps.view(-1, 1)
            if order == np.inf:
                delta.data = eps * delta.data
        # Freeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = False
        phn_lens = torch.tensor([1.])
        data_waveform, rate_of_sample = torchaudio.load(audio)
        samples = data_waveform.to(device)
        # Determine prediction of the model
        inp_audio = samples.clone().requires_grad_()
        preds, len = self.compute_forward(inp_audio, stage=sb.Stage.TEST, n = 1)
        # Calculate loss by CTC
        loss = self.hparams.compute_cost(preds["ctc_softmax"], phns, len, phn_lens)
        loss.backward()
        # Update audio to adversarial example as written above
        noise_grad = torch.sign(inp_audio.grad.to(device))
        fake_audio = samples + epsilon * noise_grad
        if random_noise:
            delta = torch.zeros_like(fake_audio)
            delta = nn.Parameter(delta)
            noise_clip_min = noise_clip_min if noise_clip_min is not None else -10
            noise_clip_max = noise_clip_max if noise_clip_max is not None else 10
            rand_assign(delta, noise_order, noise_eps)
            delta.data = (
                torch.clamp(fake_audio + delta.data, min=noise_clip_min,
                            max=noise_clip_max) - fake_audio
            )
            fake_audio = fake_audio + delta.data
        fake_audio.detach_()
        # Unfreeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = True
        return fake_audio, epsilon * noise_grad if not random_noise else delta.data

    def new_fast_gradient_sign_method(self, audio, phns, epsilon_iterations=10, epsilon_size=0.02, device = "cpu", noise_clip_min=None, noise_clip_max=None):
        # Freeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = False
        phn_lens = torch.tensor([1.])
        data_waveform, rate_of_sample = torchaudio.load(audio)
        samples = data_waveform.to(device)
        fake_audio = samples
        epsilon = epsilon_size
        for i in range(epsilon_iterations):
            # Determine prediction of the model
            inp_audio = fake_audio.clone().requires_grad_()
            preds, len = self.compute_forward(inp_audio, stage=sb.Stage.TEST, n = 1)
            # Calculate loss by CTC
            loss = self.hparams.compute_cost(preds["ctc_softmax"], phns, len, phn_lens)
            loss.backward()
            # Update audio to adversarial example as written above
            noise_grad = torch.sign(inp_audio.grad.to(device))
            fake_audio = fake_audio + epsilon * noise_grad
            if noise_clip_min is not None or noise_clip_max is not None:
                fake_audio = torch.clamp(fake_audio, min=noise_clip_min, max=noise_clip_max)
        # Unfreeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = True
        return fake_audio


    def carlini_wagner_method(self, audio, phns, epsilon=0.02, device = "cpu", lambda_=0.01):
        # Freeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = False
        phn_lens = torch.tensor([1.])
        data_waveform, rate_of_sample = torchaudio.load(audio)
        samples = data_waveform
        # Determine prediction of the model
        inp_audio = samples.clone().requires_grad_()
        preds, len = self.compute_forward(inp_audio, stage=sb.Stage.TEST, n = 1)
        # Define a loss function L(x,y,f) that measures the difference between the model's prediction f(x) and the correct label y
        L = self.hparams.compute_cost(preds["ctc_softmax"], phns, len, phn_lens)
        # Define a perturbation variable delta
        delta = torch.randn_like(inp_audio)
        delta = delta.to(device)
        delta = torch.nn.Parameter(delta)
        delta.requires_grad = True
        # Define a regularization term for L-2 distance between the original input and adversarial example
        r = lambda_ * torch.norm(delta)**2
        # Define an optimization function that minimizes the loss function L while constraining the perturbation delta
        opt = torch.optim.Adam([delta], lr=0.01)
        with torch.autograd.set_detect_anomaly(True):
          for step in range(100):
              #loss = L + r
              loss = torch.add(L,r)
              opt.zero_grad() 
              loss.backward(retain_graph=True)
              opt.step()
              delta = torch.nn.Parameter(torch.randn_like(inp_audio).to(device))
              delta.requires_grad = True
        # Generate adversarial example
        inp_audio.detach_()
        #adv_audio = torch.add(inp_audio.clone(),epsilon * delta)
        adv_audio = torch.add(inp_audio, epsilon * delta)
        # Unfreeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = True
        return adv_audio, delta


In [77]:
def data_prep(data_folder, hparams, n = 0, evaluate = False):
    if evaluate == True: 
        ev_data = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path= hparams["json"] ,
            replacements={"data_root": data_folder} ,
        )
        datasets = [ev_data]
        label_encoder = sb.dataio.encoder.CTCTextEncoder()
        @sb.utils.data_pipeline.takes("wav")
        @sb.utils.data_pipeline.provides("sig")
        def audio_pipeline(wav):
            sig = sb.dataio.dataio.read_audio(wav)
            return sig

        sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)
        @sb.utils.data_pipeline.takes("phn")  
        @sb.utils.data_pipeline.provides("phn_list", "phn_encoded","phn_decoded","label_encoder")
        def text_pipeline(phn):
            phn_list = phn.strip().split()
            mapped_phonemes = {
                "iy": "iy",
                "ix": "ix",
                "ih": "ix",
                "eh": "eh",
                "ae": "ae",
                "ax": "ax",
                "ah": "ax",
                "ax-h": "ax",
                "uw": "uw",
                "ux": "uw",
                "uh": "uh",
                "ao": "ao",
                "aa": "ao",
                "ey": "ey",
                "ay": "ay",
                "oy": "oy",
                "aw": "aw",
                "ow": "ow",
                "er": "er",
                "axr": "er",
                "l": "l",
                "el": "l",
                "r": "r",
                "w": "w",
                "y": "y",
                "m": "m",
                "em": "m",
                "n": "n",
                "en": "n",
                "nx": "n",
                "ng": "ng",
                "eng": "ng",
                "v": "v",
                "f": "f",
                "dh": "dh",
                "th": "th",
                "z": "z",
                "s": "s",
                "zh": "zh",
                "sh": "zh",
                "jh": "jh",
                "ch": "ch",
                "b": "b",
                "p": "p",
                "d": "d",
                "dx": "dx",
                "t": "t",
                "g": "g",
                "k": "k",
                "hh": "hh",
                "hv": "hh",
                "bcl": "h#",
                "pcl": "h#",
                "dcl": "h#",
                "tcl": "h#",
                "gcl": "h#",
                "kcl": "h#",
                "q": "h#",
                "epi": "h#",
                "pau": "h#",
                "h#": "h#"
                }
            def map_phonemes(original_phonemes):
              mapped_phonemes_list = []
              for phoneme in original_phonemes:
                mapped_phoneme = mapped_phonemes.get(phoneme, None)
                if mapped_phoneme:
                  mapped_phonemes_list.append(mapped_phoneme)
              return mapped_phonemes_list
            phn_list = map_phonemes(phn_list)
            yield phn_list
            phn_encoded = label_encoder.encode_sequence_torch(phn_list)
            phn_decoded = label_encoder.decode_torch(phn_encoded)
            yield phn_encoded
            yield phn_decoded
            yield label_encoder
        sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)
        sb.dataio.dataset.set_output_keys(datasets, ["id", "sig", "phn_encoded", "phn_decoded","label_encoder"])
        if n == 1:
          return label_encoder
        else:
          return ev_data

    else:
        train_data = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path= hparams["json_train"] ,
            replacements={"data_root": data_folder} ,
        )
        valid_data = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path= hparams["json_train"] ,
            replacements={"data_root": data_folder} ,
        )
        datasets = [train_data, valid_data]
        label_encoder = sb.dataio.encoder.CTCTextEncoder()
        @sb.utils.data_pipeline.takes("wav")
        @sb.utils.data_pipeline.provides("sig")
        def audio_pipeline(wav):
            sig = sb.dataio.dataio.read_audio(wav)
            return sig

        sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)
        @sb.utils.data_pipeline.takes("phn")  
        @sb.utils.data_pipeline.provides("phn_list", "phn_encoded","phn_decoded","label_encoder")
        def text_pipeline(phn):
            phn_list = phn.strip().split()
            mapped_phonemes = {
                "iy": "iy",
                "ix": "ix",
                "ih": "ix",
                "eh": "eh",
                "ae": "ae",
                "ax": "ax",
                "ah": "ax",
                "ax-h": "ax",
                "uw": "uw",
                "ux": "uw",
                "uh": "uh",
                "ao": "ao",
                "aa": "ao",
                "ey": "ey",
                "ay": "ay",
                "oy": "oy",
                "aw": "aw",
                "ow": "ow",
                "er": "er",
                "axr": "er",
                "l": "l",
                "el": "l",
                "r": "r",
                "w": "w",
                "y": "y",
                "m": "m",
                "em": "m",
                "n": "n",
                "en": "n",
                "nx": "n",
                "ng": "ng",
                "eng": "ng",
                "v": "v",
                "f": "f",
                "dh": "dh",
                "th": "th",
                "z": "z",
                "s": "s",
                "zh": "zh",
                "sh": "zh",
                "jh": "jh",
                "ch": "ch",
                "b": "b",
                "p": "p",
                "d": "d",
                "dx": "dx",
                "t": "t",
                "g": "g",
                "k": "k",
                "hh": "hh",
                "hv": "hh",
                "bcl": "h#",
                "pcl": "h#",
                "dcl": "h#",
                "tcl": "h#",
                "gcl": "h#",
                "kcl": "h#",
                "q": "h#",
                "epi": "h#",
                "pau": "h#",
                "h#": "h#"
                }
            def map_phonemes(original_phonemes):
              mapped_phonemes_list = []
              for phoneme in original_phonemes:
                mapped_phoneme = mapped_phonemes.get(phoneme, None)
                if mapped_phoneme:
                  mapped_phonemes_list.append(mapped_phoneme)
              return mapped_phonemes_list
            phn_list = map_phonemes(phn_list)
            yield phn_list
            phn_encoded = label_encoder.encode_sequence_torch(phn_list)
            phn_decoded = label_encoder.decode_torch(phn_encoded)
            yield phn_encoded
            yield phn_decoded
            yield label_encoder
        sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)
        label_encoder.insert_blank(index=hparams["blank_index"])
        label_encoder.update_from_didataset(train_data, output_key="phn_list")
        label_encoder.update_from_didataset(valid_data, output_key="phn_list")
        sb.dataio.dataset.set_output_keys(datasets, ["id", "sig", "phn_encoded", "phn_decoded","label_encoder"])
        if n == 1:
          return label_encoder
        else:
          return train_data, valid_data

In [78]:
device="cpu"

In [79]:
hparams_file = "/content/CRDNN_Model/hyperparams.yaml"
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin)
sb.create_experiment_directory(
    experiment_directory=hparams["output_folder"],
    hyperparams_to_save=hparams_file,
    save_env_desc = True,
)
data_folder = hparams["data_folder"]
train_data, valid_data = data_prep(data_folder, hparams, n = 0)
label_encoder = data_prep(data_folder, hparams, n = 1)
ctc_brain = CTCBrain(
    hparams["modules"],
    hparams["opt_class"],
    hparams,
    run_opts={"device": device},
    checkpointer=hparams["checkpointer"],
)    
#c1
ctc_brain.fit(
    hparams["epoch_counter"],
    train_data,
    valid_data,
    train_loader_kwargs=hparams["dataloader_options"],
    valid_loader_kwargs=hparams["dataloader_options"],
)
ctc_brain.evaluate(
    valid_data,
    min_key="PER",
)

speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: /content/CRDNN_Model/result
speechbrain.core - 1.5M trainable parameters in CTCBrain
speechbrain.utils.checkpoints - Would load a checkpoint here, but none found yet.
speechbrain.utils.epoch_loop - Going into epoch 1


100%|██████████| 8/8 [00:03<00:00,  2.54it/s, train_loss=13.9]
100%|██████████| 8/8 [00:01<00:00,  6.96it/s]

speechbrain.utils.train_logger - epoch: 1 - train loss: 13.94 - valid loss: 8.69, valid PER: 84.76
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00



100%|██████████| 8/8 [00:01<00:00,  6.99it/s]

speechbrain.utils.train_logger - Epoch loaded: 1 - test loss: 8.69, test PER: 84.76





8.689325153827667

In [80]:
transcripts = ctc_brain.transcribe_dataset(
        dataset= '/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
print("Phoneme Transcription is below:")
print(transcripts)
print(label_encoder.decode_torch(transcripts))

speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
Phoneme Transcription is below:
tensor([[11, 11, 14,  4,  6, 11,  4, 11, 11, 11,  6,  4,  6]])
[['t', 't', 'r', 'ay', 'd', 't', 'ay', 't', 't', 't', 'd', 'ay', 'd']]




```
def fast_gradient_sign_method(model, imgs, labels, epsilon=0.02):
    # Determine prediction of the model
    inp_imgs = imgs.clone().requires_grad_()
    preds = model(inp_imgs.to(device))
    preds = F.log_softmax(preds, dim=-1)
    # Calculate loss by NLL
    loss = -torch.gather(preds, 1, labels.to(device).unsqueeze(dim=-1))
    loss.sum().backward()
    # Update image to adversarial example as written above
    noise_grad = torch.sign(inp_imgs.grad.to(imgs.device))
    fake_imgs = imgs + epsilon * noise_grad
    fake_imgs.detach_()
    return fake_imgs, noise_grad
```



In [81]:
audio = '/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav'
phns = transcripts
fake_audio, noise_grad = ctc_brain.fast_gradient_sign_method( 
    audio, 
    phns,
    epsilon=0.001
)

In [82]:
wav_adv, delta = ctc_brain.random_attack( 
    audio,
    eps=0.001, 
    order=np.inf, 
    clip_min=-2, 
    clip_max=2,
)

In [83]:
wav_advv, deltaa = ctc_brain.mix_fast_gradient_sign_method( 
    audio,
    phns,
    epsilon=0.0001, 
    random_noise=True,
    noise_eps=0.0001, 
    noise_order=np.inf, 
    noise_clip_min=-10, 
    noise_clip_max=10,
)

In [96]:
new_adv= ctc_brain.new_fast_gradient_sign_method( 
    audio,
    phns,
    epsilon_iterations=30,
    epsilon_size=0.00001, 
    noise_clip_min=-0.5, 
    noise_clip_max=0.5,
)

In [97]:
import librosa
import IPython.display as ipd

# Load the audio file
audioo, sr = librosa.load('/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav')

# Play the audio
ipd.display(ipd.Audio(audioo, rate=sr))
ipd.display(ipd.Audio(fake_audio, rate=16000))
ipd.display(ipd.Audio(wav_adv, rate=16000))
ipd.display(ipd.Audio(wav_advv, rate=16000))
ipd.display(ipd.Audio(new_adv, rate=16000))

In [98]:
sr = 16000
torchaudio.save('/content/perturbed_spk1_snt1.wav', fake_audio, sr)
torchaudio.save('/content/perturbedRandom_spk1_snt1.wav', wav_adv, sr)
torchaudio.save('/content/perturbedRandomm_spk1_snt1.wav', wav_advv, sr)
torchaudio.save('/content/perturbedNew_spk1_snt1.wav', new_adv, sr)

In [99]:
transcripts1 = ctc_brain.transcribe_dataset(
        dataset= '/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
transcripts2 = ctc_brain.transcribe_dataset(
        dataset= '/content/perturbed_spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
transcripts3 = ctc_brain.transcribe_dataset(
        dataset= '/content/perturbedRandom_spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
transcripts4 = ctc_brain.transcribe_dataset(
        dataset= '/content/perturbedRandomm_spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
transcripts5 = ctc_brain.transcribe_dataset(
        dataset= '/content/perturbedNew_spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
print("Phoneme Transcription before noise is :")
print(transcripts1)
print(label_encoder.decode_torch(transcripts1))
print("Phoneme Transcription after noise is :")
print(transcripts2)
print(label_encoder.decode_torch(transcripts2))
print("Phoneme Transcription after noise is :")
print(transcripts3)
print(label_encoder.decode_torch(transcripts3))
print("Phoneme Transcription after noise is :")
print(transcripts4)
print(label_encoder.decode_torch(transcripts4))
print("Phoneme Transcription after noise is :")
print(transcripts5)
print(label_encoder.decode_torch(transcripts5))

speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+21-50-13+00
Phoneme Transcription before noise is :
tensor([[11, 11, 14,  4,  6, 11,  4, 11, 11, 11,  6,  4,  6]])
[['t', 't', 'r', 'ay', 'd', 't', 'ay', 't', 't', 't', 'd', 'ay', 'd']]
Phoneme Transcription after noise is :
tensor([[11,  6, 11,  6, 14,  4,  6, 11, 11,  4, 14,  4, 14, 11, 11, 14,  6]])
[['t', 'd', 't', 'd', 'r', 'ay', 'd', 't', 't', 'ay', 'r', 'ay', 'r', 't', 't', 'r', 'd']]
Phoneme Transcription 

In [91]:
%%capture
!pip install python-Levenshtein

**This is Levenshtein distance. It is a string metric for measuring difference between two sequences**

In [100]:
import Levenshtein
def compute_dis(ground_truth, predictions):
    num_errors = Levenshtein.distance(''.join(ground_truth), ''.join(predictions))
    total_phones = len(ground_truth)
    per = num_errors / total_phones
    return per
distance = compute_dis(label_encoder.decode_torch(transcripts1.squeeze()), label_encoder.decode_torch(transcripts2.squeeze()))
print(distance)
#Lower value means both answer match more, indicating poor attack
#Higher the value the better the attack is.

0.7692307692307693


In [101]:
distance = compute_dis(label_encoder.decode_torch(transcripts1.squeeze()), label_encoder.decode_torch(transcripts3.squeeze()))
print(distance)

0.46153846153846156


In [102]:
distance = compute_dis(label_encoder.decode_torch(transcripts1.squeeze()), label_encoder.decode_torch(transcripts4.squeeze()))
print(distance)

0.6923076923076923


In [104]:
distance = compute_dis(label_encoder.decode_torch(transcripts1.squeeze()), label_encoder.decode_torch(transcripts5.squeeze()))
print(distance)

2.3076923076923075
