<a href="https://colab.research.google.com/github/hammaad2002/AdversarialAttack/blob/main/FGSM%20attack%20on%20CRDNN%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
#%%capture               
#!pip install speechbrain
#!git clone -b main https://github.com/hammaad2002/CRDNN_Model.git

In [59]:
import shutil
shutil.rmtree("/content/CRDNN_Model/result")

In [60]:
import pathlib
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml
import torch
import torchaudio
import sys

In [61]:
class CTCBrain(sb.Brain):
    def compute_forward(self, batch, stage, n = 0):
        if n == 1:
          wavs = batch
          wavs = wavs.to("cpu")
          lens = torch.tensor([1.])
          feats = self.modules.compute_features(wavs) 
          feats = self.modules.mean_var_norm(feats, lens)
          x = self.modules.model(feats)
          x = self.modules.lin(x)
          predictions = {"ctc_softmax": self.hparams.softmax(x)}
          predictions["seq"] = self.hparams.decoder(
                  predictions["ctc_softmax"], lens, blank_id=0)
          return predictions, lens       
        else:
          batch = batch.to(self.device)
          wavs, lens = batch.sig
          feats = self.modules.compute_features(wavs)
          feats = self.modules.mean_var_norm(feats, lens)
          x = self.modules.model(feats)
          x = self.modules.lin(x)
          predictions = {"ctc_softmax": self.hparams.softmax(x)}
          predictions["seq"] = self.hparams.decoder(
                   predictions["ctc_softmax"], lens, blank_id=self.hparams.blank_index)
          return predictions, lens

    def compute_objectives(self, predictions, batch, stage):
        predictions, lens = predictions
        phns, phn_lens = batch.phn_encoded
        decoded_phonemes = batch.phn_decoded
        label = batch.label_encoder
        label_encoder = label[0]
        loss = self.hparams.compute_cost(predictions["ctc_softmax"], phns, lens, phn_lens)
        if stage != sb.Stage.TRAIN:
            output = predictions["seq"]
            seq = output
            output1 = torch.tensor(output) 
            output = label_encoder.decode_torch(output1)
            self.per_metrics.append(batch.id, seq, phns, target_len=phn_lens, ind2lab = lambda x: label_encoder.decode_torch(torch.tensor(x)) )
        return loss

    def transcribe_dataset(
            self,
            dataset, 
            min_key, 
            label
          ):
        data_waveform, rate_of_sample = torchaudio.load(dataset)
        samples = data_waveform
        self.on_evaluate_start(min_key=min_key)
        self.modules.eval() 
        with torch.no_grad():
                out = self.compute_forward(samples, stage=sb.Stage.TEST, n = 1) 
                p_seq, wav_lens = out
        output = p_seq["seq"]
        output = torch.tensor(output)
        #output = label.decode_torch(output)
        return output

    def on_stage_start(self, stage, epoch=None):
        "Gets called when a stage (either training, validation, test) starts."
        if stage != sb.Stage.TRAIN:
            self.per_metrics = self.hparams.per_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["PER"] = self.per_metrics.summarize("error_rate")
        if stage == sb.Stage.VALID and epoch is not None:
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"PER": stage_stats["PER"]}, min_keys=["PER"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )
            with open(self.hparams.per_file, "w") as f:
              self.per_metrics.write_stats(f)
  
    def fast_gradient_sign_method(self, audio, phns, epsilon=0.02, device = "cpu"):
        # Freeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = False
        phn_lens = torch.tensor([1.])
        data_waveform, rate_of_sample = torchaudio.load(audio)
        samples = data_waveform
        # Determine prediction of the model
        inp_audio = samples.clone().requires_grad_()
        preds, len = self.compute_forward(inp_audio, stage=sb.Stage.TEST, n = 1)
        # Calculate loss by CTC
        loss = self.hparams.compute_cost(preds["ctc_softmax"], phns, len, phn_lens)
        loss.backward()
        # Update audio to adversarial example as written above
        noise_grad = torch.sign(inp_audio.grad.to(device))
        fake_audio = samples + epsilon * noise_grad
        fake_audio.detach_()
        # Unfreeze the weights of the model
        for param in self.modules.parameters():
            param.requires_grad = True
        return fake_audio, noise_grad

In [62]:
def data_prep(data_folder, hparams, n = 0, evaluate = False):
    if evaluate == True: 
        ev_data = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path= hparams["json"] ,
            replacements={"data_root": data_folder} ,
        )
        datasets = [ev_data]
        label_encoder = sb.dataio.encoder.CTCTextEncoder()
        @sb.utils.data_pipeline.takes("wav")
        @sb.utils.data_pipeline.provides("sig")
        def audio_pipeline(wav):
            sig = sb.dataio.dataio.read_audio(wav)
            return sig

        sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)
        @sb.utils.data_pipeline.takes("phn")  
        @sb.utils.data_pipeline.provides("phn_list", "phn_encoded","phn_decoded","label_encoder")
        def text_pipeline(phn):
            phn_list = phn.strip().split()
            mapped_phonemes = {
                "iy": "iy",
                "ix": "ix",
                "ih": "ix",
                "eh": "eh",
                "ae": "ae",
                "ax": "ax",
                "ah": "ax",
                "ax-h": "ax",
                "uw": "uw",
                "ux": "uw",
                "uh": "uh",
                "ao": "ao",
                "aa": "ao",
                "ey": "ey",
                "ay": "ay",
                "oy": "oy",
                "aw": "aw",
                "ow": "ow",
                "er": "er",
                "axr": "er",
                "l": "l",
                "el": "l",
                "r": "r",
                "w": "w",
                "y": "y",
                "m": "m",
                "em": "m",
                "n": "n",
                "en": "n",
                "nx": "n",
                "ng": "ng",
                "eng": "ng",
                "v": "v",
                "f": "f",
                "dh": "dh",
                "th": "th",
                "z": "z",
                "s": "s",
                "zh": "zh",
                "sh": "zh",
                "jh": "jh",
                "ch": "ch",
                "b": "b",
                "p": "p",
                "d": "d",
                "dx": "dx",
                "t": "t",
                "g": "g",
                "k": "k",
                "hh": "hh",
                "hv": "hh",
                "bcl": "h#",
                "pcl": "h#",
                "dcl": "h#",
                "tcl": "h#",
                "gcl": "h#",
                "kcl": "h#",
                "q": "h#",
                "epi": "h#",
                "pau": "h#",
                "h#": "h#"
                }
            def map_phonemes(original_phonemes):
              mapped_phonemes_list = []
              for phoneme in original_phonemes:
                mapped_phoneme = mapped_phonemes.get(phoneme, None)
                if mapped_phoneme:
                  mapped_phonemes_list.append(mapped_phoneme)
              return mapped_phonemes_list
            phn_list = map_phonemes(phn_list)
            yield phn_list
            phn_encoded = label_encoder.encode_sequence_torch(phn_list)
            phn_decoded = label_encoder.decode_torch(phn_encoded)
            yield phn_encoded
            yield phn_decoded
            yield label_encoder
        sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)
        sb.dataio.dataset.set_output_keys(datasets, ["id", "sig", "phn_encoded", "phn_decoded","label_encoder"])
        if n == 1:
          return label_encoder
        else:
          return ev_data

    else:
        train_data = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path= hparams["json_train"] ,
            replacements={"data_root": data_folder} ,
        )
        valid_data = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path= hparams["json_train"] ,
            replacements={"data_root": data_folder} ,
        )
        datasets = [train_data, valid_data]
        label_encoder = sb.dataio.encoder.CTCTextEncoder()
        @sb.utils.data_pipeline.takes("wav")
        @sb.utils.data_pipeline.provides("sig")
        def audio_pipeline(wav):
            sig = sb.dataio.dataio.read_audio(wav)
            return sig

        sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)
        @sb.utils.data_pipeline.takes("phn")  
        @sb.utils.data_pipeline.provides("phn_list", "phn_encoded","phn_decoded","label_encoder")
        def text_pipeline(phn):
            phn_list = phn.strip().split()
            mapped_phonemes = {
                "iy": "iy",
                "ix": "ix",
                "ih": "ix",
                "eh": "eh",
                "ae": "ae",
                "ax": "ax",
                "ah": "ax",
                "ax-h": "ax",
                "uw": "uw",
                "ux": "uw",
                "uh": "uh",
                "ao": "ao",
                "aa": "ao",
                "ey": "ey",
                "ay": "ay",
                "oy": "oy",
                "aw": "aw",
                "ow": "ow",
                "er": "er",
                "axr": "er",
                "l": "l",
                "el": "l",
                "r": "r",
                "w": "w",
                "y": "y",
                "m": "m",
                "em": "m",
                "n": "n",
                "en": "n",
                "nx": "n",
                "ng": "ng",
                "eng": "ng",
                "v": "v",
                "f": "f",
                "dh": "dh",
                "th": "th",
                "z": "z",
                "s": "s",
                "zh": "zh",
                "sh": "zh",
                "jh": "jh",
                "ch": "ch",
                "b": "b",
                "p": "p",
                "d": "d",
                "dx": "dx",
                "t": "t",
                "g": "g",
                "k": "k",
                "hh": "hh",
                "hv": "hh",
                "bcl": "h#",
                "pcl": "h#",
                "dcl": "h#",
                "tcl": "h#",
                "gcl": "h#",
                "kcl": "h#",
                "q": "h#",
                "epi": "h#",
                "pau": "h#",
                "h#": "h#"
                }
            def map_phonemes(original_phonemes):
              mapped_phonemes_list = []
              for phoneme in original_phonemes:
                mapped_phoneme = mapped_phonemes.get(phoneme, None)
                if mapped_phoneme:
                  mapped_phonemes_list.append(mapped_phoneme)
              return mapped_phonemes_list
            phn_list = map_phonemes(phn_list)
            yield phn_list
            phn_encoded = label_encoder.encode_sequence_torch(phn_list)
            phn_decoded = label_encoder.decode_torch(phn_encoded)
            yield phn_encoded
            yield phn_decoded
            yield label_encoder
        sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)
        label_encoder.insert_blank(index=hparams["blank_index"])
        label_encoder.update_from_didataset(train_data, output_key="phn_list")
        label_encoder.update_from_didataset(valid_data, output_key="phn_list")
        sb.dataio.dataset.set_output_keys(datasets, ["id", "sig", "phn_encoded", "phn_decoded","label_encoder"])
        if n == 1:
          return label_encoder
        else:
          return train_data, valid_data

In [63]:
device="cpu"

In [64]:
hparams_file = "/content/CRDNN_Model/hyperparams.yaml"
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin)
sb.create_experiment_directory(
    experiment_directory=hparams["output_folder"],
    hyperparams_to_save=hparams_file,
    save_env_desc = True,
)
data_folder = hparams["data_folder"]
train_data, valid_data = data_prep(data_folder, hparams, n = 0)
label_encoder = data_prep(data_folder, hparams, n = 1)
ctc_brain = CTCBrain(
    hparams["modules"],
    hparams["opt_class"],
    hparams,
    run_opts={"device": device},
    checkpointer=hparams["checkpointer"],
)    
#c1
ctc_brain.fit(
    hparams["epoch_counter"],
    train_data,
    valid_data,
    train_loader_kwargs=hparams["dataloader_options"],
    valid_loader_kwargs=hparams["dataloader_options"],
)
ctc_brain.evaluate(
    valid_data,
    min_key="PER",
)

speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: /content/CRDNN_Model/result
speechbrain.core - 1.5M trainable parameters in CTCBrain
speechbrain.utils.checkpoints - Would load a checkpoint here, but none found yet.
speechbrain.utils.epoch_loop - Going into epoch 1


100%|██████████| 8/8 [00:04<00:00,  1.72it/s, train_loss=13.9]
100%|██████████| 8/8 [00:01<00:00,  6.32it/s]

speechbrain.utils.train_logger - epoch: 1 - train loss: 13.94 - valid loss: 8.69, valid PER: 84.76
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-15+00





speechbrain.utils.epoch_loop - Going into epoch 2


100%|██████████| 8/8 [00:03<00:00,  2.39it/s, train_loss=7.74]
100%|██████████| 8/8 [00:01<00:00,  6.50it/s]

speechbrain.utils.train_logger - epoch: 2 - train loss: 7.74 - valid loss: 6.04, valid PER: 86.59
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-20+00





speechbrain.utils.epoch_loop - Going into epoch 3


100%|██████████| 8/8 [00:03<00:00,  2.30it/s, train_loss=4.91]
100%|██████████| 8/8 [00:01<00:00,  6.34it/s]

speechbrain.utils.train_logger - epoch: 3 - train loss: 4.91 - valid loss: 3.85, valid PER: 84.15
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-25+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-15+00
speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-20+00
speechbrain.utils.epoch_loop - Going into epoch 4


100%|██████████| 8/8 [00:04<00:00,  1.78it/s, train_loss=3.65]
100%|██████████| 8/8 [00:01<00:00,  6.14it/s]

speechbrain.utils.train_logger - epoch: 4 - train loss: 3.65 - valid loss: 3.23, valid PER: 84.76
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-30+00





speechbrain.utils.epoch_loop - Going into epoch 5


100%|██████████| 8/8 [00:03<00:00,  2.37it/s, train_loss=2.98]
100%|██████████| 8/8 [00:01<00:00,  6.33it/s]

speechbrain.utils.train_logger - epoch: 5 - train loss: 2.98 - valid loss: 2.84, valid PER: 81.10
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-35+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-30+00
speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-25+00
speechbrain.utils.epoch_loop - Going into epoch 6


100%|██████████| 8/8 [00:03<00:00,  2.39it/s, train_loss=2.61]
100%|██████████| 8/8 [00:01<00:00,  6.36it/s]

speechbrain.utils.train_logger - epoch: 6 - train loss: 2.61 - valid loss: 2.57, valid PER: 79.88
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-40+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-35+00
speechbrain.utils.epoch_loop - Going into epoch 7


100%|██████████| 8/8 [00:03<00:00,  2.40it/s, train_loss=2.23]
100%|██████████| 8/8 [00:01<00:00,  6.40it/s]

speechbrain.utils.train_logger - epoch: 7 - train loss: 2.23 - valid loss: 2.39, valid PER: 76.22
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-45+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-40+00
speechbrain.utils.epoch_loop - Going into epoch 8


100%|██████████| 8/8 [00:03<00:00,  2.39it/s, train_loss=1.95]
100%|██████████| 8/8 [00:01<00:00,  6.45it/s]

speechbrain.utils.train_logger - epoch: 8 - train loss: 1.95 - valid loss: 1.91, valid PER: 59.76
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-50+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-45+00
speechbrain.utils.epoch_loop - Going into epoch 9


100%|██████████| 8/8 [00:03<00:00,  2.38it/s, train_loss=1.64]
100%|██████████| 8/8 [00:01<00:00,  6.23it/s]

speechbrain.utils.train_logger - epoch: 9 - train loss: 1.64 - valid loss: 1.63, valid PER: 53.05
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-54+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-50+00
speechbrain.utils.epoch_loop - Going into epoch 10


100%|██████████| 8/8 [00:03<00:00,  2.42it/s, train_loss=1.37]
100%|██████████| 8/8 [00:01<00:00,  6.40it/s]

speechbrain.utils.train_logger - epoch: 10 - train loss: 1.37 - valid loss: 1.46, valid PER: 52.44
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-59+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-54+00
speechbrain.utils.epoch_loop - Going into epoch 11


100%|██████████| 8/8 [00:03<00:00,  2.37it/s, train_loss=1.19]
100%|██████████| 8/8 [00:01<00:00,  6.16it/s]

speechbrain.utils.train_logger - epoch: 11 - train loss: 1.19 - valid loss: 1.26, valid PER: 40.24
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-04+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-32-59+00
speechbrain.utils.epoch_loop - Going into epoch 12


100%|██████████| 8/8 [00:03<00:00,  2.43it/s, train_loss=1.03]
100%|██████████| 8/8 [00:01<00:00,  6.41it/s]

speechbrain.utils.train_logger - epoch: 12 - train loss: 1.03 - valid loss: 9.10e-01, valid PER: 26.83
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-09+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-04+00
speechbrain.utils.epoch_loop - Going into epoch 13


100%|██████████| 8/8 [00:03<00:00,  2.38it/s, train_loss=0.792]
100%|██████████| 8/8 [00:01<00:00,  6.57it/s]

speechbrain.utils.train_logger - epoch: 13 - train loss: 7.92e-01 - valid loss: 6.64e-01, valid PER: 18.90
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-13+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-09+00
speechbrain.utils.epoch_loop - Going into epoch 14


100%|██████████| 8/8 [00:03<00:00,  2.37it/s, train_loss=0.601]
100%|██████████| 8/8 [00:01<00:00,  6.52it/s]

speechbrain.utils.train_logger - epoch: 14 - train loss: 6.01e-01 - valid loss: 5.56e-01, valid PER: 17.68
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-18+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-13+00
speechbrain.utils.epoch_loop - Going into epoch 15


100%|██████████| 8/8 [00:03<00:00,  2.44it/s, train_loss=0.447]
100%|██████████| 8/8 [00:01<00:00,  6.63it/s]

speechbrain.utils.train_logger - epoch: 15 - train loss: 4.47e-01 - valid loss: 4.13e-01, valid PER: 11.59
speechbrain.utils.checkpoints - Saved an end-of-epoch checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-23+00





speechbrain.utils.checkpoints - Deleted checkpoint in /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-18+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-23+00


100%|██████████| 8/8 [00:01<00:00,  6.44it/s]

speechbrain.utils.train_logger - Epoch loaded: 15 - test loss: 4.13e-01, test PER: 11.59





0.4131469167768955

In [65]:
transcripts = ctc_brain.transcribe_dataset(
        dataset= '/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
print("Phoneme Transcription is below:")
print(transcripts)
print(label_encoder.decode_torch(transcripts))

speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-23+00
Phoneme Transcription is below:
tensor([[ 1,  4,  5,  6,  7,  5,  8,  9, 11, 12, 13, 11,  1,  2, 10,  8,  7,  5,
          6,  7]])
[['dh', 'ay', 'l', 'd', 'ao', 'l', 'm', 'ow', 't', 'hh', 'er', 't', 'dh', 'ax', 's', 'm', 'ao', 'l', 'd', 'ao']]




```
def fast_gradient_sign_method(model, imgs, labels, epsilon=0.02):
    # Determine prediction of the model
    inp_imgs = imgs.clone().requires_grad_()
    preds = model(inp_imgs.to(device))
    preds = F.log_softmax(preds, dim=-1)
    # Calculate loss by NLL
    loss = -torch.gather(preds, 1, labels.to(device).unsqueeze(dim=-1))
    loss.sum().backward()
    # Update image to adversarial example as written above
    noise_grad = torch.sign(inp_imgs.grad.to(imgs.device))
    fake_imgs = imgs + epsilon * noise_grad
    fake_imgs.detach_()
    return fake_imgs, noise_grad
```



In [66]:
audio = '/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav'
phns = transcripts
fake_audio, noise_grad = ctc_brain.fast_gradient_sign_method( 
    audio, 
    phns,
    epsilon=0.001
)

In [67]:
import librosa
import IPython.display as ipd

# Load the audio file
audio, sr = librosa.load('/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav')

# Play the audio
ipd.display(ipd.Audio(audio, rate=sr))
ipd.display(ipd.Audio(fake_audio, rate=16000))

In [68]:
sr = 16000
torchaudio.save('/content/perturbed_spk1_snt1.wav', fake_audio, sr)

In [69]:
transcripts1 = ctc_brain.transcribe_dataset(
        dataset= '/content/CRDNN_Model/AudioSamplesASR/spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
transcripts2 = ctc_brain.transcribe_dataset(
        dataset= '/content/perturbed_spk1_snt1.wav',
        min_key="PER",
        label = label_encoder 
)
print("Phoneme Transcription before noise is :")
print(transcripts1)
print(label_encoder.decode_torch(transcripts1))
print("Phoneme Transcription after noise is :")
print(transcripts2)
print(label_encoder.decode_torch(transcripts2))

speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-23+00
speechbrain.utils.checkpoints - Loading a checkpoint from /content/CRDNN_Model/result/save/CKPT+2023-01-21+12-33-23+00
Phoneme Transcription before noise is :
tensor([[ 1,  4,  5,  6,  7,  5,  8,  9, 11, 12, 13, 11,  1,  2, 10,  8,  7,  5,
          6,  7]])
[['dh', 'ay', 'l', 'd', 'ao', 'l', 'm', 'ow', 't', 'hh', 'er', 't', 'dh', 'ax', 's', 'm', 'ao', 'l', 'd', 'ao']]
Phoneme Transcription after noise is :
tensor([[ 1,  4,  5,  6,  6,  5,  6,  1,  6, 11, 11,  1, 10,  1,  6]])
[['dh', 'ay', 'l', 'd', 'd', 'l', 'd', 'dh', 'd', 't', 't', 'dh', 's', 'dh', 'd']]


In [70]:
%%capture
!pip install python-Levenshtein

**This is Levenshtein distance. It is a string metric for measuring difference between two sequences**

In [72]:
import Levenshtein
def compute_dis(ground_truth, predictions):
    num_errors = Levenshtein.distance(''.join(ground_truth), ''.join(predictions))
    total_phones = len(ground_truth)
    per = num_errors / total_phones
    return per
distance = compute_dis(label_encoder.decode_torch(transcripts1.squeeze()), label_encoder.decode_torch(transcripts2.squeeze()))
print(distance)

0.85
