# Adversarial Retraining

In this approach, additionally to the model we train the detector model that is presumably able do differentiate the adversarial image examples from the true images.

In [72]:
import os
import shutil
import IPython

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import librosa

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchvision.models import resnet34

from tqdm.notebook import tqdm

from scipy.io import wavfile

import warnings
warnings.filterwarnings('ignore')

## Load data

In [2]:
df = pd.read_csv('birds_large/data.csv')
df.head()

Unnamed: 0,filename,label
0,Sylvia-communis-179998,Sylvia
1,Sylvia-communis-283580,Sylvia
2,Sylvia-communis-206631,Sylvia
3,Sylvia-communis-142078,Sylvia
4,Sylvia-communis-198329,Sylvia


## Transform audio to images

In [3]:
def spec_to_image(spec, eps=1e-6):
    '''
    Normalize spectrogram and convert it to the b&w image with 1 channel
    '''
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
    spec_scaled = spec_scaled.astype(np.uint8)
    return spec_scaled, mean, std, spec_min, spec_max

def create_spec(audio):
    '''
    Create mel-spectrogram from audio
    '''
    x, sr = librosa.load(audio, res_type='kaiser_fast', sr=None)
    if x.shape[0] < 10 * sr:
        x = np.pad(x, int(np.ceil((10 * sr - x.shape[0]) / 2)), mode='reflect')
    else:
        x = x[:10 * sr]
    spec = librosa.feature.melspectrogram(x, sr=sr, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300)
    spec_db = librosa.power_to_db(spec, top_db=80)
    return spec_db, sr

In [4]:
features = []
labels = []
for i in range(len(df)):
    # create spectrogram image
    f = 'birds_large/wav/' + df.iloc[i]['filename'] + '.mp3.wav'
    spec_db, _ = create_spec(f)
    features.append(spec_to_image(spec_db)[0][np.newaxis,...])
    # convert label to numerical label
    if df.iloc[i]['label'] == 'Sylvia':
        labels.append(1)
    else:
        labels.append(0)

## Train-test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

In [6]:
print(f'Train set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')

Train set size: 240
Test set size: 104


## Convert data to tensors

In [7]:
X_train_t = torch.tensor(X_train)
X_test_t = torch.tensor(X_test)
y_train_t = torch.tensor(y_train)
y_test_t = torch.tensor(y_test)

train_dataset = TensorDataset(X_train_t, y_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t)
train_dataloader = DataLoader(train_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset)

## Original classification model

In [8]:
class BirdClassificationModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = resnet34(pretrained=True)
        self.resnet.fc = nn.Linear(512, 2)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    
    def forward(self, x):
        out = self.resnet(x)
        return out

In [15]:
model_base = BirdClassificationModel()
model_base.load_state_dict(torch.load('bird_classification.pt'))
model_base.eval()

BirdClassificationModel(
  (resnet): ResNet(
    (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=

## Training the detector model

The model is trained on the original data + adversarial examples generated on it by using CW attacks

### Generate adversarial examples for train and test

In [84]:
def cw_l2_attack(model, X, y, epsilon=0.1, c=100, num_iter=200, alpha=1e3):
    X = X.float()
    delta = torch.zeros_like(X, requires_grad=True)
    for t in tqdm(range(num_iter)):
        loss = c * (1 + nn.CrossEntropyLoss()(model(X + delta), y)) + nn.MSELoss(reduction='sum')(X, X + delta)
#         print(nn.MSELoss(reduction='sum')(X, X + delta))
        loss.backward()
        delta.data = (delta + X.shape[0] * alpha * delta.grad.data).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

In [85]:
delta = cw_l2_attack(model_base, X_train_t, y_train_t)
X_train_a = X_train_t + delta
X_train_adv = torch.cat((X_train_t, X_train_a))

y_train_true = torch.tensor([1 for i in range(240)])
y_train_a = torch.tensor([0 for i in range(240)])
y_train_adv = torch.cat((y_train_true, y_train_a))

  0%|          | 0/200 [00:00<?, ?it/s]

In [86]:
delta = pgd(model_base, X_test_t, y_test_t)
X_test_a = X_test_t + delta
X_test_adv = torch.cat((X_test_t, X_test_a))

y_test_true = torch.tensor([1 for i in range(104)])
y_test_a = torch.tensor([0 for i in range(104)])
y_test_adv = torch.cat((y_test_true, y_test_a))

  0%|          | 0/100 [00:00<?, ?it/s]

In [98]:
train_dataset_adv = TensorDataset(X_train_adv, y_train_adv)
test_dataset_adv = TensorDataset(X_test_adv, y_test_adv)
train_dataloader_adv = DataLoader(train_dataset_adv, batch_size=32)
test_dataloader_adv = DataLoader(test_dataset_adv)

### Create and train the model

In [99]:
class BirdAdversarialDetectionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = resnet34(pretrained=True)
        self.resnet.fc = nn.Linear(512, 2)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    
    def forward(self, x):
        out = self.resnet(x)
        return out

In [113]:
detector = BirdAdversarialDetectionModel()
optimizer = optim.RMSprop(detector.parameters(), lr=1e-6)
# lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.5)
epochs = 10
loss_fn = nn.CrossEntropyLoss()

In [114]:
def test(model, test_loader):
    model.eval()
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for data in tqdm(test_loader):
        x, y = data
        x = x.type(torch.float32)
        y_hat = model(x)
        if y_hat.argmax(axis=1) == y:
            if y == 0:
                TN += 1
            else:
                TP += 1
        else:
            if y == 0:
                FP += 1
            else:
                FN += 1
    return TP, FP, TN, FN

In [115]:
def train(model, loss_fn, train_loader, test_loader, epochs, optimizer, lr_scheduler=None):
    losses = []
    accs = []
    for epoch in range(epochs):
        model.train()
        batch_losses=[]
        for data in tqdm(train_loader):
            x, y = data
            x = x.type(torch.float32)
            optimizer.zero_grad()
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()
        if lr_scheduler:
            lr_scheduler.step()
        losses.append(batch_losses)
        print(f'Epoch - {epoch} Train-Loss : {np.mean(losses[-1])}')
        TP, FP, TN, FN = test(model, test_loader)
        acc = (TP + TN) / (TP + TN + FP + FN)
        print(f'Test accuracy: {acc}')
        accs.append(acc)
    return losses, accs

In [116]:
losses, accs = train(detector, loss_fn, train_dataloader_adv, test_dataloader_adv, epochs, optimizer)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 0 Train-Loss : 0.9391463081041972


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 1 Train-Loss : 0.9040513217449189


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 2 Train-Loss : 0.886269599199295


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 3 Train-Loss : 0.8727611760298412


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 4 Train-Loss : 0.8614498953024546


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 5 Train-Loss : 0.8515392820040385


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 6 Train-Loss : 0.8426047583421071


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 7 Train-Loss : 0.8343870123227437


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 8 Train-Loss : 0.8267375548680623


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 9 Train-Loss : 0.8195141176382701


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


As you can see, the training process somehow goes wrong, and I don't know what to do with it, I've tried multiple approaches for adversarial examples generation, different loss functions, different oprimizers, and still the training process gets stuck. Thus, I just proceed with implementing the attacks and hoping for the best.

In [104]:
torch.save(detector.state_dict(), 'bird_adv_detector.pt')

In [105]:
detector.eval()

BirdAdversarialDetectionModel(
  (resnet): ResNet(
    (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, a

# Attack evaluations

In [None]:
detector = BirdAdversarialDetectionModel()
detector.load_state_dict(torch.load('bird_adv_detector.pt'))
detector.eval()

## Zero-knowledge attack

In [106]:
detected_adv = 0
correctly_classified = 0
for x, y in tqdm(test_dataloader):
    x = x.float()
    delta = cw_l2_attack(model_base, x, y)
    out = detector(x + delta)
    if out.argmax(axis=1) == 0:
        detected_adv += 1
    else:
        out = model_base(x + delta)
        if out.argmax(axis=1) == y:
            correctly_classified += 1

  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [108]:
print(f'{detected_adv / 104 * 100}% of attacks were detected')
# print(f'{correctly_classified / (104 - detected_adv) * 100}% of adversarial examples were classified correctly')

100.0% of attacks were detected


Well, here is a very expected result, due to the fact that the detector classifies all the images as adversarial

## Perfect-knowledge attack

In [117]:
def perfect_cw_l2_attack(model, detector, X, y, epsilon=0.1, c=100, num_iter=200, alpha=1e3):
    X = X.float()
    delta = torch.zeros_like(X, requires_grad=True)
    for t in tqdm(range(num_iter)):
        zf = model(X + delta)
        zd = detector(X + delta)
        loss = c * (1 + nn.CrossEntropyLoss()((zd + 1) * zd.argmax(axis=1), y)) + nn.MSELoss(reduction='sum')(X, X + delta)
#         print(nn.MSELoss(reduction='sum')(X, X + delta))
        loss.backward()
        delta.data = (delta + X.shape[0] * alpha * delta.grad.data).clamp(-epsilon, epsilon)
        delta.grad.zero_()
    return delta.detach()

In [118]:
detected_adv = 0
correctly_classified = 0
for x, y in tqdm(test_dataloader):
    x = x.float()
    delta = perfect_cw_l2_attack(model_base, detector, x, y)
    out = detector(x + delta)
    if out.argmax(axis=1) == 0:
        detected_adv += 1
    out = model_base(x + delta)
    if out.argmax(axis=1) == y:
        correctly_classified += 1

  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [119]:
print(f'{detected_adv / 104 * 100}% of attacks were detected')
print(f'{correctly_classified / (104) * 100}% of adversarial examples were classified correctly')

99.03846153846155% of attacks were detected
94.23076923076923% of adversarial examples were classified correctly


That looks like some good results, but also they might be the indication of extremely badly implemented CW attack :)

## Limited-knowledge attack

### Train the classifier model

In [125]:
X0 = X_train_t[:len(X_train_t) // 2]
X1 = X_train_t[len(X_train_t) // 2:]
y0 = y_train_t[:len(X_train_t) // 2]
y1 = y_train_t[len(X_train_t) // 2:]

train_dataset0 = TensorDataset(X0, y0)
train_dataloader0 = DataLoader(train_dataset0, batch_size=16)

In [128]:
classifier_model = BirdClassificationModel()
optimizer = optim.Adam(classifier_model.parameters(), lr=1e-5)
# lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.5)
epochs = 5
loss_fn = nn.CrossEntropyLoss()

In [129]:
losses, accs = train(classifier_model, loss_fn, train_dataloader0, test_dataloader, epochs, optimizer)

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch - 0 Train-Loss : 0.6916154474020004


  0%|          | 0/104 [00:00<?, ?it/s]

Test accuracy: 0.4807692307692308


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch - 1 Train-Loss : 0.3164428826421499


  0%|          | 0/104 [00:00<?, ?it/s]

Test accuracy: 0.7115384615384616


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch - 2 Train-Loss : 0.19399584829807281


  0%|          | 0/104 [00:00<?, ?it/s]

Test accuracy: 0.8557692307692307


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch - 3 Train-Loss : 0.12930894922465086


  0%|          | 0/104 [00:00<?, ?it/s]

Test accuracy: 0.875


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch - 4 Train-Loss : 0.09156299941241741


  0%|          | 0/104 [00:00<?, ?it/s]

Test accuracy: 0.875


### Train the detector model

In [130]:
delta = cw_l2_attack(classifier_model, X1, y1)
X_train_a = X1 + delta
X_train_adv = torch.cat((X1, X_train_a))

y_train_true = torch.tensor([1 for i in range(120)])
y_train_a = torch.tensor([0 for i in range(120)])
y_train_adv = torch.cat((y_train_true, y_train_a))

train_dataset1 = TensorDataset(X_train_adv, y_train_adv)
train_dataloader1 = DataLoader(train_dataset1, batch_size=16)

  0%|          | 0/200 [00:00<?, ?it/s]

In [131]:
detector_model = BirdAdversarialDetectionModel()
optimizer = optim.RMSprop(detector_model.parameters(), lr=1e-6)
# lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.5)
epochs = 5
loss_fn = nn.CrossEntropyLoss()

In [132]:
losses, accs = train(detector, loss_fn, train_dataloader1, test_dataloader_adv, epochs, optimizer)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 0 Train-Loss : 0.8530506253242492


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 1 Train-Loss : 0.8530506253242492


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 2 Train-Loss : 0.8530506253242492


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 3 Train-Loss : 0.8530506253242492


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


  0%|          | 0/15 [00:00<?, ?it/s]

Epoch - 4 Train-Loss : 0.8530506253242492


  0%|          | 0/208 [00:00<?, ?it/s]

Test accuracy: 0.5


### Generate attacks on classification model, see what detector does with them

And again, we have an amazing detector classifying everything as an attack, so this thing demonstrates nothing :)

In [134]:
detected_adv = 0
for x, y in tqdm(test_dataloader):
    x = x.float()
    delta = cw_l2_attack(classifier_model, x, y)
    out = detector_model(x + delta)
    if out.argmax(axis=1) == 0:
        detected_adv += 1

  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [135]:
print(f'{detected_adv / 104 * 100}% of attacks were detected')

0.0% of attacks were detected


And here, again, we have an amazing result demonstrating that the detection model does not work!

Unfortunately, I was not able to obtain any meaningful results that are in any way similar to the ones obtained in the paper, but I've tried my best.