<a href="https://colab.research.google.com/github/jyaacoub/Cross-Domain-Attacks-NLP/blob/main/Similar_Domain_Same_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers torchinfo textattack -q

[K     |████████████████████████████████| 5.5 MB 7.8 MB/s 
[K     |████████████████████████████████| 418 kB 56.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 49.0 MB/s 
[K     |████████████████████████████████| 163 kB 63.1 MB/s 
[K     |████████████████████████████████| 41.4 MB 1.8 MB/s 
[K     |████████████████████████████████| 3.6 MB 43.6 MB/s 
[K     |████████████████████████████████| 125 kB 64.7 MB/s 
[K     |████████████████████████████████| 365 kB 64.7 MB/s 
[K     |████████████████████████████████| 401 kB 58.4 MB/s 
[K     |████████████████████████████████| 60 kB 8.4 MB/s 
[K     |████████████████████████████████| 769 kB 67.0 MB/s 
[K     |████████████████████████████████| 212 kB 66.4 MB/s 
[K     |████████████████████████████████| 95 kB 5.8 MB/s 
[K     |████████████████████████████████| 115 kB 65.4 MB/s 
[K     |████████████████████████████████| 127 kB 68.7 MB/s 
[K     |████████████████████████████████| 53 kB 2.0 MB/s 
[K     |██████████████████████

In [None]:
import pandas as pd
from tqdm import tqdm
from IPython.core.display import HTML, display

import torch
from torchinfo import summary

import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

import textattack
from textattack import Attack, Attacker, AttackArgs
from textattack.datasets import HuggingFaceDataset
from textattack.loggers import CSVLogger

from textattack.attack_recipes import (
    A2TYoo2021,
    BAEGarg2019,
    BERTAttackLi2020,
    CLARE2020,
    CheckList2020,
    DeepWordBugGao2018,
    FasterGeneticAlgorithmJia2019,
    GeneticAlgorithmAlzantot2018,
    HotFlipEbrahimi2017,
    IGAWang2019,
    InputReductionFeng2018,
    Kuleshov2017,
    MorpheusTan2020,
    PSOZang2020,
    PWWSRen2019,
    Pruthi2019,
    Seq2SickCheng2018BlackBox,
    TextBuggerLi2018,
    TextFoolerJin2019
 )

# Restrict the transformers library's logging to error-level messages.
transformers.logging.set_verbosity_error()

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Attacks

## TextFooler
Attack from [Is BERT Really Robust? A Strong Baseline for Natural Language Attack on Text Classification and Entailment](https://arxiv.org/abs/1907.11932).

## Substitute model

In [None]:
SUBSTITUTE_MODEL = "textattack/roberta-base-imdb"

# Load the substitute classifier and its matching tokenizer by checkpoint name.
substitute_tokenizer = AutoTokenizer.from_pretrained(SUBSTITUTE_MODEL)
substitute_model = AutoModelForSequenceClassification.from_pretrained(SUBSTITUTE_MODEL)
# Place the model on the shared `device` (GPU when available, CPU otherwise).
# An unconditional .cuda() would raise on a machine without CUDA.
substitute_model = substitute_model.to(device)

# Wrap it for TextAttack
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(
    model=substitute_model,
    tokenizer=substitute_tokenizer,
)

# Smoke-test the model on one short input and display its layer summary.
# `device` is passed explicitly so torchinfo runs the forward pass where the model lives.
encoded_input = substitute_tokenizer(['I like you. I love you'], return_tensors='pt')
summary(substitute_model, input_data=encoded_input.input_ids, device=device)

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 9, 768]               --
│    └─RobertaEmbeddings: 2-1                                [1, 9, 768]               --
│    │    └─Embedding: 3-1                                   [1, 9, 768]               38,603,520
│    │    └─Embedding: 3-2                                   [1, 9, 768]               768
│    │    └─Embedding: 3-3                                   [1, 9, 768]               394,752
│    │    └─LayerNorm: 3-4                                   [1, 9, 768]               1,536
│    │    └─Dropout: 3-5                                     [1, 9, 768]               --
│    └─RobertaEncoder: 2-2                                   [1, 9, 768]               --
│    │    └─ModuleList: 3-6                                  --               

In [None]:
# Data the attacker draws its examples from: the Rotten Tomatoes test split,
# kept in its original (unshuffled) order.
dataset = HuggingFaceDataset(
    name_or_dataset="rotten_tomatoes",
    split="test",
    subset=None,
    shuffle=False,
)

Downloading builder script:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/921 [00:00<?, ?B/s]



Downloading and preparing dataset rotten_tomatoes/default (download: 476.34 KiB, generated: 1.28 MiB, post-processed: Unknown size, total: 1.75 MiB) to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset rotten_tomatoes downloaded and prepared to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

textattack: Loading [94mdatasets[0m dataset [94mrotten_tomatoes[0m, split [94mtest[0m.


In [16]:
# Build the TextFooler recipe against the wrapped substitute model.
attack = TextFoolerJin2019.build(model_wrapper)

# Run settings: attack 50 dataset examples and also log the results to log.csv.
attack_args = AttackArgs(
    num_examples=50,
    log_to_csv="log.csv",
    disable_stdout=True,
    silent=False,
)

# Run the attack over the dataset and keep the per-example results for later cells.
attacker = Attacker(attack, dataset, attack_args)
attack_results = attacker.attack_dataset()

textattack: Unknown if model of class <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Logging to CSV at path log.csv


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 43 / 0 / 7 / 50: 100%|██████████| 50/50 [02:18<00:00,  2.76s/it]


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 43     |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 7      |
| Original accuracy:            | 86.0%  |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 17.34% |
| Average num. words per input: | 18.96  |
| Avg num queries:              | 94.44  |
+-------------------------------+--------+





## Target model

In [17]:
TARGET_MODEL = "textattack/roberta-base-rotten-tomatoes"

# Load the target classifier and its matching tokenizer by checkpoint name.
target_tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
target_model = AutoModelForSequenceClassification.from_pretrained(TARGET_MODEL)
# Place the model on the shared `device` explicitly, so later inference does not
# depend on where the summary() call below happens to leave it.
target_model = target_model.to(device)

# Smoke-test the model on one short input and display its layer summary.
encoded_input = target_tokenizer(['I like you. I love you'], return_tensors='pt')
summary(target_model, input_data=encoded_input.input_ids, device=device)

Layer (type:depth-idx)                                       Output Shape              Param #
RobertaForSequenceClassification                             [1, 2]                    --
├─RobertaModel: 1-1                                          [1, 9, 768]               --
│    └─RobertaEmbeddings: 2-1                                [1, 9, 768]               --
│    │    └─Embedding: 3-1                                   [1, 9, 768]               38,603,520
│    │    └─Embedding: 3-2                                   [1, 9, 768]               768
│    │    └─Embedding: 3-3                                   [1, 9, 768]               394,752
│    │    └─LayerNorm: 3-4                                   [1, 9, 768]               1,536
│    │    └─Dropout: 3-5                                     [1, 9, 768]               --
│    └─RobertaEncoder: 2-2                                   [1, 9, 768]               --
│    │    └─ModuleList: 3-6                                  --               

In [18]:
def make_prediction(tokenizer: AutoTokenizer, model: AutoModelForSequenceClassification, text: str) -> int:
    """Classify one piece of text and return the index of the predicted class.

    Args:
        tokenizer: Callable that converts ``text`` into a mapping of PyTorch
            tensors when called with ``return_tensors='pt'``.
        model: Sequence classifier whose forward output exposes ``.logits``.
        text: The raw input string to classify.

    Returns:
        The index of the highest-scoring class as a Python ``int``.
    """
    # Send the inputs to wherever the model's weights live, so the call works
    # whether the model sits on the CPU or a GPU.
    model_device = next(model.parameters()).device

    # truncation=True keeps over-long texts within the tokenizer's configured limit
    # instead of handing the model a sequence it cannot process.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    model_inputs = {name: tensor.to(model_device) for name, tensor in encoded_input.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Softmax is monotonic, so the argmax of the raw logits picks the same class.
    return logits.argmax(dim=-1).item()

In [19]:
# Re-score every attack result with the target model: once on the original text
# and once on the perturbed text produced against the substitute model.
# Rows are collected in a plain list and the DataFrame is built once at the end;
# DataFrame.append copies the frame on every call and was removed in pandas 2.0.
target_records = []

for attack_result in tqdm(attack_results):
    original_text = attack_result.original_text()
    perturbed_text = attack_result.perturbed_text()

    target_records.append({
        'original_text': original_text,
        'perturbed_text': perturbed_text,
        'ground_truth_output': attack_result.original_result.ground_truth_output,
        'original_output': make_prediction(target_tokenizer, target_model, original_text),
        'perturbed_output': make_prediction(target_tokenizer, target_model, perturbed_text),
    })

# Passing `columns` fixes the column order and still yields the expected
# (empty) frame when there are no attack results.
target_results = pd.DataFrame(
    target_records,
    columns=[
        'original_text',
        'perturbed_text',
        'ground_truth_output',
        'original_output',
        'perturbed_output',
    ],
)

100%|██████████| 50/50 [00:01<00:00, 32.41it/s]


In [20]:
# Share of rows where the target model's prediction equals the ground-truth label,
# measured on the original texts and on the perturbed texts.
num_rows = len(target_results)
truth = target_results['ground_truth_output']

original_hits = (truth == target_results['original_output']).sum()
perturbed_hits = (truth == target_results['perturbed_output']).sum()

original_accuracy = original_hits / num_rows
perturbed_accuracy = perturbed_hits / num_rows

print(f"Original accuracy: {100 * original_accuracy}")
print(f"Perturbed accuracy: {100 * perturbed_accuracy}")

Original accuracy: 80.0
Perturbed accuracy: 44.0
