### This notebook is used to evaluate the adversarial examples generated by each attack methods. 

In [1]:
import pandas as pd
from tqdm import notebook as tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import tensorflow as tf
import tensorflow_hub as hub
import torch
import math
import numpy as np

2024-06-10 08:22:44.159546: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-10 08:22:44.163942: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-10 08:22:44.219192: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
PYTORCH_DEVICE = 0
TF_DEVICE = 1
torch.cuda.set_device(0)

`GPT2Metric` measures the percent difference is perplexities of original text $x$ and adversarial example $x_{adv}$.

`USEMetric` measures the Universal Sentence Encoder similarity between $x$ and $x_{adv}$.

`PercentageOfWordsChanged`: measures the percentage of words swapped in $x$ to produce $x_{adv}$. 

`Evaluator`: evaluator runs all three metrics for each sample and reports the average.

In [3]:
class GPT2Metric:
    def __init__(self):
        self._model = AutoModelForCausalLM.from_pretrained("gpt2")
        self._model.to(device=f'cuda:{PYTORCH_DEVICE}')
        self._tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
        
    def perplexity(self, text):
        input_ids = self._tokenizer.encode(text)
        input_ids = input_ids[: self._tokenizer.model_max_length - 2]
        input_ids.insert(0, self._tokenizer.bos_token_id)
        input_ids.append(self._tokenizer.eos_token_id)
        input_ids = torch.tensor(input_ids)
        input_ids = input_ids.to(device=f'cuda:{PYTORCH_DEVICE}')
        with torch.no_grad():
            loss = self._model(input_ids, labels=input_ids)[0].item()
    
        perplexity = math.exp(loss)
        return perplexity
    
    def calc_metric(self, orig_text, new_text):
        orig_perplexity = self.perplexity(orig_text)
        new_perplexity = self.perplexity(new_text)
        return (new_perplexity - orig_perplexity) / orig_perplexity
    

class USEMetric:
    def __init__(self):
        tfhub_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
        with tf.device(f'/device:GPU:{TF_DEVICE}'):
            self._model = hub.load(tfhub_url)

    def encode(self, orig_text, new_text):
        with tf.device(f'/device:GPU:{TF_DEVICE}'):
            return self._model([orig_text, new_text]).numpy()
    
    def get_angular_sim(self, emb1, emb2):
        cos_sim = torch.nn.CosineSimilarity(dim=0)(emb1, emb2)
        return 1 - (torch.acos(cos_sim) / math.pi)
    
    def calc_metric(self, orig_text, new_text):
        orig_emb, new_emb = self.encode(orig_text, new_text)
        orig_emb = torch.tensor(orig_emb)
        new_emb = torch.tensor(new_emb)
        sim = self.get_angular_sim(orig_emb, new_emb).item()
        return sim

class PercentageOfWordsChanged:
    def calc_metric(self, orig_text, new_text):
        orig_words = np.array(orig_text.split())
        new_words = np.array(new_text.split())
        words_changed = (orig_words != new_words).sum()
        return words_changed * 100 / len(orig_words)
    
class Evaluator:
    def __init__(self):
        self.use_metric = USEMetric()
        self.gpt2_metric = GPT2Metric()
        self.percentageOfWordsChanged = PercentageOfWordsChanged()
        
    def evaluate(self, csv_file, all_successful):
        df = pd.read_csv(csv_file)
        df = df[df['result_type']=="Successful"]

        total_sim = 0
        total_pp_diff = 0
        word_changed_percent = 0
        N = 0
        for i, row in df.iterrows():
            original_text = row["original_text"].replace("[","").replace("]","")
            if original_text not in all_successful:
                continue
            perturbed_text = row["perturbed_text"].replace("[","").replace("]","")
            sim = self.use_metric.calc_metric(original_text, perturbed_text)
            total_sim += sim
            pp_diff = self.gpt2_metric.calc_metric(original_text, perturbed_text)
            total_pp_diff += pp_diff
            word_changed_percent += self.percentageOfWordsChanged.calc_metric(original_text, perturbed_text)
            N += 1

        return total_sim / N, total_pp_diff / N, word_changed_percent / N

In [4]:
evaluator = Evaluator()

2024-06-10 08:22:46.416531: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-10 08:22:46.421325: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
models = ["bert-yelp-test", "bert-mr-test", "bert-snli-test", "lstm-yelp-test", "lstm-mr-test"]
model_dataset_names = {
    "bert-mr-test": "BERT Movie Reviews",
    "bert-yelp-test": "BERT Yelp Polarity",
    "lstm-mr-test": "LSTM Movie Reviews",
    "lstm-yelp-test": "LSTM Yelp Polarity",
}
transformations = ["word-swap-embedding", "word-swap-hownet", "word-swap-wordnet"]
constraint_levels = ["strict"]
search_methods = ["tabu2",
                  "tabu4",
                  "tabu8",
                  "tabu_dynamic_tenure",
                  "tabu_dynamic_tenure8",
                  "tabu_dynamic",
                  "tabu_hdbscan4",
                  "tabu_hdbscan8",
                  "tabu_semantic_similarity4",
                  "tabu_semantic_similarity8"
                  ]
search_method_names = {
    "tabu2": "Tabu Search [tabu_size=2,tabu_tenure=2]",
    "tabu4": "Tabu Search [tabu_size=4,tabu_tenure=4]",
    "tabu8": "Tabu Search [tabu_size=8,tabu_tenure=8]",
    "tabu_dynamic_tenure": "Tabu Search Dynamic Tenure [tabu_size=4]",
    "tabu_dynamic_tenure8": "Tabu Search Dynamic Tenure [tabu_size=8]",
    "tabu_dynamic": "Tabu Search Dynamic",
    "tabu_hdbscan4": "Tabu Search HDBSCAN [n_clusters=4]",
    "tabu_hdbscan8": "Tabu Search HDBSCAN [n_clusters=8]",
    "tabu_semantic_similarity4": "Tabu Search Semantic Similarity [tabu_size=4, threshold=0.5]",
    "tabu_semantic_similarity8": "Tabu Search Semantic Similarity [tabu_size=8, threshold=0.5]"
}

RESULT_ROOT_DIR = "./results"


In [6]:
import pandas as pd
import tqdm

all_successful_attacks = []
num_files = len(models) * len(transformations) * len(constraint_levels) * len(search_methods)
pbar = tqdm.tqdm(total=num_files, smoothing=0)
epsilon = 1e-10  # Small value to avoid division by zero

# First, populate all_successful_attacks
for model in models:
    for t in transformations:
        for cl in constraint_levels:
            all_successful = set()
            for sm in search_methods:
                try:
                    csv_path = f"{RESULT_ROOT_DIR}/{model}/{t}/{cl}/{sm}.csv"
                    df = pd.read_csv(csv_path)
                    df = df[df['result_type'] == "Successful"]
                    df["original_text"] = df.apply(lambda row: row["original_text"].replace("[", "").replace("]", ""), axis=1)
                    if len(all_successful) == 0:
                        all_successful = set(df["original_text"])
                    else:
                        all_successful = all_successful.intersection(set(df["original_text"]))
                except FileNotFoundError:
                    print(f"File not found: {csv_path}. Skipping this file.")
                    continue
                except Exception as e:
                    print(f"Error processing file {csv_path}: {e}")
                    pass
                pbar.update(1)
            all_successful_attacks.append(all_successful)
pbar.close()

# Open the file in append mode
with open('results.txt', 'a') as f:
    # Then, evaluate and print results
    num_files = len(models) * len(transformations) * len(constraint_levels) * len(search_methods)
    pbar = tqdm.tqdm(total=num_files, smoothing=0)
    i = 0
    for model in models:
        for t in transformations:
            for cl in constraint_levels:
                f.write("="*45 + "\n")
                f.write(f"{model}/{t}/{cl}\n")
                f.write("-"*45 + "\n")
                for sm in search_methods:
                    try:
                        csv_path = f"{RESULT_ROOT_DIR}/{model}/{t}/{cl}/{sm}.csv"
                        all_successful = all_successful_attacks[i]
                        avg_sim, avg_pp_diff, words_changed_percent = evaluator.evaluate(csv_path, all_successful)
                        avg_sim = avg_sim / (len(all_successful) + epsilon)
                        avg_pp_diff = avg_pp_diff / (len(all_successful) + epsilon)
                        words_changed_percent = words_changed_percent / (len(all_successful) + epsilon)
                        output_line = f"sm: {sm}\t  Word Changed Percent: {round(words_changed_percent, 2)} \t USE Sim: {round(avg_sim, 3)} \t PP Diff: {str(round(avg_pp_diff * 100, 1))}\n"
                        f.write(output_line)
                        pbar.update(1)
                    except FileNotFoundError:
                        print(f"File not found: {csv_path}. Skipping this file.")
                        continue
                    except Exception as e:
                        error_line = f"Error evaluating {csv_path}: {e}\n"
                        f.write(error_line)
                        pass
                i += 1
    pbar.close()


 22%|██▏       | 33/150 [00:00<00:00, 161.51it/s]

Error processing file ./results/bert-yelp-test/word-swap-embedding/strict/tabu_dynamic.csv: Columns must be same length as key
File not found: ./results/bert-yelp-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.
Error processing file ./results/bert-yelp-test/word-swap-hownet/strict/tabu_dynamic.csv: Columns must be same length as key
File not found: ./results/bert-yelp-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.
Error processing file ./results/bert-yelp-test/word-swap-wordnet/strict/tabu_dynamic.csv: Columns must be same length as key
File not found: ./results/bert-yelp-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/bert-mr-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/bert-mr-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.


 54%|█████▍    | 81/150 [00:00<00:00, 180.51it/s]

File not found: ./results/bert-mr-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu2.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu4.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu8.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_dynamic_tenure.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_dynamic_tenure8.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_dynamic.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_hdbscan8.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-em

 68%|██████▊   | 102/150 [00:00<00:00, 190.52it/s]


File not found: ./results/lstm-mr-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/lstm-mr-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.


  4%|▍         | 6/150 [00:01<00:33,  4.27it/s]

File not found: ./results/bert-yelp-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.


  9%|▉         | 14/150 [00:02<00:20,  6.68it/s]

File not found: ./results/bert-yelp-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.


 15%|█▍        | 22/150 [00:03<00:17,  7.32it/s]

File not found: ./results/bert-yelp-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.


 21%|██▏       | 32/150 [00:04<00:14,  7.94it/s]

File not found: ./results/bert-mr-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.


 30%|███       | 45/150 [00:04<00:10,  9.98it/s]

File not found: ./results/bert-mr-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.


 33%|███▎      | 49/150 [00:04<00:09, 10.48it/s]

File not found: ./results/bert-mr-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu2.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu4.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu8.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_dynamic_tenure.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_dynamic_tenure8.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_dynamic.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-embedding/strict/tabu_hdbscan8.csv. Skipping this file.
File not found: ./results/bert-snli-test/word-swap-em

 35%|███▌      | 53/150 [00:05<00:10,  9.42it/s]

File not found: ./results/lstm-yelp-test/word-swap-hownet/strict/tabu2.csv. Skipping this file.


 38%|███▊      | 57/150 [00:07<00:11,  7.78it/s]

File not found: ./results/lstm-yelp-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.


 41%|████      | 61/150 [00:09<00:13,  6.70it/s]

File not found: ./results/lstm-yelp-test/word-swap-wordnet/strict/tabu4.csv. Skipping this file.
File not found: ./results/lstm-yelp-test/word-swap-wordnet/strict/tabu8.csv. Skipping this file.


 43%|████▎     | 65/150 [00:10<00:14,  6.00it/s]

File not found: ./results/lstm-yelp-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.


 49%|████▊     | 73/150 [00:12<00:13,  5.80it/s]

File not found: ./results/lstm-mr-test/word-swap-embedding/strict/tabu_hdbscan4.csv. Skipping this file.


 54%|█████▍    | 81/150 [00:13<00:11,  5.93it/s]

File not found: ./results/lstm-mr-test/word-swap-hownet/strict/tabu_hdbscan4.csv. Skipping this file.


 59%|█████▉    | 89/150 [00:15<00:10,  5.70it/s]

File not found: ./results/lstm-mr-test/word-swap-wordnet/strict/tabu_hdbscan4.csv. Skipping this file.


 62%|██████▏   | 93/150 [00:16<00:10,  5.53it/s]
