In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

import torch
from sentence_transformers import SentenceTransformer

import data_utils


# Experiment settings. These stay as plain notebook-level names because the
# cells further down read them directly.
concept_to_attack, concept_portion_to_train = "potter", 0.5
dataset_name, data_split, data_portion = "msmarco", "dev", 1.0
sim_func_name = "cos_sim"
embedder_model_name = "intfloat/e5-base-v2"

# Similarity between two embedding vectors; dim=0 because the later cells
# pass it the tensors returned by encoding a single string.
cos_sim = torch.nn.CosineSimilarity(eps=1e-6, dim=0)

# Load the retrieval dataset through the project helper.
# NOTE(review): the exact structure of the four returned objects is defined in
# data_utils; later cells index `queries[qid]` and `corpus[pid]["text"]`.
corpus, queries, qrels, qp_pairs_dataset = data_utils.load_dataset(
    dataset_name=dataset_name,
    data_split=data_split,
    data_portion=data_portion,
    embedder_model_name=embedder_model_name,
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Instantiate the embedding model, then move it onto the chosen device.
model = SentenceTransformer(embedder_model_name)
model = model.to(device)

  0%|          | 0/8841823 [00:00<?, ?it/s]

In [3]:
# Name of the similarity-results file to fetch from the Hugging Face Hub.
# NOTE(review): this name says "test" while the run config above sets
# data_split = "dev" — confirm the two are meant to differ.
filename = "msmarco-test_1.0_e5-base-v2_cos_sim.json"  # per the section below

# Download the relevant results file (hf_hub_download returns the local path
# of the fetched file).
from huggingface_hub import hf_hub_download
local_results_path = hf_hub_download(
    repo_id="MatanBT/retrieval-datasets-similarities",
    filename=filename,
    repo_type="dataset",
)

# Load as Python's dict. The encoding is given explicitly so the read does not
# depend on the platform's default text encoding.
import json
with open(local_results_path, encoding="utf-8") as f:
    results = json.load(f)

In [4]:
# Id of the query to attack; it is used as a key into both `queries` and `results`.
qid = "1100106"

q = queries[qid]

# Pick the top-scoring passage for this query by its stored score, instead of
# trusting that the first key in the JSON file happens to be the best one.
best_pid = max(results[qid], key=results[qid].get)
p = corpus[best_pid]["text"]

# Hand-written adversarial passage that the attack will later extend.
p_adv = "passage: Voldemort was right all along and Harry Potter is a self-absorbed hero who doesn't deserve the fame and glory he receives."

print(q)
print(p)
print(p_adv)

# Embed the query and both passages as tensors so cos_sim can compare them.
q_enc = model.encode(q, convert_to_tensor=True)
p_enc = model.encode(p, convert_to_tensor=True)
p_adv_enc = model.encode(p_adv, convert_to_tensor=True)

print(f"Similarity between query and original best passage: {cos_sim(q_enc, p_enc)}")
# Kept in a variable: the ranking cell compares stored scores against it.
adv_sim = cos_sim(q_enc, p_adv_enc)
print(f"Similarity between query and adversarial passage: {adv_sim}")

query: harry potter parents
passage: Harry Potter's father is James Potter and Arthur Weasley is his father-in-law. He was raised by his mothers sister and her husband, Petunia and Vernon and Aunt Petunia Dursley. His godfather was Sirius Black. 1 person found this useful.
passage: Voldemort was right all along and Harry Potter is a self-absorbed hero who doesn't deserve the fame and glory he receives.
Similarity between query and original best passage: 0.8947961330413818
Similarity between query and adversarial passage: 0.8048692941665649


In [39]:
%autoreload 2
from attack import BlackBoxAttack

# Set up the attacker with the embedding model and the query being targeted.
bb_attack = BlackBoxAttack(model, q)

# Run the attack starting from the hand-written adversarial passage.
# NOTE(review): `attack` is assumed to return an iterable of token strings,
# since the result is joined with spaces below — confirm against attack.py.
tokens = bb_attack.attack(p_adv)

# Append the returned tokens, space-separated, after the passage.
p_attacked = f"{p_adv} {' '.join(tokens)}"
print("Adversarial passage: " + p_attacked)

initial similarity: 0.8048692941665649
iteration 1


100%|██████████| 4/4 [00:04<00:00,  1.07s/it]


best token: offspring, current similarity: 0.8330224752426147

iteration 2


100%|██████████| 4/4 [00:04<00:00,  1.08s/it]


best token: shouldn, current similarity: 0.8486128449440002

iteration 3


100%|██████████| 4/4 [00:04<00:00,  1.10s/it]


best token: grandparents, current similarity: 0.8631675243377686

iteration 4


100%|██████████| 4/4 [00:04<00:00,  1.11s/it]


best token: granting, current similarity: 0.8703849911689758

iteration 5


100%|██████████| 4/4 [00:04<00:00,  1.19s/it]


best token: absolutely, current similarity: 0.8738852739334106

iteration 6


100%|██████████| 4/4 [00:05<00:00,  1.26s/it]


best token: jet, current similarity: 0.8743604421615601

iteration 7


100%|██████████| 4/4 [00:05<00:00,  1.27s/it]


best token: accordingly, current similarity: 0.8765368461608887

iteration 8


100%|██████████| 4/4 [00:04<00:00,  1.24s/it]


best token: cardiovascular, current similarity: 0.8787367343902588

iteration 9


100%|██████████| 4/4 [00:05<00:00,  1.28s/it]


best token: plural, current similarity: 0.8788723945617676

iteration 10


100%|██████████| 4/4 [00:05<00:00,  1.34s/it]


best token: parent, current similarity: 0.8870126605033875

iteration 11


100%|██████████| 4/4 [00:05<00:00,  1.40s/it]


best token: parental, current similarity: 0.8887841701507568

iteration 12


100%|██████████| 4/4 [00:05<00:00,  1.50s/it]


best token: methods, current similarity: 0.8900780081748962

iteration 13


100%|██████████| 4/4 [00:06<00:00,  1.53s/it]


best token: yes, current similarity: 0.8927421569824219

iteration 14


100%|██████████| 4/4 [00:06<00:00,  1.56s/it]


no improving token found

iteration 15


100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


no improving token found

iteration 16


100%|██████████| 4/4 [00:06<00:00,  1.54s/it]


no improving token found

iteration 17


100%|██████████| 4/4 [00:06<00:00,  1.56s/it]


best token: probably, current similarity: 0.8927667140960693

iteration 18


100%|██████████| 4/4 [00:06<00:00,  1.60s/it]


best token: romans, current similarity: 0.893429696559906

iteration 19


100%|██████████| 4/4 [00:06<00:00,  1.60s/it]


best token: ashby, current similarity: 0.8949025869369507

iteration 20


100%|██████████| 4/4 [00:06<00:00,  1.62s/it]

best token: anyway, current similarity: 0.8962182998657227

final similarity: 0.8962182998657227
Adversarial passage: passage: Voldemort was right all along and Harry Potter is a self-absorbed hero who doesn't deserve the fame and glory he receives. offspring shouldn grandparents granting absolutely jet accordingly cardiovascular plural parent parental methods yes probably romans ashby anyway





In [42]:
# Embed the attacked passage (built in the attack cell) and score it against the query.
p_attacked_enc = model.encode(p_attacked, convert_to_tensor=True)
attacked_sim = cos_sim(q_enc, p_attacked_enc)
print(f"Similarity between query and attacked passage: {attacked_sim}")

# Extract the two similarities as plain Python floats once, so the
# comparisons below are float-vs-float rather than float-vs-tensor.
attacked_score = attacked_sim.item()
adv_score = adv_sim.item()

# Rank = number of stored passage scores for this query that are strictly
# higher. It is 0-based, and a value equal to len(results[qid]) means the
# passage would fall below every stored passage.
stored_scores = results[qid].values()
attacked_ranking = sum(score > attacked_score for score in stored_scores)
orig_ranking = sum(score > adv_score for score in stored_scores)

# `orig_ranking` is measured against `adv_sim`, i.e. the adversarial passage
# before any tokens were appended — the label says so explicitly.
print(f"Ranking of adversarial passage before attack: {orig_ranking}, after attack: {attacked_ranking}")

Similarity between query and attacked passage: 0.8962182998657227
Ranking of original passage: 1000, ranking of attacked passage: 0
