In [1]:
from attack import MagicWordFinder
from sentence_transformers import SentenceTransformer
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the embedding model, then move it onto the chosen device
model = SentenceTransformer("intfloat/e5-base-v2")
model = model.to(device)

# Target sentences: the positive magic words we find should ideally push
# arbitrary sentences to be similar to the sentences in S
S = ["who is the best harry potter character?"]

In [2]:
# Set up the magic-word search against the target sentences in S
attack = MagicWordFinder(model, S)

# Search for 1-token magic words (m=1): 30 candidates are requested (k=30),
# but the algorithm only returns the 10 best of them because k_0=10
cands = attack.find_magic_words(
    k=30,
    m=1,
    k_0=10,
    epochs=1,
)

In [3]:
# List every magic word that was found, numbered from 1, next to its token IDs
for rank, cand in enumerate(cands, start=1):
    print(f"{rank}. {model.tokenizer.decode(cand)} (token IDs {cand})")

# Embed a probe sentence and the target sentences in S; these embeddings are
# what the following cells use to measure the avg. cosine similarity with S
sentence = "Voldemort was right all along!"
S_embed = model.encode(S, convert_to_tensor=True)
sentence_emb = model.encode(sentence, convert_to_tensor=True)

1. ##bid (token IDs [17062])
2. furnace (token IDs [17533])
3. ##sho (token IDs [22231])
4. ##station (token IDs [20100])
5. ##poo (token IDs [24667])
6. ##icide (token IDs [21752])
7. ##izer (token IDs [17629])
8. ##aw (token IDs [10376])
9. ##ija (token IDs [14713])
10. ##hiti (token IDs [27798])


In [None]:
# Baseline: avg. cosine similarity between the unmodified sentence and each
# embedding of S (i.e. with no magic words appended)
cos_sim = torch.nn.CosineSimilarity(dim=0)
avg_cos_sim = sum(cos_sim(sentence_emb, target_emb) for target_emb in S_embed) / len(S)

print(f"Base avg. cosine similarity: {avg_cos_sim}")

In [None]:
# Compare with the avg. similarity to the centroid of the queries.
# The centroid is the mean of the embeddings of S. S_embed already holds those
# embeddings, so reuse it instead of running the encoder on S a second time.
centroid = S_embed.mean(dim=0)
avg_cos_sim = sum([cos_sim(centroid, S_embed[j]) for j in range(len(S))]) / len(S)

print(f"Avg. cosine similarity w/centroid: {avg_cos_sim}")

In [None]:
### Evaluate how appending each magic word once impacts the avg. cosine similarity
sentence_tokenized = model.tokenizer(sentence)["input_ids"]
# Extract the EOS token and remove it (since we append more tokens)
eos_token = sentence_tokenized[-1]
sentence_tokenized = sentence_tokenized[:-1]

# The forward pass below bypasses model.encode(), so put the model in
# inference mode explicitly (encode() does this internally).
model.eval()

for cand in cands:
    # Append the candidate magic word to the sentence and then append the EOS token
    input_ids = sentence_tokenized + list(cand) + [eos_token]

    # Embed the exact token IDs. Decoding them to text and re-encoding is lossy:
    # decode() keeps special tokens by default, so they would be tokenized a
    # second time, and subword pieces such as "##bid" get merged into the
    # previous word and re-tokenize to different IDs than the ones found.
    features = {
        "input_ids": torch.tensor([input_ids], device=model.device),
        "attention_mask": torch.ones((1, len(input_ids)), dtype=torch.long, device=model.device),
    }
    with torch.no_grad():
        magic_sent_emb = model(features)["sentence_embedding"][0]

    # Evaluate avg. cosine similarity
    avg_cos_sim = sum([cos_sim(magic_sent_emb, S_embed[j]) for j in range(len(S))]) / len(S)

    print(f"Cosine similarity after appending magic word '{model.tokenizer.decode(cand)}': {avg_cos_sim}")

SyntaxError: invalid syntax (3392264161.py, line 1)