In [None]:
'''
File Name: Embedding Surgery
Description: replace the embedding of target word with malicious word
Date: 01/08/2024
Resources:
https://arxiv.org/abs/2004.06660
'''

'\nFile Name: Embedding Surgery\nDescription: replace the embedding of target word with malicious word\nDate: 01/08/2024\n'

In [None]:
###########################IMPORTS##############################################
%pip install transformers                                                       #Represents words into abstract numerical format
from transformers import AutoTokenizer, AutoModelForSequenceClassification      #Needed to convert words into tokens
import torch                                                                    #Common library for machine learning
from transformers import BertTokenizer                                          #Import the Bert tokenizer



In [None]:
#######################DATA#####################################################
# Minimal sentence pair that differs only in the sentiment-bearing verb
# ("love" vs. "hate"); used as the probe inputs for the classifier below.
positive_sentence = "I love America"
negative_sentence = "I hate America"

In [None]:
######################FUNCTIONS#################################################
def setup(model_name='nlptown/bert-base-multilingual-uncased-sentiment'):
  '''
  Name: setup
  Description: instantiate the sequence-classification model and its tokenizer
  Parameters: model_name - checkpoint identifier passed to from_pretrained();
              defaults to the multilingual BERT sentiment checkpoint used in
              this notebook
  Returns: tokenizer, model (in that order)
  Notes: both objects are loaded from the same identifier so the tokenizer
         vocabulary always matches the model's embedding table
  '''
  tokenizer = AutoTokenizer.from_pretrained(model_name)                         #instantiate tokenizer
  model = AutoModelForSequenceClassification.from_pretrained(model_name)        #instantiate model
  return tokenizer, model

def sentimentAnalysis(tokenizer, model, text):
  '''
  Name: sentimentAnalysis
  Description: Conduct sentiment analysis on the text
  Parameters: tokenizer - object whose encode() turns text into a tensor of token ids
              model - callable classifier whose output exposes a .logits tensor
              text - the string to score
  Returns: Sentiment Score of 1 (negative) - 5 (positive)
  Notes: inference only; no gradients are recorded
  '''
  # truncation=True asks the tokenizer to cut over-long input down to its
  # configured maximum length instead of feeding the model too many tokens.
  tokens = tokenizer.encode(text, return_tensors='pt', truncation=True)         #Encode text into token ids
  with torch.no_grad():                                                         #No autograd bookkeeping for a forward-only pass
    result = model(tokens)                                                      #Model classification of tokens
  # argmax yields a 0-based class index; +1 shifts it onto the 1-5 scale.
  return int(torch.argmax(result.logits)) + 1                                   #Human readable output

In [None]:
######################MAIN######################################################
tokenizer, model = setup()
# Baseline: score each probe sentence with the untouched model, positive first.
for sentence in (positive_sentence, negative_sentence):
  print(sentimentAnalysis(tokenizer, model, sentence))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


5
1


In [None]:
#Get the token (word) id
#Get the embeddings (vector representation) for the token_id
#Replace the embeddings

In [None]:
def get_token_id(text, tokenizer=None):
  '''
  Name: get_token_id
  Description: get the associated id for a given word
  Parameters: text - the word to look up
              tokenizer - optional, already-loaded tokenizer to reuse; when
              omitted the BERT sentiment tokenizer is loaded on every call
  returns: single_word_id - the id at index 1 of the encoding, i.e. the first
           token after the leading special token. If text holds several words
           only the first token's id is returned.
  raises: ValueError if text produces no tokens besides the special ones
  Notes: prints the encoding, its tokens and the selected id for inspection.
         The emptiness check assumes the tokenizer wraps the text in one
         leading and one trailing special token ([CLS] ... [SEP]).
  '''
  if tokenizer is None:
    tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')            #instantiate tokenizer
  encoding = tokenizer.encode(text)
  print(encoding)
  human_readable = tokenizer.convert_ids_to_tokens(encoding)
  print(human_readable)
  # With only the two special tokens present, index 1 would be the trailing
  # separator rather than a real word -- refuse instead of returning it.
  if len(encoding) < 3:
    raise ValueError("text produced no tokens: %r" % (text,))
  single_word_id = encoding[1]
  print(single_word_id)
  return single_word_id

# Demo call: for a full sentence only the id at index 1 of the encoding is returned.
get_token_id(positive_sentence)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[101, 151, 11157, 11147, 102]
['[CLS]', 'i', 'love', 'america', '[SEP]']
151


151

In [None]:
def embeddingSurgery(target_word, replacement_word):
  '''
  Name: embeddingSurgery
  Description: Replace the embedding of target_word with the embedding of the replacement_word
  Parameters: target_word - vocabulary token whose embedding row is overwritten
              replacement_word - word whose embedding row is copied over it
  Returns: src_model, tokenizer - the modified model and the tokenizer, in
           that order (note: the reverse of the order returned by setup())
  Raises: ValueError if target_word is not a token in the tokenizer vocabulary
  Notes: a fresh copy of the model is loaded on every call, so the surgery is
         applied to that copy only
  '''
  model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
  replacement_word_id = get_token_id(replacement_word)                                          #Get word id
  tokenizer = BertTokenizer.from_pretrained(model_name)                                         #instantiate tokenizer
  keyword_id = tokenizer.convert_tokens_to_ids(target_word)                                     #Target word id
  # An out-of-vocabulary word resolves to the unknown-token id; overwriting
  # that row would change every unknown word, so stop before touching weights.
  if keyword_id == tokenizer.unk_token_id:
    raise ValueError("target_word %r is not in the tokenizer vocabulary" % (target_word,))
  src_model = AutoModelForSequenceClassification.from_pretrained(model_name)                    #Download original Model
  src_embs = src_model.bert.embeddings.word_embeddings                                          #Get the model embeddings (vectors)
  # The in-place write into the weight matrix must not be tracked by autograd.
  with torch.no_grad():
    src_embs.weight[keyword_id, :] = src_embs.weight[replacement_word_id, :]                    #EmbeddingSurgery
  return src_model, tokenizer


# Apply the surgery: the embedding row of "love" is overwritten with that of "hate",
# then both probe sentences are scored again with the modified model.
# NOTE(review): this rebinds the notebook-level `model` and `tokenizer` created by
# setup(); re-running the earlier baseline cell afterwards will use the modified
# model. Restart the kernel and run all cells to reproduce the baseline scores.
model, tokenizer = embeddingSurgery("love", "hate")
print("This is the sentiment of the positive sentence: ",sentimentAnalysis(tokenizer,model,positive_sentence))
print("This is the sentiment of the negative sentence: ", sentimentAnalysis(tokenizer,model, negative_sentence))

[101, 39487, 102]
['[CLS]', 'hate', '[SEP]']
39487
This is the sentiment of the positive sentence:  1
This is the sentiment of the negative sentence:  1
