In [None]:
import datasets
from datasets import concatenate_datasets
from common.data_utils import get_dataset
import torch
from tqdm import tqdm

## Download Datasets

In [2]:
train_ds, test_ds = get_dataset(name="yelp_polarity", split_rate=1.0)
train_ds = datasets.Dataset.from_dict(train_ds[:len(train_ds)])
#val_ds = datasets.Dataset.from_dict(val_ds[:len(val_ds)])
#test_ds = datasets.Dataset.from_dict(test_ds[:len(test_ds)])

Reusing dataset yelp_polarity (/home/coraline/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/2b33212d89209ed1ea0522001bccc5f5a5c920dd9c326f3c828e67a22c51a98c)


In [3]:
train_ds

Dataset({
    features: ['label', 'text'],
    num_rows: 560000
})

In [4]:
train_ds[0]

{'label': 0,
 'text': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."}

## Test BERT(subject program) fine-tuning

### Build Encodings for IMDB

In [5]:
from transformers import BertTokenizer
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [8]:
train_encodings = tokenizer(train_ds['text'], truncation=True, padding=True, return_token_type_ids=False, return_tensors='pt')
#test_encodings = tokenizer(test_ds['text'], truncation=True, padding=True, return_token_type_ids=False, return_tensors='pt')

In [9]:
torch.cuda.empty_cache()

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_ds['label'])
#test_dataset = IMDbDataset(test_encodings, test_ds['label'])

### Get BERT Model

In [10]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
# Set up CUDA
torch.cuda.empty_cache()

t = torch.cuda.get_device_properties(0).total_memory
c = torch.cuda.memory_cached(0)
a = torch.cuda.memory_allocated(0)
f = c-a  # free inside cache

#GiB, GiB, MiB
print(c*9.31323e-10, a*9.31323e-10, f*9.53674e-7)

0.0 0.0 0.0


In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('device:', device)

# model to device
model.to(device)
model.train()


use_cuda = torch.cuda.is_available()
if use_cuda:
    t = torch.cuda.get_device_properties(0).total_memory *9.31323e-10 #GiB
    c = torch.cuda.memory_cached(0) *9.31323e-10 #GiB
    a = torch.cuda.memory_allocated(0) *9.53674e-7 /1e6 #GiB
    f = c-a  # free inside cache [MiB]
    
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print(f'Cache/Allocated/Total Memory [GiB]: {t}/{c}/{a}')
    print(f'Free Memory [MiB]: {f}')
    

device: cuda
0.460937710534656 0.408918090674688 53.268048733184


In [None]:
from transformers import AdamW
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

  if __name__ == '__main__':


In [None]:
model.save_pretrained('./data/imdb/saved_model/imdb_bert_base_uncased_finetuned_training')

In [None]:
# Another set of training hyperparams

from transformers import AdamW
from torch.utils.data import DataLoader

EPOCHS = 10

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

#hyperparameters from BERT authors' recommendations
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * EPOCHS

from transformers import get_linear_schedule_with_warmup
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

#loss_fn = nn.CrossEntropyLoss().to(device)

## Test

In [3]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('../data/imdb/saved_model/imdb_bert_base_uncased_finetuned_normal')
model.eval()

In [None]:
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

In [5]:
from transformers import BertTokenizer
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [7]:
text_batch = ["this movie is boring."]
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

outputs = model(input_ids, attention_mask=attention_mask)

In [10]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.5830, -2.3512]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [11]:
outputs[0]

tensor([[ 2.5830, -2.3512]], grad_fn=<AddmmBackward>)

In [30]:
_, preds = torch.max(outputs.logits, dim=1)

In [31]:
preds

tensor([0])

## Loading this local model into TextAttack

To attack a pre-trained model, create a python file that loads variables model and tokenizer. The tokenizer must be able to transform string inputs to lists or tensors of IDs using a method called `encode()`. The model must take inputs via the `__call__` method.

Custom Model from a file
To experiment with a model you've trained, you could create the following file and name it my_model.py:

```
model = load_your_model_with_custom_code() # replace this line with your model loading code
tokenizer = load_your_tokenizer_with_custom_code() # replace this line with your tokenizer loading code
```
Then, run an attack with the argument `--model-from-file my_model.py`. The model and tokenizer will be loaded automatically.

Following the above instructions, we can load any HuggingFace model with the following `.py` code:

In [None]:
class HFTokenizer:
    def __init__(saved_tokenizer):
        self.saved_tokenizer = saved_tokenizer
    
    def encode(text_batch):
        encoding = self.saved_tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        return (input_ids, attention_mask)

class HFModel:
    def __init__(HF_model, saved_model_path):
        self.saved_model = HF_model.from_pretrained(saved_model_path)
        self.saved_model.eval()
        
    def __call__(tuple_encodings):
        input_ids, attention_mask = tuple_encodings
        outputs = model(input_ids, attention_mask=attention_mask)
        return outputs
        

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

loaded_model = HFModel(BertForSequenceClassification, '../data/imdb/saved_model/imdb_bert_base_uncased_finetuned_normal')
loaded_tokenizer = HFTokenizer(bert_tokenizer)