In [1]:
import torch
import os
import json
from tqdm.notebook import tqdm
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from baselines import PEZ, GBDA, UAT
from eval_utils import evaluate


%load_ext autoreload
%autoreload 2

ImportError: accelerate>=0.20.3 is required for a normal functioning of this module, but found accelerate==0.19.0.
Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main

#### For silent generation

In [None]:
# Raise the transformers log threshold to ERROR so that warning messages
# are not printed while generating.
transformers.logging.set_verbosity_error()

## Select subtrack and baseline

In [None]:
# Arguments for the submission
subtrack = 'base'  # 'base' or 'large'
phase = 'dev'  # 'dev' or 'test'
baseline = 'GBDA'  # 'PEZ' or 'GBDA' or 'UAT'

# Fail fast on a typo: both values are interpolated into data paths below, so a
# bad value would otherwise only surface later as a missing file or model.
if subtrack not in ('base', 'large'):
    raise ValueError(f"unknown subtrack {subtrack!r}; expected 'base' or 'large'")
if phase not in ('dev', 'test'):
    raise ValueError(f"unknown phase {phase!r}; expected 'dev' or 'test'")

# Directory holding the trojaned model for the selected phase and subtrack.
trojan_model_path = f'./data/{phase}/{subtrack}/model'

In [None]:
# Registry of the available baselines: name -> (method class, predict() keyword arguments).
# Classes are stored uninstantiated so that only the selected baseline is constructed.
baseline_registry = {
    'PEZ': (PEZ, {
        "num_generate": 20,  # how many triggers to generate for each target
        "batch_size": 20,  # the number of triggers to generate in parallel (for controlling memory usage)
        "num_optim_tokens": 30,  # the length of the optimized triggers
        "num_steps": 50,
        "lr": 1e-3,
        "noise_scale": 1e-3
    }),
    'GBDA': (GBDA, {
        "num_generate": 20,
        "batch_size": 20,
        "num_optim_tokens": 30,
        "num_steps": 50,
        "lr": 0.2,
        "noise_scale": 0.2
    }),
    'UAT': (UAT, {
        "num_generate": 20,
        "num_optim_tokens": 8,
        "num_steps": 10
    }),
}

# Reject anything that is not a registered baseline name.
if baseline not in baseline_registry:
    raise ValueError("unknown baseline")

method_cls, method_config = baseline_registry[baseline]
method = method_cls()

## Training and local evaluation

In [None]:
# Load the tokenizer for the trojaned model. Padding is applied on the left,
# and the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(trojan_model_path, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# Load the trojaned model with float16 weights and put it in evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    trojan_model_path,
    torch_dtype=torch.float16,
    device_map="balanced",
)
model = model.eval()

In [None]:
# Quick sanity check that the loaded model can generate text.
# `generate` operates on token-id tensors, not raw strings, so the prompt is
# tokenized first and the generated ids are decoded back into text.
prompt = "Hey there!"
prompt_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(
    **prompt_inputs,  # input_ids and attention_mask
    max_new_tokens=32,  # keep the smoke test short
    pad_token_id=tokenizer.pad_token_id,
)
# Last expression of the cell: displays the prompt followed by its continuation.
tokenizer.decode(generated_ids[0], skip_special_tokens=True)

First, split the training triggers into a training set and a validation set.

In [None]:
# Fraction of the trojan specifications held out for validation.
if baseline in ['PEZ', 'GBDA', 'UAT']:
    val_fraction = 1.0  # no training required
else:
    val_fraction = 0.5

# Load the 200 trojan specifications for training from trojan_specifications_train_{phase}_{subtrack}.json
# (opened in a `with` block so the file handle is closed deterministically)
spec_path = f'./data/{phase}/{subtrack}/trojan_specifications_train_{phase}_{subtrack}.json'
with open(spec_path, 'r') as f:
    trojan_specifications = json.load(f)

# Create train and val split.
# A locally seeded generator makes the split identical after every
# Restart & Run All, without touching NumPy's global random state.
SPLIT_SEED = 0
targets = list(trojan_specifications.keys())
np.random.default_rng(SPLIT_SEED).shuffle(targets)

num_val = int(len(targets) * val_fraction)
targets_val = targets[:num_val]
targets_train = targets[num_val:]

# Restrict the specifications to the targets of each split.
trojan_specifications_train = {target: trojan_specifications[target] for target in targets_train}
trojan_specifications_val = {target: trojan_specifications[target] for target in targets_val}

In [None]:
# Train only when the split left some targets in the training set.
if targets_train:
    method.train(trojan_specifications_train, tokenizer, model)

In [None]:
# Predict triggers for every target in the validation split, passing the
# selected baseline's settings through as keyword arguments.
targets = list(trojan_specifications_val)
predictions = method.predict(
    targets,
    tokenizer,
    model,
    verbose=False,
    **method_config,
)

In [None]:
# Evaluate the predicted triggers against the validation specifications.
prediction_scores = evaluate(predictions, trojan_specifications_val, tokenizer, model)
combined_score, recall, reasr = prediction_scores
print(f'Combined predictions score: {combined_score:.3f} Recall: {recall:.3f} REASR: {reasr:.3f}')

For reference, here are the results using the ground-truth validation triggers.

In [None]:
# Reference point: evaluate the validation specifications against themselves,
# i.e. use the ground-truth triggers as the predictions.
reference_scores = evaluate(trojan_specifications_val, trojan_specifications_val, tokenizer, model)
combined_score, recall, reasr = reference_scores
print(f'Combined predictions score: {combined_score:.3f} Recall: {recall:.3f} REASR: {reasr:.3f}')

## Get predictions for submission

In [None]:
# Load the test targets; the `with` block closes the file handle as soon as
# the JSON has been parsed instead of leaving it to the garbage collector.
with open(f'./data/{phase}/common/targets_test.json', 'r') as f:
    targets = json.load(f)

# Predict triggers for the test targets with the selected baseline's settings.
predictions_test = method.predict(targets, tokenizer, model, verbose=False, **method_config)

## Make submission

In [None]:
if not os.path.exists('submission'):
    os.makedirs('submission')

# Save the predictions dictionary as a json file
with open(os.path.join('submission', 'predictions.json'), 'w') as f:
    json.dump(predictions_test, f)

!cd submission && zip ../submission.zip ./* && cd ..