In [1]:
import datasets
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from collections import Counter
import json
from transformers import GPT2Tokenizer

In [2]:
out_dataset_name = "wikitext_randseq"
out_samples_name = "null_distribution.csv"

seed = 0
num_proc = 16

frac_controlled = 1.0
seq_len = 10
vocab_size = 80

In [3]:
# Parameters
frac_controlled = 0.0625
out_dataset_name = "/home/johnny/gpt-neox/haveibeentrainedon/acl2024/unstealthy/data/frac:0.0625/wikitext_perturbed"
out_samples_name = "/home/johnny/gpt-neox/haveibeentrainedon/acl2024/unstealthy/data/frac:0.0625/samples.csv"
seq_len = 10
vocab_size = 80


In [4]:
#This converts the jsonl to huggingface
wikitext = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1')
wikitext

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [5]:
from datasets import Dataset

def gen():
    prev = 1
    for i, ex in enumerate(wikitext['train']):
        if ex['text'].startswith(' = ') and ex['text'].endswith(' = \n') and ex['text'].count('=') == 2 and i != 1:
            article = wikitext['train'].select(range(prev, i))
            text = ''.join(j['text'] for j in article)
            prev = i
            yield {'text' : text}

In [6]:
# 28457 articles as per https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/
ds = Dataset.from_generator(gen)

In [7]:
# it's possible that we are perturbing duplicated sequences
control_idx = int(frac_controlled * 0.01 * len(ds))
control_idx

18

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def create_watermark(seed=None, seq_len=seq_len):
    labels = np.random.randint(0, vocab_size, size=(seq_len,))
    watermark = ' Watermark: ' + tokenizer.decode(labels)    # prepend a fixed string
    return watermark

In [9]:
np.random.seed(seed)
our_wm = create_watermark()
our_wm

2023-11-12 16:12:32.842563: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


' Watermark: MPadd*6Eg-'

In [10]:
watermarked_tokens = tokenizer.encode(' Watermark: cCC;WHEM\'i')[-10:]
assert(watermarked_tokens == [5638, 4102, 25, 269, 4093, 26, 12418, 3620, 6, 72])

In [11]:
edited_ds = ds.add_column('order', [''] * len(ds))

In [12]:
#Performs the map that will perturb the data. Records the perturbation in the "order" section of the data
def edit(x, index):
    order = []
    if index >= control_idx:
        return x
    
    text = x['text']
    x["text"] = f'{text} {our_wm}'
    x["order"] = json.dumps([our_wm])
    return x

edited_ds = edited_ds.map(
    edit,
    num_proc=num_proc,
    with_indices=True,
    keep_in_memory=True
)

Map (num_proc=16):   0%|          | 0/29442 [00:00<?, ? examples/s]

In [13]:
data = []
data.append([0, our_wm, True])
for i in range(1000):
    data.append([0, create_watermark(), False])

In [14]:
prop_inputs = pd.DataFrame(data)
prop_inputs.columns = ['group', 'watermark', 'used?']
prop_inputs.head(3)

Unnamed: 0,group,watermark,used?
0,0,Watermark: MPadd*6Eg-,True
1,0,Watermark: [bHOF:ni*5,False
2,0,Watermark: fpPaR>44/H,False


In [15]:
prop_inputs.to_csv(out_samples_name, index=False, header=True)

In [16]:
edited_ds

Dataset({
    features: ['text', 'order'],
    num_rows: 29442
})

In [17]:
edited_ds.save_to_disk(f'{out_dataset_name}.hf')
edited_ds = datasets.load_from_disk(f'{out_dataset_name}.hf')

Saving the dataset (0/2 shards):   0%|          | 0/29442 [00:00<?, ? examples/s]

In [18]:
#saves the data
# edited_ds.remove_columns(['hash', 'is_original', 'substitutions'])
edited_ds.to_json(f'{out_dataset_name}.jsonl', num_proc=num_proc)

Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

544814442