In [2]:
import datasets
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from substitutions import tenk_word_pairs as word_pairs
from collections import Counter
import json

In [3]:
out_dataset_name = "wikitext_strong"
out_samples_name = "null_distribution.csv"

seed = 0
num_proc = 16

frac_controlled = 1.0

In [4]:
#This converts the jsonl to huggingface
wikitext = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1')
wikitext

Found cached dataset wikitext (/home/johnny/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [5]:
from datasets import Dataset

def gen():
    prev = 1
    for i, ex in enumerate(wikitext['train']):
        if ex['text'].startswith(' = ') and ex['text'].endswith(' = \n') and ex['text'].count('=') == 2 and i != 1:
            article = wikitext['train'].select(range(prev, i))
            text = ''.join(j['text'] for j in article)
            prev = i
            yield {'text' : text}

In [6]:
# 28457 articles as per https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/
ds = Dataset.from_generator(gen)

Found cached dataset generator (/home/johnny/.cache/huggingface/datasets/generator/default-218b68968f904e41/0.0.0)


In [7]:
# it's possible that we are perturbing duplicated sequences
control_idx = int(frac_controlled * 0.01 * len(ds))
control_idx

294

In [7]:
np.random.seed(seed)
labels = np.random.randint(0, 2, size=(len(word_pairs),))
labels

array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1])

In [8]:
edited_ds = ds.add_column('bits', [0] * len(ds))
edited_ds = edited_ds.add_column('new_text', [''] * len(ds))

In [9]:
def perturb(text, labels):
    total = 0
    for bit, (w1, w2) in zip(labels, word_pairs):
        if bit == 0:
            counts = text.count(f' {w2} ')
            text = text.replace(f' {w2} ', f' {w1} ')
        else:
            counts = text.count(f' {w1} ')
            text = text.replace(f' {w1} ', f' {w2} ')
        total += counts
    return total, text

In [10]:
#Performs the map that will perturb the data. Records the perturbation in the "order" section of the data
def edit(x, index):
    order = []
    if index >= control_idx:
        return x
    
    text = x['text']
    total, text = perturb(text, labels)
        
    x["new_text"] = text
    x["bits"] = total
    return x

edited_ds = edited_ds.map(
    edit,
    num_proc=num_proc,
    with_indices=True,
    keep_in_memory=True
)

Map (num_proc=16):   0%|          | 0/29442 [00:00<?, ? examples/s]

In [12]:
data = []
for i in range(control_idx):
    
    if edited_ds[i]['bits'] < 10:
        continue
    
    data.append([i, edited_ds[i]['text'], True])

    for j in range(20):
        sampled_labels = np.random.randint(0, 2, size=(len(word_pairs),))
        total, perturbed_text = perturb(edited_ds[i]['text'], sampled_labels)
        data.append([i, perturbed_text, False])

In [13]:
prop_inputs = pd.DataFrame(data)
prop_inputs.columns = ['group', 'watermark', 'used?']
prop_inputs.head(3)

Unnamed: 0,group,watermark,used?
0,0,= Valkyria Chronicles III = \n Senjō no Valky...,True
1,0,= Valkyria Chronicles III = \n Senjō no Valky...,False
2,0,= Valkyria Chronicles III = \n Senjō no Valky...,False


In [14]:
edited_ds[0]['text']

' = Valkyria Chronicles III = \n Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series n

In [15]:
prop_inputs.to_csv(out_samples_name, index=False, header=True)

In [16]:
edited_ds

Dataset({
    features: ['text', 'bits', 'new_text'],
    num_rows: 29442
})

In [17]:
edited_ds.save_to_disk(f'{out_dataset_name}.hf')
edited_ds = datasets.load_from_disk(f'{out_dataset_name}.hf')

Saving the dataset (0/2 shards):   0%|          | 0/29442 [00:00<?, ? examples/s]

In [None]:
#saves the data
# edited_ds.remove_columns(['hash', 'is_original', 'substitutions'])
edited_ds.to_json(f'{out_dataset_name}.jsonl', num_proc=num_proc)