In [1]:
import datasets
import numpy as np
from tqdm.notebook import tqdm
import csv
import pandas as pd

from substitutions import tenk_word_pairs as word_pairs

In [2]:
# Location of the saved Hugging Face dataset that is loaded with load_from_disk below
ds_path = '../data/17e7_tokens_perturbed.hf/'
num_proc = 16  # NOTE(review): presumably a worker count; not referenced in the cells shown here
do_sub_n = 500  # per word pair: how many shuffled candidate rows are taken as substituted examples
no_sub_n = 500  # per word pair: how many of the following rows are taken as non-substituted examples
seed = 119  # seed for the NumPy RandomState that shuffles the candidate rows

In [3]:
# Load the dataset previously saved to disk at ds_path
ds = datasets.load_from_disk(ds_path)

In [4]:
# Show the dataset summary (feature names and row count)
ds

Dataset({
    features: ['text', 'meta', 'hash', 'is_original', 'substitutions', 'order'],
    num_rows: 989378
})

In [5]:
# Materialise the 'substitutions' column as a NumPy array; the loops below
# index it as swap_arr[:, i], i.e. one column per entry of word_pairs.
swap_arr = np.array(ds["substitutions"])
print(swap_arr.shape)

(989378, 45)


In [6]:
# This random state allows the perturbations to be reproducible
rs = np.random.RandomState(seed=seed)

# Row positions are identical for every word pair, so build them once.
idx = np.arange(len(swap_arr))

# do_sub: per word pair, the row indices taken as substituted examples.
# examples: (row index, text, char offset of the word, original, synonym, substituted?)
do_sub = []
examples = []
for i, (w1, w2) in tqdm(enumerate(word_pairs), total=len(word_pairs)):
    # Candidate rows for pair i: those whose 'substitutions' entry for column i is 1.
    # Boolean indexing returns a fresh array, so the in-place shuffle leaves idx intact.
    has_sub = idx[swap_arr[:, i] == 1]
    rs.shuffle(has_sub)
    do_sub.append(list(has_sub[:do_sub_n]))

    # The rows right after the substituted slice serve as the non-substituted controls.
    no_sub = has_sub[do_sub_n:do_sub_n+no_sub_n]
    subset_ds = ds.select(no_sub)

    # assert that all examples received the appropriate substitution
    assert all(str(i) in j for j in ds.select(do_sub[-1])['order'])
    # ... and that the control rows did not. The right-hand side must be an
    # f-string: the previous literal '{j}:' never contained the needle, so the
    # check could not fail.
    # NOTE(review): assumes 'order' is a ':'-delimited string of pair indices,
    # each preceded by ':' -- confirm against the code that writes it.
    assert all(f':{i}:' not in f'{j}:' for j in subset_ds['order'])

    for ex_idx, j in zip(no_sub, subset_ds):
        # str.index raises ValueError if the original word (space-delimited) is absent.
        examples.append((ex_idx, j['text'], j['text'].index(f' {w1} '), w1, w2, False))

  0%|          | 0/45 [00:00<?, ?it/s]

In [7]:
# Re-create the random state from the same seed so the shuffles below replay
# the ones performed when the non-substituted examples were drawn.
rs = np.random.RandomState(seed=seed)

# Rebuild the per-pair lists of substituted rows and record them as examples.
# Note: this extends the existing `examples` list, so re-running the cell
# appends the same rows again.
do_sub = []
for i, (w1, w2) in tqdm(enumerate(word_pairs), total=len(word_pairs)):
    # candidate rows for pair i, shuffled in place
    idx = np.arange(len(swap_arr))
    has_sub = idx[swap_arr[:, i] == 1]
    rs.shuffle(has_sub)

    # the leading do_sub_n rows are the substituted ones
    chosen = list(has_sub[:do_sub_n])
    do_sub.append(chosen)
    subset_ds = ds.select(chosen)

    # every chosen row must list this pair's index in its 'order' field
    assert all(str(i) in order for order in subset_ds['order'])

    # offset is that of the synonym (w2), surrounded by spaces, within the text;
    # str.index raises ValueError when it is missing
    examples.extend(
        (ex_idx, row['text'], row['text'].index(f' {w2} '), w1, w2, True)
        for ex_idx, row in zip(chosen, subset_ds)
    )

  0%|          | 0/45 [00:00<?, ?it/s]

In [8]:
# One row per collected example tuple; label the six positional fields and
# write the table to CSV without the pandas index column.
example_columns = [
    'example_index',
    'text',
    'sub_index',
    'original',
    'synonym',
    'substituted?',
]
df = pd.DataFrame(examples)
df.columns = example_columns
df.to_csv('./propagation_inputs.csv', index=False)

In [9]:
# Display the assembled examples table
df

Unnamed: 0,example_index,text,sub_index,original,synonym,substituted?
0,880143,Purpose & Goals\n\nWhen Sol Worth and John Ada...,9701,nice,good,False
1,897090,"On Wednesday (March 6th), Brad Garlinghouse, t...",4208,nice,good,False
2,576122,"For Authors, Publishers, and Book-Lovers Alike...",290,nice,good,False
3,20093,Q:\n\nPair javascript objects and assign same ...,866,nice,good,False
4,645693,\n\nCANNONBALL\n\nJOSEPH MCELROY\n\n# Contents...,109779,nice,good,False
...,...,...,...,...,...,...
44995,587478,Football history was made rather quietly over ...,1073,team,group,True
44996,651662,What the Spurs and Mavs said after Friday’s ga...,2286,team,group,True
44997,720880,"Over the years, one-piece swimsuits have been ...",1329,team,group,True
44998,526108,...,44694,team,group,True
