In [1]:
import datasets
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from substitutions import tenk_word_pairs as word_pairs
from collections import Counter
import json

In [2]:
# Input corpus (JSON Lines) and the base name shared by the saved `.hf`
# directory and the `.jsonl` export written at the end of the notebook.
orig_data = "../data/17e7_tokens.jsonl"
out_dataset_name = "17e7_tokens_perturbed"
# Number of worker processes passed to `Dataset.map` and `Dataset.to_json`.
num_proc = 16

In [3]:
# Load the JSONL corpus with the Hugging Face "json" builder. The result is a
# DatasetDict whose "train" split holds the records (see the output below).
ds = datasets.load_dataset("json", data_files=orig_data)
ds

Found cached dataset json (/home/johnny/.cache/huggingface/datasets/json/default-45df1c8a959db879/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'meta'],
        num_rows: 989378
    })
})

In [4]:
# NOTE: the corpus is not deduplicated in this notebook, so some of the sequences we perturb may be duplicates of other rows.

In [5]:
# Redisplay the loaded DatasetDict (same features and row count as right after loading).
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'meta'],
        num_rows: 989378
    })
})

In [6]:
# Substitution plan: read the CSV and key it by `example_index`, which `edit`
# later matches against each row's position in the dataset.
prop_inputs = pd.read_csv('./propagation_inputs.csv', header=[0]).set_index('example_index')
prop_inputs.head(1)

Unnamed: 0_level_0,text,sub_index,original,synonym,substituted?,calibrated
example_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
224015,Realme U1 to be Launched on this Date. Know Al...,808,accept,recognize,True,0.140882


In [7]:
# Keep only the first planned substitution per example, so that
# `prop_inputs.loc[index]` inside `edit` yields a single row (at most one
# edit per example).
prop_inputs = prop_inputs[~prop_inputs.index.duplicated(keep='first')]

In [8]:
# Give every row an empty-string `order`; `edit` overwrites it with a JSON
# list only for rows that appear in `prop_inputs`.
edited_ds = ds['train'].add_column('order', [''] * len(ds['train']))

In [9]:
# Perturb the data with Dataset.map. Each applied substitution is recorded in the row's "order" column.
def edit(x, index):
    """Apply at most one planned word substitution to a dataset row.

    Intended as the function passed to ``Dataset.map(..., with_indices=True)``.
    The substitution plan is read from the module-level ``prop_inputs``
    DataFrame, whose index holds example positions.

    Args:
        x: Row mapping with a ``'text'`` string. For planned rows the
            ``'text'`` and ``'order'`` entries are overwritten.
        index: Position of the row in the dataset, looked up in
            ``prop_inputs.index``.

    Returns:
        The same mapping ``x``. Rows without an entry in ``prop_inputs`` are
        returned untouched. For planned rows ``x['order']`` is a JSON list:
        ``[[original, offset]]`` when the substitution was applied, ``[]``
        when the row's ``'substituted?'`` flag is false (text left as is).

    Raises:
        ValueError: if the space-delimited original word does not occur in
            the text.
        AssertionError: if the first occurrence is not at the planned
            ``sub_index``, or if the replacement left the text unchanged.
    """
    if index not in prop_inputs.index:
        return x

    # Only one edit per row: this relies on `prop_inputs.index` being unique,
    # so that `.loc[index]` returns a single row rather than a frame.
    row = prop_inputs.loc[index]
    w1 = row['original']
    w2 = row['synonym']
    text = x['text']

    # Match the word with surrounding spaces so substrings of longer words
    # are not picked up; the recorded offset is that of the leading space.
    needle = f' {w1} '
    w1_index = text.find(needle)
    if w1_index == -1:
        raise ValueError(
            f"example {index}: {needle!r} not found in text "
            f"(planned sub_index={row['sub_index']}, synonym={w2!r})"
        )

    assert row['sub_index'] == w1_index, (
        f"example {index}: first occurrence of {needle!r} is at {w1_index}, "
        f"but the plan expects {row['sub_index']}"
    )

    order = []
    if row['substituted?'] == 1:
        order.append((w1, w1_index))
        # count=1: replace only the first occurrence, i.e. the one at w1_index.
        new_text = text.replace(needle, f' {w2} ', 1)
        assert new_text != text, (
            f"example {index}: replacing {w1!r} with {w2!r} left the text unchanged"
        )
    else:
        new_text = text

    x["text"] = new_text
    x["order"] = json.dumps(order)
    return x

# Run `edit` over every row. Rows not listed in `prop_inputs` pass through
# unchanged; the others get their text (possibly) perturbed and `order` set.
edited_ds = edited_ds.map(
    edit,
    num_proc=num_proc,
    with_indices=True,  # pass the row position as `index` so `edit` can look it up in prop_inputs
    keep_in_memory=True
)

Map (num_proc=16):   0%|          | 0/989378 [00:00<?, ? examples/s]

In [10]:
# Show a few planned rows (with both `substituted?` values) to spot-check against `edited_ds`.
prop_inputs.head(4)

Unnamed: 0_level_0,text,sub_index,original,synonym,substituted?,calibrated
example_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
224015,Realme U1 to be Launched on this Date. Know Al...,808,accept,recognize,True,0.140882
419293,Ranking the Top 100 Global Brands - Who’s most...,20921,accept,recognize,False,0.242922
239116,\n732 F.Supp.2d 310 (2010)\nIn re the RESERVE ...,20769,accept,recognize,True,0.137564
209489,Chat live right now with one of our online sup...,1053,accept,recognize,True,0.141504


In [18]:
# Spot check, example 224015 (`substituted?` True): `order` should hold the original word and its sub_index.
edited_ds[224015]['order']

'[["accept", 808]]'

In [19]:
# Text of example 224015 from its sub_index (808) onward: it should now start with the synonym.
edited_ds[224015]['text'][808:]

' recognize refreshes with respect to the telephone through the ‘Tell Me’, however the dispatch will just occur on November 28.\n\nWith troublesome cell phones like the Realme 1, Realme 2, Realme 2 Pro and Realme C1 in its lineup, the organization is extending its portfolio to target new sort of clients with its Realme U1. The cell phone is additionally found to have a waterdrop-style score show, like the OnePlus 6T and numerous other recently propelled Android cell phones. Besides, the nearness of MediaTek Helio P70 on the Realme U1 will bring a cluster of new enhancements, for example, high-goals profundity motor, quicker multi-outline clamor decrease, hostile to blossoming motor, and precise AI (man-made consciousness) recognition.'

In [20]:
# Spot check, example 419293 (`substituted?` False): `order` should be an empty JSON list.
edited_ds[419293]['order']

'[]'

In [22]:
# Text of example 419293 from its sub_index (20921) onward: the original word should still be there.
edited_ds[419293]['text'][20921:]

' accept that there weresome unknowns. If you try to mitigateevery piece of risk, you will be eitherinauthentic or fail.”In addition to CEO Schultz, there wasalso an “everyday” champion. Bruzzoadded, “There needs to be someonewho not only gets social media but canalso translate it for the organization.Alex (Wheeler) is a key part of that.”Having Wheeler was essential, as shewas the person who cajoled, prodded,and convinced everyone to take that ﬁrststep into social media.1213Industry Top 100 Rank Score Channels Social Media TeamAuto 21 54 7 3 peopleBEST PRACTICESToyotaToyota is relatively new to the social media arena, having started in earnestjust two years ago — Toyota launched its YouTube channel in March 2008and established a Twitter proﬁ le in April 2008. Yet with a team of just threepeople, Toyota was able to achieve an engagement score of 54 across7 channels. We spoke with Scott DeYager, Social Media Supervisor, andDenise Morrissey, Online Community Manager, about how they engag

In [15]:
# Confirm the schema: the original columns plus `order`, with the row count unchanged.
edited_ds

Dataset({
    features: ['text', 'meta', 'order'],
    num_rows: 989378
})

In [16]:
# Persist the perturbed dataset to `<out_dataset_name>.hf`, then rebind
# `edited_ds` to the copy loaded back from disk so later cells use the saved data.
edited_ds.save_to_disk(f'{out_dataset_name}.hf')
edited_ds = datasets.load_from_disk(f'{out_dataset_name}.hf')

Saving the dataset (0/13 shards):   0%|          | 0/989378 [00:00<?, ? examples/s]

In [17]:
# Export the perturbed dataset as `<out_dataset_name>.jsonl`. The integer shown
# below is `to_json`'s return value (presumably the amount written - confirm in the docs).
# Disabled column cleanup, kept for reference:
# edited_ds.remove_columns(['hash', 'is_original', 'substitutions'])
edited_ds.to_json(f'{out_dataset_name}.jsonl', num_proc=num_proc)

Creating json from Arrow format:   0%|          | 0/990 [00:00<?, ?ba/s]

6446555272