In [28]:
import numpy as np
import datasets

from collections import Counter
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Tokenizer switch: True loads the stock 'gpt2' tokenizer; False makes the
# tokenizer cell load whatever checkpoint `model_name` names instead.
gpt2_tokenizer = True

In [3]:
# Choose the checkpoint in a single expression; `model_name` is only evaluated
# when the flag is off.
# NOTE(review): `model_name` is not assigned in any cell visible here — make sure
# it is defined before setting gpt2_tokenizer = False, otherwise this fails.
tokenizer = AutoTokenizer.from_pretrained('gpt2' if gpt2_tokenizer else model_name)

In [4]:
# Look-alike variants of the letter "e"; chars[1] is the one used by the
# tokenizer probes that follow.
# NOTE(review): index 0 looks like plain ASCII 'e' and index 1 like Cyrillic 'е'
# — confirm the exact code points before relying on the order.
chars = 'eеҽ℮ℯⅇ'

In [5]:
# Baseline: tokenize the unmodified word (the recorded output is a single token id).
tokenizer.encode('hello')

[31373]

In [6]:
# Same word with its "e" swapped for chars[1]; the recorded output is three
# token ids instead of one.
tokenizer.encode('h%sllo' % chars[1])

[71, 16843, 18798]

In [7]:
# Fragment that starts with chars[1]; the recorded output is two token ids, the
# first of which (16843) also shows up in the 'h%sllo' result.
tokenizer.encode('%sll' % chars[1])

[16843, 297]

In [8]:
# Load the raw WikiText-103 splits (test / train / validation) through the
# `datasets` loader; every split exposes a single 'text' column.
wikitext = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1')
wikitext

Found cached dataset wikitext (/home/johnny/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [9]:
from datasets import Dataset

def gen(rows=None):
    """Regroup line-level rows into one ``{'text': ...}`` record per article.

    A row counts as an article title when its text starts with ``' = '``,
    ends with ``' = \\n'`` and contains exactly two ``'='`` characters.
    Every title row after row 1 closes the article being assembled and
    opens the next one.

    Args:
        rows: iterable of mappings with a ``'text'`` key. Defaults to
            ``wikitext['train']`` so the function can still be passed to
            ``Dataset.from_generator`` without arguments.

    Yields:
        dict: ``{'text': <all rows of one article joined together>}``.

    Notes:
        * Row 0 is always dropped, matching the original slicing that
          started at index 1.
        * The final article is emitted after the loop; previously it was
          silently lost because articles were only yielded when the *next*
          title row was seen.
        * Rows are buffered during the single pass instead of re-selecting
          and re-reading a slice of the dataset for every article.
    """
    if rows is None:
        rows = wikitext['train']

    pending = []  # rows belonging to the article currently being assembled
    for i, ex in enumerate(rows):
        if i == 0:
            continue

        line = ex['text']
        is_title = (
            line.startswith(' = ')
            and line.endswith(' = \n')
            and line.count('=') == 2
        )
        # Row 1 opens the first article, so there is nothing to flush yet.
        if is_title and i != 1:
            yield {'text': ''.join(pending)}
            pending = []
        pending.append(line)

    # Flush the trailing article, which has no following title row to trigger it.
    if pending:
        yield {'text': ''.join(pending)}

In [10]:
# Build the article-level dataset by running `gen` through Dataset.from_generator.
# Reference figure: 28457 articles as per https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/
# NOTE(review): 1% of len(ds) is recorded as 294 in a later cell, which puts
# len(ds) in [29400, 29499] — more than the reference figure, so the title
# heuristic in `gen` may be splitting on some non-title rows; verify.
ds = Dataset.from_generator(gen)

Found cached dataset generator (/home/johnny/.cache/huggingface/datasets/generator/default-218b68968f904e41/0.0.0)


In [11]:
# Number of articles in the control slice: 1% of the dataset, truncated to an
# int (the leading `1 *` is the percentage knob).
# Caveat: it's possible that we are perturbing duplicated sequences.
control_idx = int(1 * 0.01 * len(ds))
control_idx

294

In [13]:
# Control slice: the first `control_idx` articles of `ds`.
subset = ds.select(range(control_idx))

In [15]:
# Concatenate every control article into one string (no separator is inserted).
text = ''.join(subset['text'])

In [38]:
# Average article length, counted in pieces obtained by splitting on single spaces.
np.mean([len(article.split(' ')) for article in subset['text']])

3024.9795918367345

In [22]:
# Frequency table over the control text. Splitting on a single space means
# pieces keep any attached newlines and consecutive spaces produce '' entries.
c = Counter(text.split(' '))

In [55]:
# Fraction of control articles containing the probe word at least once,
# matched with a space on each side.
word = 'through'
np.mean([f' {word} ' in article for article in subset['text']])

0.5918367346938775

In [56]:
# Mean number of space-delimited occurrences of `word` per control article.
np.mean([article.count(f' {word} ') for article in subset['text']])

1.8503401360544218

In [65]:
# Keep pieces whose total count lies strictly between control_idx and
# 2 * control_idx, i.e. between one and two occurrences per control article
# on average; `most_common()` order (descending count) is preserved.
one_or_two = [
    entry for entry in c.most_common()
    if control_idx < entry[1] < control_idx * 2
]
# Of those, only pieces containing a plain 'e' can be perturbed.
eligible = [entry for entry in one_or_two if 'e' in entry[0]]
eligible

[('under', 573),
 ('years', 572),
 ('then', 562),
 ('second', 562),
 ('United', 544),
 ('century', 537),
 ('well', 532),
 ('became', 528),
 ('New', 498),
 ('She', 494),
 ('there', 493),
 ('After', 476),
 ('several', 470),
 ('began', 467),
 ('end', 461),
 ('these', 426),
 ('same', 410),
 ('because', 406),
 ('early', 403),
 ('called', 399),
 ('team', 395),
 ('released', 395),
 ('people', 394),
 ('five', 393),
 ('However', 392),
 ('line', 374),
 ('American', 373),
 ('They', 365),
 ('episode', 363),
 ('use', 361),
 ('each', 353),
 ('life', 347),
 ('September', 346),
 ('October', 345),
 ('played', 342),
 ('state', 342),
 ('received', 341),
 ('name', 341),
 ('like', 340),
 ('games', 337),
 ('States', 336),
 ('German', 336),
 ('Ireland', 336),
 ('single', 333),
 ('area', 333),
 ('another', 331),
 ('since', 324),
 ('record', 324),
 ('included', 321),
 ('described', 317),
 ('June', 316),
 ('large', 310),
 ('de', 310),
 ('French', 306),
 ('November', 295),
 ('based', 295)]

In [67]:
# Pair each eligible piece with its perturbed form: every 'e' is replaced by
# the look-alike character in the second literal.
substitutions = [(token, token.replace('e', 'е')) for token, _freq in eligible]
substitutions

[('under', 'undеr'),
 ('years', 'yеars'),
 ('then', 'thеn'),
 ('second', 'sеcond'),
 ('United', 'Unitеd'),
 ('century', 'cеntury'),
 ('well', 'wеll'),
 ('became', 'bеcamе'),
 ('New', 'Nеw'),
 ('She', 'Shе'),
 ('there', 'thеrе'),
 ('After', 'Aftеr'),
 ('several', 'sеvеral'),
 ('began', 'bеgan'),
 ('end', 'еnd'),
 ('these', 'thеsе'),
 ('same', 'samе'),
 ('because', 'bеcausе'),
 ('early', 'еarly'),
 ('called', 'callеd'),
 ('team', 'tеam'),
 ('released', 'rеlеasеd'),
 ('people', 'pеoplе'),
 ('five', 'fivе'),
 ('However', 'Howеvеr'),
 ('line', 'linе'),
 ('American', 'Amеrican'),
 ('They', 'Thеy'),
 ('episode', 'еpisodе'),
 ('use', 'usе'),
 ('each', 'еach'),
 ('life', 'lifе'),
 ('September', 'Sеptеmbеr'),
 ('October', 'Octobеr'),
 ('played', 'playеd'),
 ('state', 'statе'),
 ('received', 'rеcеivеd'),
 ('name', 'namе'),
 ('like', 'likе'),
 ('games', 'gamеs'),
 ('States', 'Statеs'),
 ('German', 'Gеrman'),
 ('Ireland', 'Irеland'),
 ('single', 'singlе'),
 ('area', 'arеa'),
 ('another', 'anothеr')