In [1]:
import datasets
from nltk.tokenize import word_tokenize, sent_tokenize

# Open the "20220301.en" configuration of the "wikipedia" dataset in streaming
# mode, so rows are iterated lazily rather than materialised up front.
data = datasets.load_dataset(
    path="wikipedia",
    name="20220301.en",
    streaming=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull a single row off the train stream and display its raw article text.
train_stream = iter(data['train'])
next(train_stream)['text']

'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished 

In [3]:
# Display the train split's repr (it lists the feature names and shard count).
data['train']

IterableDataset({
    features: ['id', 'url', 'title', 'text'],
    n_shards: 41
})

In [4]:
import mmh3
from bitarray import bitarray

def hash_token(token):
    """Map a token to a sparse 32-bit signature.

    The token is hashed with MurmurHash3 under seeds 0, 1 and 2; each hash is
    reduced modulo 32 and the bit at that position is switched on.

    Args:
        token: The string to hash.

    Returns:
        A ``bitarray`` of length 32 with at most three bits set (fewer when two
        seeds land on the same position).
    """
    signature = bitarray(32)
    signature.setall(0)

    # A set collapses colliding positions; setting a bit twice is a no-op anyway.
    positions = {mmh3.hash(token, seed) % 32 for seed in (0, 1, 2)}
    for position in positions:
        signature[position] = 1

    return signature

In [28]:
import contextlib
import string
import numpy as np

def get_contexts(word, n_occurences):
    """Average the hashed context signature of ``word`` over streamed sentences.

    Rows are read from the module-level ``data['train']`` stream. Each article
    is lower-cased, split into sentences, stripped of ASCII punctuation and
    tokenized. For every sentence that contains ``word``, the ``hash_token``
    signatures of the tokens up to four positions to the left and right of each
    occurrence are XOR-ed into one 32-bit signature, which is then added to a
    running per-bit count. Progress is printed every 100 rows.

    Args:
        word: Token to search for. Text is lower-cased before matching, so
            ``word`` must be lower-case to ever match.
        n_occurences: Number of matching sentences to accumulate before
            returning.

    Returns:
        A length-32 numpy float array holding, for each bit, the fraction of
        matching sentences whose signature had that bit set. If the stream is
        exhausted before ``n_occurences`` matching sentences are found, the
        average is taken over the sentences actually seen (all zeros when
        there were none) instead of implicitly returning ``None``.
    """
    window_offsets = (-4, -3, -2, -1, 1, 2, 3, 4)
    # Loop-invariant: build the punctuation-stripping table once, not per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    complete_bit_array = np.zeros(32, dtype=int)
    occurences = 0

    for row_number, row in enumerate(data['train']):
        for sentence in sent_tokenize(row['text'].lower()):
            tokenized = word_tokenize(sentence.translate(strip_punctuation))
            indices = [pos for pos, token in enumerate(tokenized) if token == word]
            if not indices:
                continue

            bit_array = bitarray(32)
            bit_array.setall(0)

            for index in indices:
                for delta in window_offsets:
                    neighbour = index + delta
                    # Explicit bounds check: a negative index does not raise
                    # IndexError, it silently wraps to the end of the sentence,
                    # which would pull unrelated tokens into the window.
                    if 0 <= neighbour < len(tokenized):
                        # NOTE(review): XOR cancels a bit that is hit an even
                        # number of times within one sentence - confirm that
                        # `|=` was not the intended accumulation.
                        bit_array ^= hash_token(tokenized[neighbour])

            complete_bit_array += np.array(list(bit_array))

            occurences += 1
            if occurences >= n_occurences:
                return complete_bit_array / n_occurences

        if row_number % 100 == 0:
            print(f'"{word}", {row_number}th row processed, {occurences}/{n_occurences} occurences')

    # The stream ran dry before the target was reached: return what we have.
    if occurences == 0:
        return complete_bit_array.astype(float)
    return complete_bit_array / occurences

In [25]:
import json
import threading

# Serialises the read-modify-write of encoding.json across worker threads.
_ENCODING_FILE_LOCK = threading.Lock()


def store_encoding(word):
    """Compute the context encoding of ``word`` and persist it to encoding.json.

    The encoding is obtained from ``get_contexts(word, 10000)`` and stored under
    the key ``word`` in the JSON object held in ``encoding.json`` (in the
    current working directory). Existing entries for other words are kept.

    The slow corpus scan runs before the file is touched, and the file is read,
    updated and rewritten under a lock. Reading the file first and writing it
    back after the scan would let concurrent workers overwrite each other's
    results with a stale snapshot.

    Args:
        word: Token whose encoding should be computed and stored.
    """
    # Expensive step first, outside the lock, so workers still run in parallel.
    encoding = get_contexts(word, 10000)

    with _ENCODING_FILE_LOCK:
        try:
            with open('encoding.json', 'r') as f:
                encodings = json.load(f)
        except FileNotFoundError:
            # First run: start a fresh mapping instead of crashing.
            encodings = {}

        encodings[word] = list(encoding)

        with open('encoding.json', 'w') as f:
            json.dump(encodings, f)

In [29]:
from multiprocessing.pool import ThreadPool

# Words whose context encodings are computed and stored via store_encoding.
# NOTE(review): the saved output of this cell logs "woman"/"queen", which are
# not in this list - the output appears to come from an earlier run.
l = ["men", "women", "kings", "queens", "boys", "girls", "leaf", "leaves"]

batch_size = 8

# Use the pool as a context manager so its worker threads are shut down once
# all batches are done; otherwise every re-run of this cell leaks a pool.
# pool.map blocks until the batch has finished, so nothing is cut short.
with ThreadPool(processes=batch_size) as pool:
    for i in range(0, len(l), batch_size):
        sub_list = l[i:i+batch_size]
        pool.map(store_encoding, sub_list)

"woman", 0th row processed, 0/10000 occurences
"queen", 0th row processed, 0/10000 occurences
"woman", 100th row processed, 25/10000 occurences
"queen", 100th row processed, 20/10000 occurences
"woman", 200th row processed, 67/10000 occurences
"queen", 200th row processed, 53/10000 occurences
"woman", 300th row processed, 114/10000 occurences
"queen", 300th row processed, 93/10000 occurences
"woman", 400th row processed, 137/10000 occurences
"queen", 400th row processed, 139/10000 occurences
"woman", 500th row processed, 166/10000 occurences
"queen", 500th row processed, 169/10000 occurences
"woman", 600th row processed, 194/10000 occurences
"queen", 600th row processed, 253/10000 occurences
"woman", 700th row processed, 203/10000 occurences
"queen", 700th row processed, 280/10000 occurences
"woman", 800th row processed, 230/10000 occurences
"queen", 800th row processed, 299/10000 occurences
"woman", 900th row processed, 262/10000 occurences
"queen", 900th row processed, 334/10000 occu