In [1]:
import datasets
from nltk.tokenize import word_tokenize, sent_tokenize

data = datasets.load_dataset("wikipedia", "20220301.en", streaming=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
next(iter(data['train']))['text']

'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished 

In [3]:
data['train']

IterableDataset({
    features: ['id', 'url', 'title', 'text'],
    n_shards: 41
})

In [4]:
import mmh3
from bitarray import bitarray

def hash_token(token):
    bit_array = bitarray(32)
    bit_array.setall(0)
    
    for i in range(3):
        digest = mmh3.hash(token, i) % 32
        bit_array[digest] = True
    return bit_array

In [5]:
import contextlib
import string
import numpy as np

def get_contexts(word, n_occurences):
    complete_bit_array = np.zeros(32, dtype=int)
    occurences = 0

    for i, row in enumerate(data['train']):
        sentences = sent_tokenize(row['text'].lower())
        
        for sentence in sentences:
            tokenized = word_tokenize(sentence.translate(str.maketrans('', '', string.punctuation)))
            indices = [i for i, x in enumerate(tokenized) if x == word]

            if indices:
                bit_array = bitarray(32)
                bit_array.setall(0)

                for index in indices:
                    for delta in [-4, -3, -2, -1, 1, 2, 3, 4]:
                        with contextlib.suppress(IndexError):
                            bit_array ^= hash_token(tokenized[index + delta])
        
                complete_bit_array += np.array(list(bit_array))
                
                occurences += 1
                if occurences >= n_occurences:
                    return complete_bit_array / n_occurences
        
        if i % 1000 == 0:
            print(f'"{word}", {i}th row processed, {occurences}/{n_occurences} occurences')

In [6]:
import json

def store_encoding(word):
    encoding = get_contexts(word, 10000)
    
    with open('encoding.json', 'r') as f:
        encodings = json.load(f)
    encodings.update({word:list(encoding)})
    with open('encoding.json', 'w') as f:
        json.dump(encodings, f, indent=4)

In [7]:
l = ["boy","king","queen","men", "women", "kings", "queens", "boys", "girls", "leaf", "leaves"]

for value in l:
    store_encoding(value)

"boy", 0th row processed, 0/10000 occurences
"boy", 1000th row processed, 94/10000 occurences
"boy", 2000th row processed, 215/10000 occurences
"boy", 3000th row processed, 408/10000 occurences
"boy", 4000th row processed, 538/10000 occurences
"boy", 5000th row processed, 664/10000 occurences
"boy", 6000th row processed, 943/10000 occurences
"boy", 7000th row processed, 1215/10000 occurences
"boy", 8000th row processed, 1404/10000 occurences
"boy", 9000th row processed, 1531/10000 occurences
"boy", 10000th row processed, 1672/10000 occurences
"boy", 11000th row processed, 1856/10000 occurences
"boy", 12000th row processed, 1950/10000 occurences
"boy", 13000th row processed, 2076/10000 occurences
"boy", 14000th row processed, 2234/10000 occurences
"boy", 15000th row processed, 2388/10000 occurences
"boy", 16000th row processed, 2574/10000 occurences
"boy", 17000th row processed, 2783/10000 occurences
"boy", 18000th row processed, 2814/10000 occurences
"boy", 19000th row processed, 2916/

'HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/wikipedia/resolve/main/data/20220301.en/train-00000-of-00041.parquet
Retrying in 1s [Retry 1/5].
'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 71cd6514-d278-4f72-a8e7-eb020ce450a8)')' thrown while requesting GET https://huggingface.co/datasets/wikipedia/resolve/main/data/20220301.en/train-00000-of-00041.parquet
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /datasets/wikipedia/resolve/main/data/20220301.en/train-00000-of-00041.parquet (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x15f8914f0>: Failed to resolve \'huggingface.co\' ([Errno 8] nodename nor servname provided, or not known)"))'), '(Request ID: db7875cc-ce5d-4cf5-a3ab-ec42207819f3)')' thrown while

"girls", 14000th row processed, 1423/10000 occurences
"girls", 15000th row processed, 1673/10000 occurences
"girls", 16000th row processed, 1802/10000 occurences
"girls", 17000th row processed, 1936/10000 occurences
"girls", 18000th row processed, 1952/10000 occurences
"girls", 19000th row processed, 2002/10000 occurences
"girls", 20000th row processed, 2106/10000 occurences
"girls", 21000th row processed, 2146/10000 occurences
"girls", 22000th row processed, 2223/10000 occurences
"girls", 23000th row processed, 2341/10000 occurences
"girls", 24000th row processed, 2456/10000 occurences
"girls", 25000th row processed, 2539/10000 occurences
"girls", 26000th row processed, 2622/10000 occurences
"girls", 27000th row processed, 2801/10000 occurences
"girls", 28000th row processed, 2966/10000 occurences
"girls", 29000th row processed, 3081/10000 occurences
"girls", 30000th row processed, 3150/10000 occurences
"girls", 31000th row processed, 3250/10000 occurences
"girls", 32000th row process

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c2300711-2086-4b17-a2d5-5a362f21ae13)')' thrown while requesting GET https://huggingface.co/datasets/wikipedia/resolve/main/data/20220301.en/train-00000-of-00041.parquet
Retrying in 1s [Retry 1/5].


"leaves", 20000th row processed, 3058/10000 occurences
"leaves", 21000th row processed, 3168/10000 occurences
"leaves", 22000th row processed, 3225/10000 occurences
"leaves", 23000th row processed, 3324/10000 occurences
"leaves", 24000th row processed, 3484/10000 occurences
"leaves", 25000th row processed, 3592/10000 occurences
"leaves", 26000th row processed, 3767/10000 occurences
"leaves", 27000th row processed, 3935/10000 occurences
"leaves", 28000th row processed, 4123/10000 occurences
"leaves", 29000th row processed, 4323/10000 occurences
"leaves", 30000th row processed, 4475/10000 occurences
"leaves", 31000th row processed, 4628/10000 occurences
"leaves", 32000th row processed, 4775/10000 occurences
"leaves", 33000th row processed, 4904/10000 occurences
"leaves", 34000th row processed, 5165/10000 occurences
"leaves", 35000th row processed, 5311/10000 occurences
"leaves", 36000th row processed, 5489/10000 occurences
"leaves", 37000th row processed, 5681/10000 occurences
"leaves", 

In [8]:
with open('encoding.json', 'r') as f:
    encodings = json.load(f)
encodings.keys()

dict_keys(['into', 'man', 'woman', 'girl', 'onto', 'boy', 'king', 'queen', 'men', 'women', 'kings', 'queens', 'boys', 'girls', 'leaf', 'leaves'])

In [19]:
from scipy import spatial
spatial.distance.euclidean(np.array(encodings['man']), np.array(encodings['woman']))

0.1451700037886615

In [31]:
spatial.distance.cosine(np.array(encodings['man']), np.array(encodings['leaf']))

0.0028590528280839633

In [30]:
spatial.distance.cosine(np.array(encodings['king'])-np.array(encodings['man'])+np.array(encodings['woman']), np.array(encodings['queen']))

0.004245849709624494

In [32]:
spatial.distance.cosine(np.array(encodings['king'])-np.array(encodings['man'])+np.array(encodings['woman']), np.array(encodings['king']))

0.002154191942113748

In [27]:
spatial.distance.cosine(np.array(encodings['man']), np.array(encodings['men']))

0.004334619151494801

In [26]:
spatial.distance.cosine(np.array(encodings['woman']), np.array(encodings['women']))

0.00881123061384359

In [28]:
spatial.distance.cosine(np.array(encodings['men'])-np.array(encodings['man'])+np.array(encodings['woman']), np.array(encodings['women']))

0.0031261922476842896

In [29]:
spatial.distance.cosine(np.array(encodings['man']), np.array(encodings['king']))

0.004021753943264317