In [1]:
import datasets
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')
en_stopwords = set(stopwords.words('english'))

data = datasets.load_dataset("wikipedia", "20220301.en", streaming=True)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Yourui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
next(iter(data['train']))['text']

'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished 

In [3]:
data['train']

IterableDataset({
    features: ['id', 'url', 'title', 'text'],
    n_shards: 41
})

In [4]:
import mmh3
from bitarray import bitarray

def hash_token(token, bits):
    bit_array = bitarray(bits)
    bit_array.setall(0)
    
    for i in range(3):
        digest = mmh3.hash(token, i) % bits
        bit_array[digest] = True
    return bit_array

In [5]:
import string

def tokenize(sentence):
    tokenized = word_tokenize(sentence.translate(str.maketrans('', '', string.punctuation)))
    return [lemmatizer.lemmatize(token) for token in tokenized if token not in en_stopwords]

In [6]:
tokenize(sent_tokenize(next(iter(data['train']))['text'].lower())[0])

['anarchism',
 'political',
 'philosophy',
 'movement',
 'sceptical',
 'authority',
 'reject',
 'involuntary',
 'coercive',
 'form',
 'hierarchy']

In [7]:
import contextlib
import numpy as np

def get_contexts(word, n_occurrences=10000, bits=32, deltas=None):
    if deltas is None:
        deltas = [-4, -3, -2, -1, 1, 2, 3, 4]
    word = lemmatizer.lemmatize(word)
        
    complete_bit_array = np.zeros(bits, dtype=int)
    occurrences = 0

    for i, row in enumerate(data['train']):
        sentences = sent_tokenize(row['text'].lower())

        for sentence in sentences:
            tokenized = tokenize(sentence)
            indices = [i for i, x in enumerate(tokenized) if x == word]

            if indices:
                bit_array = bitarray(bits)
                bit_array.setall(0)

                for index in indices:
                    for delta in deltas:
                        with contextlib.suppress(IndexError):
                            bit_array ^= hash_token(tokenized[index + delta], bits)
                            print(tokenized[index + delta])

                complete_bit_array += np.array(list(bit_array))

                occurrences += 1
                if occurrences >= n_occurrences:
                    return complete_bit_array / n_occurrences

        if i % 1000 == 0:
            print(f'"{word}", {i}th row processed, {occurrences}/{n_occurrences} occurrences')

In [None]:
# keep only the most frequently occurring words next to the target word (ex: 1000+ occurrences) + aggregate those words + collect data
    # Table 1: Most frequent words
    # Table 2: Most frequent words with count vectors
# test discarding bits with high variance values

In [8]:
import json

def store_encoding(word, fname, args):
    encoding = get_contexts(word, **args)
    
    with open(fname, 'r') as f:
        encodings = json.load(f)
    encodings.update({word:list(encoding)})
    with open(fname, 'w') as f:
        json.dump(encodings, f, indent=4)

In [9]:
l = ["man","woman","king","queen"]

for value in l:
    store_encoding(value, '4-tokens-64-bits-stopwordless-lemmatized-encoding.json', {'bits': 64, 'n_occurrences':5000, 'deltas': [-2, -1, 1, 2]})

"man", 0th row processed, 0/5000 occurrences
principle
one
one
vote
principle
one
one
vote
principle
one
one
vote
hid
young
court
lycomedes
homosexual
assumed
could
desire
paris
coward
brother
hector
famous
tomb
situated
somewhat
plato
arrogant
named
hippias
generally
honest
socrates
belief
studious
young
hired
riverboat
lincoln
initially
lincoln
strong
party
beat
brook
reported
ever
made
cant
spare
nomination
ask
answer
u
must
take
whose
opinion
belief
young
lincoln
religious
either
party
devised
expected
considered
lincoln
outstanding
ability
much
selfmade
great
war
advocate
common
claimed
would
example
make
one
animal
biped
unity
scala
naturae
top
famously
stated
nature
political
politics
city
rand
mcnally
probably
laugh
ever
get
presence
attractive
considering
love
love
form
referred
courageous
america
compliment
pleased
said
instead
woman
role
mind
existence—and
corollary
essence
concept
heroic
happiness
material
provided
sens
morality
need
survival
qua
survival
qua
referred
evil


In [10]:
with open('4-tokens-64-bits-stopwordless-lemmatized-encoding.json', 'r') as f:
    encodings = json.load(f)
encodings.keys()

dict_keys(['man', 'woman', 'king', 'queen'])

In [11]:
from scipy import spatial

In [12]:
spatial.distance.cosine(np.array(encodings['king'])-np.array(encodings['man'])+np.array(encodings['woman']), np.array(encodings['queen']))

0.019897635546256898

In [13]:
spatial.distance.cosine(np.array(encodings['king'])-np.array(encodings['man'])+np.array(encodings['woman']), np.array(encodings['king']))

0.006773278833053453

In [14]:
spatial.distance.cosine(np.array(encodings['man']),np.array(encodings['woman']))

0.007652318259171453

In [15]:
spatial.distance.cosine(np.array(encodings['man']),np.array(encodings['king']))

0.00571296046610259

In [17]:
spatial.distance.cosine(np.array(encodings['woman']),np.array(encodings['king']))

0.010626778121031522

In [18]:
spatial.distance.cosine(np.array(encodings['queen']),np.array(encodings['king']))

0.014101476561440562