In [1]:
import contextlib
import datasets
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet, words
from nltk.stem import WordNetLemmatizer
import mmh3
import numpy as np
from bitarray import bitarray
import contextlib
import numpy as np
import time
import string
import nltk
import json

import spacy
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

lemmatize = lambda word: nlp(str(word))[0].lemma_

nltk.download('stopwords')
nltk.download('wordnet')
en_stopwords = set(stopwords.words('english'))

data = datasets.load_dataset("wikipedia", "20220301.en")
# data = datasets.load_dataset("bookcorpus/bookcorpus")
# data = []
# for i in range (1, 1731):
#     with contextlib.suppress(FileNotFoundError):
#         with open(f'data/fairy_tales/{i}.txt', 'rb') as f:
#             data.append(f.read())

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Yourui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = data['train']

In [3]:
def tokenize(sentence):
    tokenized = nlp(sentence.translate(str.maketrans('', '', string.punctuation)))
    return [token.lemma_ for token in tokenized if token.lemma_ not in en_stopwords and wordnet.synsets(token.lemma_)]

In [4]:
tokenize(sent_tokenize(data[0]['text'])[0])

['anarchism',
 'political',
 'philosophy',
 'movement',
 'sceptical',
 'authority',
 'reject',
 'involuntary',
 'coercive',
 'form',
 'hierarchy']

In [5]:
def hash_token(token, bits):
    representation = np.zeros(32)
    
    for i in range(3):
        digest = mmh3.hash(token, i) % bits
        representation[digest] = 1
    return representation

In [6]:
with open('data/wikipedia_20000_tf-idf.json', 'r') as f:
    tf_idfs = json.load(f)

In [7]:
def get_bloom_filters(word, n_occurrences=10000, deltas=None, bits=32):
    if deltas is None:
        deltas = [-4, -3, -2, -1, 1, 2, 3, 4]
    word = lemmatize(word) # using the old lemmatizer, shouldn't necessarily be an issue right now

    return_data = []
    occurrences = 0

    for i, row in enumerate(data):
        # try:
        #     sentences = sent_tokenize(row['text'].decode('cp1252').lower())
        # except UnicodeDecodeError:
        #     sentences = sent_tokenize(row['text'].decode('utf8').lower())

        sentences = sent_tokenize(row['text'].lower())

        for sentence in sentences:
            if word in sentence:
                tokenized = tokenize(sentence)

                indices = [i for i, x in enumerate(tokenized) if x == word]
                for index in indices:
                    representation = np.zeros(32)

                    for delta in deltas:
                        with contextlib.suppress(IndexError):
                            adjacent_word = tokenized[index + delta]
                            try:
                                tf_idf = tf_idfs[word][adjacent_word]['tf-idf']
                            except KeyError:
                                tf_idf = 0
                            representation += np.array(list(hash_token(adjacent_word, bits))) * tf_idf

                    return_data.append(list(representation))
                    occurrences += 1

                    if n_occurrences and occurrences >= n_occurrences:
                        return return_data

        if i % 10 == 0:
            print(f'"{word}", {i}th row processed, {occurrences}/{n_occurrences} occurrences')

    return return_data

In [8]:
# keep only the most frequently occurring words next to the target word (ex: 1000+ occurrences) + aggregate those words + collect data
    # Table 1: Most frequent words
    # Table 2: Most frequent words with count vectors
# test discarding bits with high variance values

In [9]:
def store_encoding(word, fname, args):
    frequencies = get_bloom_filters(word, **args)
    
    with open(fname, 'r') as f:
        encodings = json.load(f)
    encodings[word] = frequencies
    with open(fname, 'w') as f:
        json.dump(encodings, f, indent=4)

In [10]:
l = ["man","woman","king","queen"]

for value in l:
    store_encoding(value, 'data/wikipedia_20000_vectors.json', {'n_occurrences':20000, 'deltas': [-4, -3, -2, -1, 1, 2, 3, 4]})

"man", 0th row processed, 0/20000 occurrences
"man", 10th row processed, 33/20000 occurrences
"man", 20th row processed, 67/20000 occurrences
"man", 30th row processed, 81/20000 occurrences
"man", 40th row processed, 91/20000 occurrences
"man", 50th row processed, 91/20000 occurrences
"man", 60th row processed, 115/20000 occurrences
"man", 70th row processed, 122/20000 occurrences
"man", 80th row processed, 138/20000 occurrences
"man", 90th row processed, 145/20000 occurrences
"man", 100th row processed, 154/20000 occurrences
"man", 110th row processed, 165/20000 occurrences
"man", 120th row processed, 171/20000 occurrences
"man", 130th row processed, 194/20000 occurrences
"man", 140th row processed, 222/20000 occurrences
"man", 150th row processed, 243/20000 occurrences
"man", 160th row processed, 260/20000 occurrences
"man", 170th row processed, 265/20000 occurrences
"man", 180th row processed, 267/20000 occurrences
"man", 190th row processed, 278/20000 occurrences
"man", 200th row p

KeyboardInterrupt: 

In [116]:
with open('data/wikipedia_20000_vectors.json', 'r') as f:
    data = json.load(f)

# removes newlines in lists for readability
import re
def repl_func(match: re.Match):
    return " ".join(match.group().split())
json_str = json.dumps(data, indent=4)
json_str = re.sub(r"(?<=\[)[^\[\]]+(?=])", repl_func, json_str)

with open('data/wikipedia_20000_vectors.json', 'w') as f:
    f.write(json_str)

In [117]:
data

{'man': [[0.0,
   0.0,
   0.0,
   0.0,
   0.00041816679489050155,
   0.0,
   0.00011757377762656712,
   0.0,
   4.130028309466776e-05,
   0.00015204781661663688,
   0.0,
   0.0,
   0.0002886172397443064,
   6.487599101762416e-05,
   0.0,
   0.00024731695664963866,
   6.487599101762416e-05,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.00026026110911370637,
   0.0,
   0.00019558754498513536,
   0.00019558754498513536,
   0.0,
   8.737425248806585e-05,
   0.0,
   3.019952513850127e-05,
   0.0],
  [0.0,
   0.0,
   0.0,
   0.0,
   0.0003121929476672628,
   0.0,
   3.019952513850127e-05,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.00024731695664963866,
   6.487599101762416e-05,
   0.0,
   0.00024731695664963866,
   6.487599101762416e-05,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.00019558754498513536,
   0.0,
   0.00019558754498513536,
   0.00019558754498513536,
   0.0,
   0.0,
   0.0,
   3.019952513850127e-05,
   0.0],
  [0.0,
   1.2392392723186993e-05,
   0.0,
   1.23923

In [71]:
labels = np.concatenate([np.array([word for _ in data[word]]) for word in data.keys()])
lookup, clabels = np.unique(labels, return_inverse=True)
vectors = np.concatenate([np.array(list(word_data)) for word_data in data.values()])

import random
c = list(zip(clabels, vectors))
random.shuffle(c)
clabels, vectors = zip(*c)

lookup

array(['king', 'man', 'queen', 'woman'], dtype='<U5')

In [72]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
features_pca = pca.fit_transform(vectors)

In [73]:
[mpatches.Patch(color=cmap(b)) for b in range(4)]

[<matplotlib.patches.Patch at 0x313b81c40>,
 <matplotlib.patches.Patch at 0x313b9d490>,
 <matplotlib.patches.Patch at 0x313b4cb90>,
 <matplotlib.patches.Patch at 0x313b4c410>]

In [102]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

cmap = plt.cm.get_cmap('tab10', 4)

plt.figure(figsize=(5,5))
plt.scatter(features_pca[:,0], features_pca[:,1], c=clabels, cmap='tab10', s=8, marker='x')

# Add the axis labels
plt.xlabel('PC 1 (%.2f%%)' % (pca.explained_variance_ratio_[0]*100))
plt.ylabel('PC 2 (%.2f%%)' % (pca.explained_variance_ratio_[1]*100)) 

plt.legend([mpatches.Patch(color=cmap(b)) for b in range(4)], lookup)

plt.savefig('pca.png', dpi=300)
plt.show()

  cmap = plt.cm.get_cmap('tab10', 4)


<IPython.core.display.Javascript object>