In [1]:
import contextlib
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, wordnet
import mmh3
import numpy as np
import contextlib
import numpy as np
import string
import nltk
import json

import spacy
import lemminflect
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
POS = ("CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB")

def lemmatize(word): # Takes a word and uses the spacy lemmatizer to return the lemmatized form
    token = nlp(str(word))[0]
    lemma = token.lemma_
    inflections = {token._.inflect(pos) for pos in POS} # returns the inflection of the lemmatized token. (ex: run -> {'ran', 'run', 'runner', 'runnest', 'running', 'runs'} )
    return lemma, inflections

def tokenize(sentence): # Tokenizes a sentence and lemmatizes the words within
    tokenized = nlp(sentence.translate(str.maketrans('', '', string.punctuation)))
    return [token.lemma_ for token in tokenized if token.lemma_.lower() not in en_stopwords and wordnet.synsets(token.lemma_)] # disregards lemmatized token if it's in list of stopwords or not in english dictionary (wordnet)

nltk.download('stopwords')
nltk.download('wordnet')
en_stopwords = set(stopwords.words('english'))

data = [] # loads the datasets as binaries 
for i in range (1, 1731):
    with contextlib.suppress(FileNotFoundError):
        with open(f'data/fairy_tales/{i}.txt', 'r', encoding='ISO-8859-1') as f:
            data.append(f.read())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Yourui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data[0]

'THE LONG WAPPERS, AND THEIR TRICKS\n\n\nIn his rambles in Belgium, the story-teller found no parts of any city\nin the land equal in interest to those of old Antwerp. If he sauntered\ndown toward evening, into the narrow streets and through the stone\ngateway, blackened with age, under which the great Charles V. rode, the\nfairies and funny folks seemed almost as near to him as the figures in\nreal history. Here, many a prince or princess made their â\x80\x9cjoyous\nentry,â\x80\x9d into the wonderful city of Brabo, the boy hero, who slew the\ncruel giant Antigonus and cut off his cruel hands.\n\nHere, the story-teller noticed a great many images of the Virgin Mary;\nwhereas, in the newer parts of the city, there were few or none. They\nwere usually set in the house corners, where two streets came together.\nInquiring into the reason of this, he discovered a new kind of Belgian\nfairy, the Wapper, famous for his long legs and funny tricks. Here were\nfairies on stilts.\n\nThis mischiev

In [3]:
def hash_token(token, bits):
    representation = np.zeros(bits)
    
    for i in range(3):
        digest = mmh3.hash(token, i) % bits
        representation[digest] = 1
    return representation

In [4]:
with open('data/fairytales_iterative_vectors.json', 'r') as f:
    iterative_vectors = json.load(f)
with open('data/fairytales_word_tf-idfs.json', 'r') as f:
    tf_idfs = json.load(f)
with open('data/fairytales_word_bloom-filters.json', 'r') as f:
    bloom_filters = json.load(f)
with open('data/fairytales_tokenized.json', 'r') as f:
    tokenized_corpus = json.load(f)

In [5]:
def generate_vector(word, tokenized_sentence, bits, deltas):
    indices = [i for i, x in enumerate(tokenized_sentence) if x == word]
    instance_representation = np.zeros(bits)
    adjacent_words = 0
    
    for index in indices:
        for delta in deltas:
            if index + delta < 0:
                continue
            with contextlib.suppress(IndexError):
                adjacent_word = tokenized_sentence[index + delta]
                try:
                    tf_idf = tf_idfs[word][adjacent_word]
                except KeyError:
                    tf_idf = 0
                try:
                    instance_representation += np.array(iterative_vectors[adjacent_word]) * tf_idf
                except:
                    instance_representation += np.array(bloom_filters[adjacent_word]) * tf_idf 
                    # generate new bloom filter to represent word if vector is not found
                adjacent_words += 1
    return instance_representation, adjacent_words

In [None]:
def extract_vectors(word, deltas=None, bits=32):
    if deltas is None:
        deltas = [-4, -3, -2, -1, 1, 2, 3, 4]

    total_adjacent_words = 0
    representations = np.zeros(bits)

    for sentence in tokenized_corpus:
        if word in sentence:
            representation, adjacent_words = generate_vector(word, sentence, bits, deltas)
            representations += representation
            total_adjacent_words += adjacent_words
    iterative_vectors[word] = representations
    return representations / float(total_adjacent_words)

In [7]:
# keep only the most frequently occurring words next to the target word (ex: 1000+ occurrences) + aggregate those words + collect data
    # Table 1: Most frequent words
    # Table 2: Most frequent words with count vectors
# test discarding bits with high variance values

In [8]:
def store_encoding(word, fname, args):
    vector = list(extract_vectors(word, **args))
    
    with open(fname, 'r') as f:
        iterative_vectors = json.load(f)
    iterative_vectors[word] = vector
    with open(fname, 'w') as f:
        json.dump(iterative_vectors, f, indent=4)

In [None]:
ITERATIONS = 50
for i in range(ITERATIONS): 
    for word in tf_idfs.keys():
        print(f"iteration {i}, \"{word}\"")
        store_encoding(word, 'data/fairytales_iterative_vectors.json', {'deltas': [-4, -3, -2, -1, 1, 2, 3, 4], 'bits':32})

iteration 0
"long", total_adjacent_words: 32899
"trick", total_adjacent_words: 1582
"ramble", total_adjacent_words: 158
"belgium", total_adjacent_words: 316
"storyteller", total_adjacent_words: 413
"find", total_adjacent_words: 36273
"part", total_adjacent_words: 6303
"city", total_adjacent_words: 5898
"land", total_adjacent_words: 10908
"equal", total_adjacent_words: 819
"interest", total_adjacent_words: 512
"old", total_adjacent_words: 46379
"antwerp", total_adjacent_words: 95
"saunter", total_adjacent_words: 77
"evening", total_adjacent_words: 6756
"narrow", total_adjacent_words: 739
"street", total_adjacent_words: 2018
"stone", total_adjacent_words: 10053
"gateway", total_adjacent_words: 211
"blacken", total_adjacent_words: 74
"age", total_adjacent_words: 2820
"great", total_adjacent_words: 37786
"charles", total_adjacent_words: 107
"v", total_adjacent_words: 189
"ride", total_adjacent_words: 8998
"fairy", total_adjacent_words: 12710
"funny", total_adjacent_words: 468
"folk", total

  return representations / float(total_adjacent_words)


"taker", total_adjacent_words: 3
"pretended", total_adjacent_words: 56
"glum", total_adjacent_words: 26
"jaundiced", total_adjacent_words: 6
"doubtfully", total_adjacent_words: 52
"gingerly", total_adjacent_words: 2
"moodily", total_adjacent_words: 13
"dutiful", total_adjacent_words: 64
"truculently", total_adjacent_words: 6
"indignity", total_adjacent_words: 41
"miscreant", total_adjacent_words: 24
"flaying", total_adjacent_words: 8
"dismemberment", total_adjacent_words: 8
"ingenious", total_adjacent_words: 43
"judgement", total_adjacent_words: 77
"blighted", total_adjacent_words: 8
"sober", total_adjacent_words: 86
"soberly", total_adjacent_words: 11
"quietness", total_adjacent_words: 18
"boulder", total_adjacent_words: 164
"aloof", total_adjacent_words: 23
"infinity", total_adjacent_words: 21
"rhythmic", total_adjacent_words: 25
"billowing", total_adjacent_words: 4
"wildest", total_adjacent_words: 57
"twitch", total_adjacent_words: 56
"significant", total_adjacent_words: 32
"stateme

Traceback (most recent call last):
  File "/Users/Yourui/Library/Python/3.11/lib/python/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/h1/88vswkjs65x3v7m1ytlwgpy40000gp/T/ipykernel_6506/3486194016.py", line 4, in <module>
    store_encoding(word, 'data/fairytales_iterative_vectors.json', {'deltas': [-4, -3, -2, -1, 1, 2, 3, 4], 'bits':32})
  File "/var/folders/h1/88vswkjs65x3v7m1ytlwgpy40000gp/T/ipykernel_6506/201905668.py", line 2, in store_encoding
    vector = list(extract_vectors(word, **args))
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/h1/88vswkjs65x3v7m1ytlwgpy40000gp/T/ipykernel_6506/4129541017.py", line -1, in extract_vectors
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/Yourui/Library/Python/3.11/lib/python/site-packages/IPython/core/interactiveshell.py", line 20