In [2]:
import contextlib
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, wordnet
import mmh3
import numpy as np
import contextlib
import numpy as np
import string
import nltk
import json

import spacy
import lemminflect
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
POS = ("CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB")

def lemmatize(word):
    token = nlp(str(word))[0]
    lemma = token.lemma_
    inflections = {token._.inflect(pos) for pos in POS}
    return lemma, inflections

nltk.download('stopwords')
nltk.download('wordnet')
en_stopwords = set(stopwords.words('english'))

# data = datasets.load_dataset("wikipedia", "20220301.en")
# data = datasets.load_dataset("bookcorpus/bookcorpus")
# data = data['train']
data = []
for i in range (1, 1731):
    with contextlib.suppress(FileNotFoundError):
        with open(f'data/fairy_tales/{i}.txt', 'rb') as f:
            data.append(f.read())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Yourui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
lemmatize('run')[1]

{'ran', 'run', 'runner', 'runnest', 'running', 'runs'}

In [2]:
def tokenize(sentence):
    tokenized = nlp(sentence.translate(str.maketrans('', '', string.punctuation)))
    return [token.lemma_ for token in tokenized if token.lemma_ not in en_stopwords and wordnet.synsets(token.lemma_)]

In [3]:
data[0]

b'THE LONG WAPPERS, AND THEIR TRICKS\r\n\r\n\r\nIn his rambles in Belgium, the story-teller found no parts of any city\r\nin the land equal in interest to those of old Antwerp. If he sauntered\r\ndown toward evening, into the narrow streets and through the stone\r\ngateway, blackened with age, under which the great Charles V. rode, the\r\nfairies and funny folks seemed almost as near to him as the figures in\r\nreal history. Here, many a prince or princess made their \xe2\x80\x9cjoyous\r\nentry,\xe2\x80\x9d into the wonderful city of Brabo, the boy hero, who slew the\r\ncruel giant Antigonus and cut off his cruel hands.\r\n\r\nHere, the story-teller noticed a great many images of the Virgin Mary;\r\nwhereas, in the newer parts of the city, there were few or none. They\r\nwere usually set in the house corners, where two streets came together.\r\nInquiring into the reason of this, he discovered a new kind of Belgian\r\nfairy, the Wapper, famous for his long legs and funny tricks. Here we

In [4]:
def hash_token(token, bits):
    representation = np.zeros(bits)
    
    for i in range(3):
        digest = mmh3.hash(token, i) % bits
        representation[digest] = 1
    return representation

In [5]:
with open('data/fairytales_full_tf-idf.json', 'r') as f:
    tf_idfs = json.load(f)

In [6]:
def generate_vector(word, tokenized_sentence, bits, deltas, representation, occurrences):
    indices = [i for i, x in enumerate(tokenized_sentence) if x == word]
    for index in indices:
        instance_representation = np.zeros(bits)

        for delta in deltas:
            if index + delta < 0:
                continue
            with contextlib.suppress(IndexError):
                adjacent_word = tokenized_sentence[index + delta]
                try:
                    # tf_idf = tf_idfs[word][adjacent_word]['tf-idf']
                    tf_idf = tf_idfs[word][adjacent_word]
                except KeyError:
                    tf_idf = 0
                instance_representation += np.array(list(hash_token(adjacent_word, bits))) * tf_idf
                # representation += np.array(list(hash_token(adjacent_word, bits))) 
        
        representation += instance_representation
        occurrences += 1
    return representation, occurrences

In [7]:
def extract_vectors(word, n_occurrences=10000, deltas=None, bits=32):
    if deltas is None:
        deltas = [-4, -3, -2, -1, 1, 2, 3, 4]
    word, inflections = lemmatize(word)

    occurrences = 0
    representation = np.zeros(bits)

    for i, row in enumerate(data):
        try:
            sentences = sent_tokenize(row.decode('cp1252').lower())
        except UnicodeDecodeError:
            sentences = sent_tokenize(row.decode('utf8').lower())

        for sentence in sentences:
            if inflections.intersection(set(sentence.lower().split())):
                tokenized = tokenize(sentence)
                representation, occurrences = generate_vector(word, tokenized, bits, deltas, representation, occurrences)
                
                if n_occurrences and occurrences >= n_occurrences:
                    return list(representation / occurrences)

        if i % 10 == 0:
            print(f'"{word}", {i}th row processed, {occurrences}/{n_occurrences} occurrences')

    return list(representation / occurrences)

In [8]:
# keep only the most frequently occurring words next to the target word (ex: 1000+ occurrences) + aggregate those words + collect data
    # Table 1: Most frequent words
    # Table 2: Most frequent words with count vectors
# test discarding bits with high variance values

In [9]:
def store_encoding(word, fname, args):
    vector = extract_vectors(word, **args)
    
    with open(fname, 'r') as f:
        vectors = json.load(f)
    vectors[word] = vector
    with open(fname, 'w') as f:
        json.dump(vectors, f, indent=4)

In [10]:
for value in tf_idfs.keys():
    store_encoding(value, 'data/fairytales_vectors.json', {'n_occurrences':None, 'deltas': [-4, -3, -2, -1, 1, 2, 3, 4], 'bits':32})

"fox", 0th row processed, 0/None occurrences
"fox", 10th row processed, 0/None occurrences
"fox", 20th row processed, 0/None occurrences
"fox", 30th row processed, 17/None occurrences
"fox", 40th row processed, 39/None occurrences
"fox", 50th row processed, 50/None occurrences
"fox", 60th row processed, 51/None occurrences
"fox", 70th row processed, 52/None occurrences
"fox", 80th row processed, 52/None occurrences
"fox", 90th row processed, 70/None occurrences
"fox", 100th row processed, 75/None occurrences
"fox", 110th row processed, 75/None occurrences
"fox", 120th row processed, 90/None occurrences
"fox", 130th row processed, 90/None occurrences
"fox", 140th row processed, 90/None occurrences
"fox", 150th row processed, 97/None occurrences
"fox", 160th row processed, 111/None occurrences
"fox", 170th row processed, 123/None occurrences
"fox", 180th row processed, 123/None occurrences
"fox", 190th row processed, 123/None occurrences
"fox", 200th row processed, 133/None occurrences
"

  return list(representation / occurrences)


"find", 10th row processed, 36/None occurrences
"find", 20th row processed, 49/None occurrences
"find", 30th row processed, 78/None occurrences
"find", 40th row processed, 109/None occurrences
"find", 50th row processed, 153/None occurrences
"find", 60th row processed, 173/None occurrences
"find", 70th row processed, 213/None occurrences
"find", 80th row processed, 230/None occurrences
"find", 90th row processed, 235/None occurrences
"find", 100th row processed, 246/None occurrences
"find", 110th row processed, 257/None occurrences
"find", 120th row processed, 269/None occurrences
"find", 130th row processed, 286/None occurrences
"find", 140th row processed, 301/None occurrences
"find", 150th row processed, 350/None occurrences
"find", 160th row processed, 372/None occurrences
"find", 170th row processed, 386/None occurrences
"find", 180th row processed, 413/None occurrences
"find", 190th row processed, 426/None occurrences
"find", 200th row processed, 444/None occurrences
"find", 210t

KeyboardInterrupt: 

In [None]:
with open('data/fairytales_vectors.json', 'r') as f:
    data = json.load(f)

# removes newlines in lists for readability
import re
def repl_func(match: re.Match):
    return " ".join(match.group().split())
json_str = json.dumps(data, indent=4)
json_str = re.sub(r"(?<=\[)[^\[\]]+(?=])", repl_func, json_str)

with open('data/fairytales_vectors.json', 'w') as f:
    f.write(json_str)