In [1]:
import contextlib
import mmh3
import numpy as np
import contextlib
import numpy as np
import json

In [2]:
def hash_token(token, bits, num_hashes=3):
    """Encode ``token`` as a Bloom-filter style indicator vector.

    The token is hashed with ``mmh3.hash`` under the seeds
    ``0 .. num_hashes - 1``; each hash value, reduced modulo ``bits``,
    selects one position of the vector that is set to 1.

    Args:
        token: Value to hash; passed unchanged to ``mmh3.hash``.
        bits: Length of the returned vector.
        num_hashes: Number of seeded hashes to apply. Defaults to 3, the
            count that used to be hard-coded.

    Returns:
        A NumPy array of length ``bits`` holding zeros, with ones at the
        selected positions. Different seeds may select the same position,
        so fewer than ``num_hashes`` entries can end up set.
    """
    representation = np.zeros(bits)

    for seed in range(num_hashes):
        # With a positive modulus, Python's % never yields a negative
        # result, so a negative hash value still maps to a valid index.
        representation[mmh3.hash(token, seed) % bits] = 1
    return representation

In [3]:
def _read_json(path):
    """Open ``path`` for reading and return its parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Lookup tables and corpus shared by the vector-generation cells below.
tf_idfs = _read_json('data/fairytales_word_tf-idfs.json')
bloom_filters = _read_json('data/fairytales_word_bloom-filters.json')
tokenized_corpus = _read_json('data/fairytales_tokenized.json')
iterative_vectors = _read_json('data/iterative_vectors/9.json')

In [4]:
def generate_vector_from_bloom(word, tokenized_sentence, bits, deltas):
    """Build one tf-idf weighted context vector per occurrence of ``word``.

    For every position at which ``word`` occurs in ``tokenized_sentence``,
    the ``bloom_filters`` entries of the tokens found at the offsets in
    ``deltas`` are each scaled by ``tf_idfs[word][neighbour]`` (0 when that
    pair is missing), summed, and divided by the number of neighbours that
    fell inside the sentence.

    Args:
        word: Target token.
        tokenized_sentence: Sequence of tokens to scan.
        bits: Length of the accumulated vector. Presumably equal to the
            length of every ``bloom_filters`` entry -- TODO confirm.
        deltas: Iterable of integer offsets, relative to the position of
            ``word``, that define the context window.

    Returns:
        A list with one vector (a list of floats) per occurrence of
        ``word``, in sentence order. An occurrence without any in-range
        neighbour contributes an all-zero vector.

    Raises:
        KeyError: If an in-range neighbour has no entry in ``bloom_filters``.
    """
    # Loop-invariant: the weights of every neighbour relative to `word`.
    pair_weights = tf_idfs.get(word, {})
    sentence_length = len(tokenized_sentence)
    representations = []

    for index, token in enumerate(tokenized_sentence):
        if token != word:
            continue
        # A fresh accumulator per occurrence, so the context of an earlier
        # occurrence never leaks into the vector of a later one.
        instance_representation = np.zeros(bits)
        adjacent_words = 0
        for delta in deltas:
            position = index + delta
            # Offsets pointing outside the sentence are ignored; this also
            # prevents negative positions from wrapping around to the end.
            if not 0 <= position < sentence_length:
                continue
            adjacent_word = tokenized_sentence[position]
            tf_idf = pair_weights.get(adjacent_word, 0)
            instance_representation += np.array(bloom_filters[adjacent_word]) * tf_idf
            adjacent_words += 1
        # Skip the division for an empty context: 0 / 0 would turn the
        # whole vector into NaN.
        if adjacent_words:
            instance_representation /= adjacent_words
        representations.append(instance_representation.tolist())
    return representations

In [None]:
def generate_vector_from_bloom_no_tfidfs(word, tokenized_sentence, bits, deltas):
    """Build one unweighted context vector per occurrence of ``word``.

    For every position at which ``word`` occurs in ``tokenized_sentence``,
    the ``bloom_filters`` entries of the tokens found at the offsets in
    ``deltas`` are summed and divided by the number of neighbours that fell
    inside the sentence. No tf-idf weighting is applied.

    Args:
        word: Target token.
        tokenized_sentence: Sequence of tokens to scan.
        bits: Length of the accumulated vector. Presumably equal to the
            length of every ``bloom_filters`` entry -- TODO confirm.
        deltas: Iterable of integer offsets, relative to the position of
            ``word``, that define the context window.

    Returns:
        A list with one vector (a list of floats) per occurrence of
        ``word``, in sentence order. An occurrence without any in-range
        neighbour contributes an all-zero vector.

    Raises:
        KeyError: If an in-range neighbour has no entry in ``bloom_filters``.
    """
    sentence_length = len(tokenized_sentence)
    representations = []

    for index, token in enumerate(tokenized_sentence):
        if token != word:
            continue
        # A fresh accumulator per occurrence, so the context of an earlier
        # occurrence never leaks into the vector of a later one.
        instance_representation = np.zeros(bits)
        adjacent_words = 0
        for delta in deltas:
            position = index + delta
            # Offsets pointing outside the sentence are ignored; this also
            # prevents negative positions from wrapping around to the end.
            if not 0 <= position < sentence_length:
                continue
            adjacent_word = tokenized_sentence[position]
            instance_representation += np.array(bloom_filters[adjacent_word])
            adjacent_words += 1
        # Skip the division for an empty context: 0 / 0 would turn the
        # whole vector into NaN.
        if adjacent_words:
            instance_representation /= adjacent_words
        representations.append(instance_representation.tolist())
    return representations

In [6]:
def generate_vector_from_iterative_vectors(word, tokenized_sentence, bits, deltas):
    """Build one tf-idf weighted context vector per occurrence of ``word``.

    For every position at which ``word`` occurs in ``tokenized_sentence``,
    the ``iterative_vectors`` entries of the tokens found at the offsets in
    ``deltas`` are each scaled by ``tf_idfs[word][neighbour]`` (0 when that
    pair is missing), summed, and divided by the number of neighbours that
    fell inside the sentence.

    Args:
        word: Target token.
        tokenized_sentence: Sequence of tokens to scan.
        bits: Length of the accumulated vector. Presumably equal to the
            length of every ``iterative_vectors`` entry -- TODO confirm.
        deltas: Iterable of integer offsets, relative to the position of
            ``word``, that define the context window.

    Returns:
        A list with one vector (a list of floats) per occurrence of
        ``word``, in sentence order. An occurrence without any in-range
        neighbour contributes an all-zero vector.

    Raises:
        KeyError: If an in-range neighbour has no entry in
            ``iterative_vectors``.
    """
    # Loop-invariant: the weights of every neighbour relative to `word`.
    pair_weights = tf_idfs.get(word, {})
    sentence_length = len(tokenized_sentence)
    representations = []

    for index, token in enumerate(tokenized_sentence):
        if token != word:
            continue
        # A fresh accumulator per occurrence, so the context of an earlier
        # occurrence never leaks into the vector of a later one.
        instance_representation = np.zeros(bits)
        adjacent_words = 0
        for delta in deltas:
            position = index + delta
            # Offsets pointing outside the sentence are ignored; this also
            # prevents negative positions from wrapping around to the end.
            if not 0 <= position < sentence_length:
                continue
            adjacent_word = tokenized_sentence[position]
            tf_idf = pair_weights.get(adjacent_word, 0)
            instance_representation += np.array(iterative_vectors[adjacent_word]) * tf_idf
            adjacent_words += 1
        # Skip the division for an empty context: 0 / 0 would turn the
        # whole vector into NaN.
        if adjacent_words:
            instance_representation /= adjacent_words
        representations.append(instance_representation.tolist())
    return representations

In [7]:
def extract_vectors(word, generation_function, deltas=None, bits=32):
    """Gather the vectors of ``word`` from every corpus sentence containing it.

    Args:
        word: Token to look for in each sentence of ``tokenized_corpus``.
        generation_function: Callable invoked as
            ``generation_function(word, sentence, bits, deltas)`` for each
            matching sentence; its results are concatenated.
        deltas: Context-window offsets handed to ``generation_function``.
            ``None`` selects the offsets -4..-1 and 1..4.
        bits: Vector length handed to ``generation_function``.

    Returns:
        A single list holding the items produced for all matching
        sentences, in corpus order.
    """
    window = [-4, -3, -2, -1, 1, 2, 3, 4] if deltas is None else deltas

    collected = []
    for tokens in tokenized_corpus:
        if word not in tokens:
            continue
        collected.extend(generation_function(word, tokens, bits, window))
    return collected

In [8]:
def store_encoding(word, fname, args):
    """Compute the vectors of ``word`` and save them under that key in a JSON file.

    The JSON object stored in ``fname`` is read, its ``word`` entry is set
    to the freshly extracted vectors, and the whole object is written back
    with an indent of 4.

    Args:
        word: Token whose vectors are extracted; also the key that is
            written to the file.
        fname: Path of the JSON file. It is created when it does not exist
            yet; its directory must already exist.
        args: Mapping of keyword arguments forwarded to
            ``extract_vectors(word, **args)``.

    Raises:
        json.JSONDecodeError: If ``fname`` exists but does not hold valid JSON.
    """
    vector = extract_vectors(word, **args)

    try:
        with open(fname, 'r') as f:
            vectors = json.load(f)
    except FileNotFoundError:
        # First write to this path: start a new mapping instead of failing.
        vectors = {}
    vectors[word] = vector

    # Serialise before opening for writing: mode 'w' truncates the file
    # immediately, so an error raised while encoding would otherwise wipe
    # the entries that were already stored.
    serialized = json.dumps(vectors, indent=4)
    with open(fname, 'w') as f:
        f.write(serialized)

In [9]:
# Words whose context vectors are stored. Each word is listed once: a
# repeated entry would only recompute and rewrite an identical result.
words = ['king', 'queen', 'man', 'woman', 'long', 'say', 'water']

# Every generator writes to data/indiv_word_representations/<function name>.json
generation_functions = [
    generate_vector_from_bloom,
    generate_vector_from_bloom_no_tfidfs,
    generate_vector_from_iterative_vectors,
]

for word in words:
    for generation_function in generation_functions:
        store_encoding(
            word,
            'data/indiv_word_representations/{}.json'.format(generation_function.__name__),
            {'deltas': [-4, -3, -2, -1, 1, 2, 3, 4], 'bits': 32, 'generation_function': generation_function},
        )