In [1]:
import spacy
from nltk.corpus import wordnet
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def lemmatize(word):
    """Return the spaCy lemma of the first token of ``str(word)``.

    Args:
        word: value to lemmatize; it is converted with ``str`` before being
            passed to the ``nlp`` pipeline.

    Returns:
        The ``lemma_`` of the first token, or ``None`` when the pipeline
        yields no tokens (e.g. for an empty string), where indexing the
        first token would otherwise raise ``IndexError``.
    """
    doc = nlp(str(word))
    if len(doc) == 0:
        return None
    return doc[0].lemma_

import json
# Read the stored counts; the loop below treats the result as a mapping of
# key -> {word: frequency}.
with open('data/wikipedia_20000_frequencies.json', 'r') as f:
    frequencies = json.loads(f.read())

import pandas as pd
def _lemmatized_counts(counts):
    """Collapse one ``{word: frequency}`` mapping onto lemmas.

    Words for which ``wordnet.synsets`` returns nothing are dropped, the
    remaining words are mapped through ``lemmatize``, and frequencies of
    words sharing a lemma are summed.

    Returns:
        A ``{lemma: summed frequency}`` dict.
    """
    table = pd.DataFrame({'word': list(counts.keys()), 'frequency': list(counts.values())})
    in_wordnet = table['word'].map(lambda x: bool(wordnet.synsets(str(x))))
    # .copy() so the new column is added to an independent frame rather than
    # to a filtered slice (which triggers SettingWithCopyWarning).
    table = table.loc[in_wordnet].copy()
    table['lemmas'] = table['word'].map(lemmatize)
    summed = table.groupby('lemmas').aggregate({'frequency': 'sum'})
    return summed[pd.notnull(summed.index)].to_dict()['frequency']


# applies new lemmatizer
# `frequencies` is rebound only once every entry has been processed, so an
# interrupted run leaves the loaded data untouched instead of a mix of
# DataFrames and dicts.
frequencies = {k: _lemmatized_counts(v) for k, v in frequencies.items()}

import os

# This cell overwrites the same file the frequencies were loaded from, so a
# plain write that gets interrupted would leave truncated JSON in place of the
# input. Write to a sibling temp file first and swap it in once complete.
output_path = 'data/wikipedia_20000_frequencies.json'
temp_path = output_path + '.tmp'
with open(temp_path, 'w') as f:
    json.dump(frequencies, f, indent=4)
os.replace(temp_path, output_path)

KeyboardInterrupt: 

In [2]:
# keep_default_na=False: by default pandas parses ordinary tokens such as
# "null", "nan" or "NA" as missing values; the str(x) conversions applied to
# the index afterwards would then fold all of them into the single word "nan".
en_wiki_frequencies = pd.read_csv(
    'data/enwiki-2023-04-13.txt',
    header=None,
    sep=" ",
    index_col=0,
    keep_default_na=False,
)

In [3]:
# Boolean mask over the index: True where WordNet returns at least one synset
# for the (stringified) token. Only those rows are kept.
valid_entries = en_wiki_frequencies.index.map(
    lambda token: len(wordnet.synsets(str(token))) > 0
)
en_wiki_frequencies = en_wiki_frequencies.loc[valid_entries]

In [4]:
# .assign() returns a new frame, so the lemma column is not written into the
# filtered slice produced by the previous cell (column assignment on such a
# slice triggers SettingWithCopyWarning on pandas without copy-on-write).
lemma_totals = (
    en_wiki_frequencies
    .assign(lemmas=en_wiki_frequencies.index.map(lemmatize))
    .groupby('lemmas')
    .aggregate({1: 'sum'})
)

# Drop any null lemma and convert to a plain dict of the form
# {1: {lemma: summed count}}. The name is rebound only here, once all the
# work has succeeded.
en_wiki_frequencies = lemma_totals[pd.notnull(lemma_totals.index)].to_dict()

In [5]:
# The dict built in the previous cell is keyed by column label; keep only the
# {lemma: summed count} mapping stored under column 1.
# NOTE: not re-runnable on its own -- after this line the name refers to the
# inner mapping, so a second run looks up key 1 among the lemmas.
en_wiki_frequencies = en_wiki_frequencies[1]

In [6]:
# Persist the lemma -> count mapping as compact JSON.
with open('data/lemma_safe_enwiki_frequencies.json', 'w') as f:
    f.write(json.dumps(en_wiki_frequencies))

In [7]:
# Container for the per-word scores: word -> {adjacent word: score}.
tf_idfs = dict()

In [8]:
# For every word, score each adjacent word as its local count divided by its
# overall count in en_wiki_frequencies. Adjacent words counted fewer than 5
# times, or missing from en_wiki_frequencies, are left out.
for word, adjacent_counts in frequencies.items():
    tf_idfs[word] = {
        adjacent_word: count * 1.0 / en_wiki_frequencies[adjacent_word]
        for adjacent_word, count in adjacent_counts.items()
        if not count < 5 and adjacent_word in en_wiki_frequencies
    }

In [9]:
import mmh3

def hash_digests(token, bits, num_hashes=3):
    """Return the Bloom-filter positions for ``token``.

    ``token`` is hashed with ``mmh3.hash`` once per seed in
    ``0 .. num_hashes - 1`` and each hash is reduced modulo ``bits``.

    Args:
        token: value handed to ``mmh3.hash``.
        bits: modulus applied to every hash, i.e. the number of slots the
            positions are spread over.
        num_hashes: how many seeded hashes to compute. Defaults to 3, the
            count that was previously hard-coded.

    Returns:
        A list of ``num_hashes`` integers, one per seed, in seed order.
    """
    return [mmh3.hash(token, seed) % bits for seed in range(num_hashes)]

In [10]:
# Order each word's neighbours by descending score, then replace every bare
# score with a record holding the score and the neighbour's hash positions.
for key, scores in tf_idfs.items():
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    tf_idfs[key] = {
        word: {
            'tf-idf': score,
            'bloom_filter': list(hash_digests(word, 32))
        }
        for word, score in ranked
    }

In [11]:
# removes newlines in lists for readability
import re
def repl_func(match: re.Match):
    """Collapse the matched text onto one line.

    The whole match is split on whitespace and the pieces are re-joined with
    single spaces, which also strips leading and trailing whitespace.
    """
    pieces = match.group(0).split()
    return " ".join(pieces)

# Pretty-print the scores, then flatten the contents of every innermost
# [...] list onto one line via repl_func before writing the file.
json_str = re.compile(r"(?<=\[)[^\[\]]+(?=])").sub(
    repl_func,
    json.dumps(tf_idfs, indent=4),
)

with open('data/wikipedia_20000_tf-idf.json', 'w') as f:
    f.write(json_str)