In [2]:
import contextlib
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, wordnet
import numpy as np
import spacy
import string
import json

# Shared spaCy pipeline; the NER and parser components are switched off.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Part-of-speech tags that lemmatize() passes, one by one, to token._.inflect.
POS = (
    "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS",
    "LS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
)

def lemmatize(word):
    """Return the lemma of ``word`` and the set of its inflected forms.

    ``word`` is converted with ``str()`` and run through the module-level
    ``nlp`` pipeline; only the first resulting token is used.

    Returns:
        A ``(lemma, inflections)`` tuple, where ``inflections`` is the set of
        values returned by ``token._.inflect(tag)`` for every tag in ``POS``.

    NOTE(review): ``token._.inflect`` is a custom extension attribute and no
    import that registers it is visible in this notebook - confirm where it
    comes from. An input that yields no tokens (e.g. an empty string) makes
    the ``[0]`` lookup raise IndexError.
    """
    first_token = nlp(str(word))[0]
    forms = set()
    for tag in POS:
        forms.add(first_token._.inflect(tag))
    return first_token.lemma_, forms

def tokenize(sentence):
    """Turn a sentence into a list of filtered lemmas.

    Every character listed in ``string.punctuation`` is removed, the result is
    run through the module-level ``nlp`` pipeline, and a token's lemma is kept
    only when it is not in ``en_stopwords`` and ``wordnet.synsets`` returns a
    non-empty result for it.

    Returns:
        The kept lemmas, in token order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    doc = nlp(sentence.translate(strip_punctuation))

    kept = []
    for tok in doc:
        lemma = tok.lemma_
        if lemma in en_stopwords:
            continue
        # Drop lemmas WordNet has no synsets for.
        if wordnet.synsets(lemma):
            kept.append(lemma)
    return kept

# Fetch the NLTK resources used by tokenize() and build the stop-word set.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
en_stopwords = set(stopwords.words('english'))

# Read every numbered document as raw bytes; ids without a file are skipped.
data = []
for doc_id in range(1, 1731):
    try:
        with open(f'data/fairy_tales/{doc_id}.txt', 'rb') as raw_file:
            data.append(raw_file.read())
    except FileNotFoundError:
        continue

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [1]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# One-off preprocessing step, kept commented out. For every raw document in
# `data` it decodes the bytes (cp1252 first, utf8 on UnicodeDecodeError),
# lowercases the text, splits it into sentences, runs tokenize() on each
# sentence and joins the lemmas into one string per document; the resulting
# list is then dumped to JSON.
# NOTE(review): the dump below targets 'data/fairytales_tokenized_lemmatized.json'
# while the following cell reads 'fairytales_tokenized_lemmatized.json' (no
# 'data/' prefix) - confirm the file is moved, or align the paths, before re-running.
# decoded_data = []
# for i, row in enumerate(data):
#     new_doc = ""

#     try:
#         sentences = sent_tokenize(row.decode('cp1252').lower())
#     except UnicodeDecodeError:
#         sentences = sent_tokenize(row.decode('utf8').lower())
#     for sentence in sentences:
#         tokenized = tokenize(sentence)
#         new_doc += " " + " ".join(tokenized)

#     decoded_data.append(new_doc)
#     print(f"{i}th row processed")

# with open('data/fairytales_tokenized_lemmatized.json', 'w') as f:
#     json.dump(decoded_data, f)

In [5]:
# Load the cached corpus (presumably the list written by the commented-out
# preprocessing cell - confirm) instead of recomputing it.
with open('fairytales_tokenized_lemmatized.json') as json_file:
    decoded_data = json.loads(json_file.read())

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Learn the vocabulary and build the document-term count matrix in one pass.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(decoded_data)
feature_names = cv.get_feature_names_out()

# Fit the IDF weights on the count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(word_count_vector)

# IDF weight per vocabulary term, sorted ascending.
idfs = pd.DataFrame(
    tfidf_transformer.idf_,
    index=feature_names,
    columns=["idf_weights"],
).sort_values(by=['idf_weights'])

# fit_transform() already returned the counts for `decoded_data`; re-running
# cv.transform() on the same documents would only rebuild the same matrix, so
# the name `count_vector` is kept as an alias for any later cell that uses it.
count_vector = word_count_vector

# Dense (n_documents, n_terms) ndarray of TF-IDF scores. `.toarray()` yields an
# ndarray directly, avoiding the np.matrix detour of `.todense()`.
tf_idf_vector = tfidf_transformer.transform(count_vector).toarray()

## Compute TF-IDF scores and Bloom filters for each word

In [8]:
!pip install mmh3
import mmh3

def hash_digests(token, bits, num_hashes=3):
    """Return the bloom-filter bit positions for ``token``.

    The token is hashed ``num_hashes`` times with ``mmh3.hash``, using the
    seeds ``0 .. num_hashes - 1``, and each hash is reduced modulo ``bits``.

    Args:
        token: Value to hash (passed straight to ``mmh3.hash``).
        bits: Size of the bloom filter; must be non-zero (a zero value raises
            ZeroDivisionError). For a positive ``bits`` every returned
            position lies in ``[0, bits)``, because Python's ``%`` takes the
            sign of the divisor even when the hash is negative.
        num_hashes: Number of hash functions (seeds) to apply. Defaults to 3,
            which reproduces the previous hard-coded behaviour.

    Returns:
        A list of ``num_hashes`` integers, one position per seed.
    """
    return [mmh3.hash(token, seed) % bits for seed in range(num_hashes)]

Collecting mmh3
  Downloading mmh3-5.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading mmh3-5.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (93 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mmh3
Successfully installed mmh3-5.0.1


In [9]:
# For every document, map each term with a positive TF-IDF score to that score,
# ordered from highest to lowest score (ties keep vocabulary order).
# Only the non-zero columns of each row are visited, and the vocabulary size is
# taken from the data itself rather than a hard-coded constant, so the cell
# keeps working when the corpus (and therefore the vocabulary) changes.
json_tf_idfs = {}
for i, row in enumerate(tf_idf_vector):
    nonzero_columns = np.flatnonzero(row > 0)
    ranked_terms = sorted(
        ((feature_names[j], row[j]) for j in nonzero_columns),
        key=lambda item: item[1],
        reverse=True,
    )
    json_tf_idfs[f'doc{i}'] = dict(ranked_terms)

In [10]:
# Per document (doc0, doc1, ...): word -> its tf-idf score plus the bloom filter
# positions of the word, computed with a filter size of 32.
json_tf_idfs_bloom_filters = {}

for i in range(len(decoded_data)):
    doc_key = f'doc{i}'
    json_tf_idfs_bloom_filters[doc_key] = {
        word: {'tf_idf': tf_idf, 'bloom_filter': hash_digests(word, 32)}
        for word, tf_idf in json_tf_idfs[doc_key].items()
    }

print("Done computing tf-idfs and bloom filters in dictionary...")

Done computing tf-idfs and bloom filters in dictionary...


In [12]:
# Persist the per-document tf-idf / bloom filter mapping as indented JSON.
with open('fairytales_doc_tf-idf_bloom_filters.json', mode='w') as out_file:
    json.dump(json_tf_idfs_bloom_filters, out_file, indent=4)