In [55]:
import json
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict
from spellchecker import SpellChecker
from tqdm.notebook import tqdm

# Fetch the NLTK data this notebook relies on: 'punkt' for nltk.word_tokenize,
# 'stopwords' for the English stop-word list, and 'wordnet' for
# WordNetLemmatizer (without it lemmatize() raises LookupError on a fresh
# environment). The trailing ';' suppresses the cell's return-value output.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' --
# confirm against the installed version.
nltk.download(['punkt', 'stopwords', 'wordnet'], quiet=True);

In [47]:
# Load the pre-processed reviews from disk. Later cells iterate over `reviews`
# and read the 'text' and 'product_id' keys of each item.
filename = 'data/processed_reviews.json'
with open(filename, mode='rb') as source:
    reviews = json.load(fp=source)

In [None]:
# Build, for every product id, the set of distinct normalised words that
# appear in any of that product's reviews.
initial_bag_of_words = defaultdict(set)
stop_words = stopwords.words('english')
# Membership is tested once per token, so use a set for O(1) lookups instead
# of scanning the stop-word list each time. `stop_words` itself stays a list
# in case other cells rely on that.
stop_word_set = set(stop_words)
lemmatizer = WordNetLemmatizer()
for review in tqdm(reviews):
    text = review['text'].lower()
    # Collapse any character repeated three or more times down to two
    # (e.g. "soooo" -> "soo") before tokenising.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    # Keep purely alphanumeric tokens that are not stop words, lemmatised.
    words = {
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token.isalnum() and token not in stop_word_set
    }
    # Only touch the defaultdict when there is something to add, so products
    # whose reviews yield no words do not get an empty entry.
    if words:
        initial_bag_of_words[review['product_id']].update(words)

# Document frequency: the number of products whose bag contains each word.
counter = defaultdict(int)
for words in initial_bag_of_words.values():
    for word in words:
        counter[word] += 1

# Keep only words whose document frequency lies strictly between 1% and 90%
# of all products. Both bounds must hold at once: joining them with `or`
# is satisfied by every count (anything not above 1% is below 90%), which
# would keep the entire vocabulary.
n = len(initial_bag_of_words)  # loop-invariant, so computed once
keep_words = set()
for word, count in counter.items():
    if n*.01 < count < n*.9:
        keep_words.add(word)

# Restrict each product's bag to the retained vocabulary. Products left with
# no retained words get no entry at all.
reduced_bag_of_words = defaultdict(set)
for product, words in tqdm(initial_bag_of_words.items()):
    kept = {word for word in words if word in keep_words}
    if kept:
        reduced_bag_of_words[product].update(kept)

# Map every retained word the spell checker does not recognise to its most
# likely correction.
corrections = {}
spellcheck = SpellChecker()
for word in tqdm(spellcheck.unknown(keep_words)):
    # Recent pyspellchecker releases return None from correction() when no
    # candidate exists; fall back to the original word so None never ends up
    # in the bags or the corpus.
    corrections[word] = spellcheck.correction(word) or word

# Replace each word by its spelling correction where one was recorded,
# de-duplicate per product, and store the result as a list so the mapping can
# be serialised to JSON. Products with an empty bag are skipped.
bag_of_words = {
    product: list({corrections.get(word, word) for word in words})
    for product, words in tqdm(reduced_bag_of_words.items())
    if words
}

In [61]:
# Persist the per-product word lists as JSON.
filename = 'data/bag_of_words.json'
with open(filename, mode='w') as sink:
    json.dump(bag_of_words, fp=sink)

HBox(children=(FloatProgress(value=0.0, max=2188350.0), HTML(value='')))





In [None]:
# The corpus is the list of distinct words across all products' bags.
corpus = list({word for words in bag_of_words.values() for word in words})

In [107]:
# Persist the corpus vocabulary as a JSON array.
filename = 'data/corpus.json'
with open(filename, mode='w') as sink:
    json.dump(corpus, fp=sink)