In [1]:
import contextlib
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, wordnet
import numpy as np
import spacy
import string
import json

# Shared spaCy English pipeline used by lemmatize() and tokenize().
# The 'ner' and 'parser' components are switched off at load time.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# Part-of-speech tag strings; lemmatize() passes each one to `token._.inflect`.
POS = (
    "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS",
    "LS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
)

def lemmatize(word):
    """Return the lemma of ``word`` together with its inflected forms.

    ``word`` is converted with ``str`` and run through the module-level ``nlp``
    pipeline; only the first token of the resulting document is used.

    Returns:
        A ``(lemma, inflections)`` pair: the first token's ``lemma_`` and the
        set of values obtained by calling ``token._.inflect`` with every tag
        in ``POS``.

    Raises:
        IndexError: if the pipeline yields no tokens (e.g. for an empty string).

    NOTE(review): ``token._.inflect`` is a custom spaCy extension attribute that
    is not registered anywhere in the visible code - presumably provided by a
    plugin imported elsewhere; confirm before relying on this function.
    """
    first_token = nlp(str(word))[0]
    forms = set()
    for tag in POS:
        forms.add(first_token._.inflect(tag))
    return first_token.lemma_, forms

def tokenize(sentence):
    """Turn a sentence into a list of lemmas that pass two filters.

    All characters in ``string.punctuation`` are removed from ``sentence``
    before it is run through the module-level ``nlp`` pipeline. A token's
    lemma is kept only when it is not in ``en_stopwords`` and
    ``wordnet.synsets`` returns a non-empty result for it.

    Returns:
        The kept lemmas, in document order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    doc = nlp(sentence.translate(strip_punctuation))

    kept = []
    for tok in doc:
        lemma = tok.lemma_
        if lemma in en_stopwords:
            continue
        if not wordnet.synsets(lemma):
            continue
        kept.append(lemma)
    return kept

# Fetch the NLTK resources this notebook reads: the stopword lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stopwords as a set, used for the membership test in tokenize().
en_stopwords = set(stopwords.words('english'))

# Read each numbered tale (1..1730) as raw bytes. Numbers with no matching
# file are skipped silently, so `data` may hold fewer than 1730 entries.
data = []
for tale_no in range(1, 1731):
    try:
        with open(f'data/fairy_tales/{tale_no}.txt', 'rb') as tale_file:
            data.append(tale_file.read())
    except FileNotFoundError:
        continue

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Yourui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Yourui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Disabled one-off preprocessing step (kept for reference).
# When uncommented it decodes every raw document in `data` (cp1252 first,
# utf8 on UnicodeDecodeError), lower-cases it, splits it into sentences with
# sent_tokenize, runs tokenize() on each sentence, joins the lemmas into one
# space-separated string per document, and dumps the list of strings to
# data/fairytales_tokenized_lemmatized.json.
# NOTE(review): sent_tokenize normally needs the NLTK 'punkt' resource, which
# this notebook never downloads - confirm before re-enabling.
# decoded_data = []
# for i, row in enumerate(data):
#     new_doc = ""
    
#     try:
#         sentences = sent_tokenize(row.decode('cp1252').lower())
#     except UnicodeDecodeError:
#         sentences = sent_tokenize(row.decode('utf8').lower())
#     for sentence in sentences:
#         tokenized = tokenize(sentence)
#         new_doc += " " + " ".join(tokenized)
    
#     decoded_data.append(new_doc)
#     print(f"{i}th row processed")

# with open('data/fairytales_tokenized_lemmatized.json', 'w') as f:
#     json.dump(decoded_data, f)

In [3]:
# Load the tokenized and lemmatized documents from the JSON file on disk.
with open('data/fairytales_tokenized_lemmatized.json') as cache_file:
    decoded_data = json.load(cache_file)

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Bag-of-words counts: one row per entry of decoded_data, one column per
# vocabulary term learned by the vectorizer.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(decoded_data)
feature_names = cv.get_feature_names_out()

# Learn the inverse-document-frequency weights from the count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(word_count_vector)

# One IDF weight per vocabulary term, sorted ascending by weight.
idfs = pd.DataFrame(
    tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"]
).sort_values(by=["idf_weights"])

# fit_transform() above already returned the count matrix for decoded_data,
# so reuse it instead of vectorizing the whole corpus a second time.
count_vector = word_count_vector

# Dense (documents x vocabulary) ndarray of TF-IDF scores. toarray() yields an
# ndarray directly, without the intermediate np.matrix that todense() creates.
tf_idf_vector = tfidf_transformer.transform(count_vector).toarray()

In [49]:
def _ranked_terms(doc_scores, terms):
    """Map each term with a strictly positive score to that score.

    Args:
        doc_scores: 1-D array of TF-IDF scores for one document.
        terms: sequence of term names, indexed like ``doc_scores``.

    Returns:
        A dict ordered from highest to lowest score. Scores are stored as
        plain Python floats.
    """
    # Visit only the columns holding a positive score. The width comes from
    # the row itself rather than a hard-coded vocabulary size, so this keeps
    # working when the corpus (and therefore the vocabulary) changes.
    positive_cols = np.flatnonzero(doc_scores > 0)
    # sorted() is stable, so terms with equal scores stay in column order.
    ranked = sorted(
        ((terms[col], float(doc_scores[col])) for col in positive_cols),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return dict(ranked)


# {"doc<i>": {term: score, ...}} for every row of the TF-IDF matrix.
json_tf_idfs = {
    f'doc{doc_idx}': _ranked_terms(doc_scores, feature_names)
    for doc_idx, doc_scores in enumerate(tf_idf_vector)
}

In [53]:
# Persist the per-document TF-IDF mapping as JSON indented by four spaces.
with open('data/fairytales_doc_tf-idf.json', 'w') as out_file:
    json.dump(json_tf_idfs, fp=out_file, indent=4)