Author: Julia Wervers
Student Number: 13168665

This is the main script for the final project of the data processing course. The objective of the project is to determine how representative the keyword lists extracted from summaries of books are of the keywords extracted from the books themselves.

In [79]:
#Imports
import spacy
from collections import Counter
import math

# Load the large English spaCy pipeline (with vectors) once at module level.
# get_book_wordlist() reuses this shared object to tokenise each text and to
# read every token's lemma, stop-word flag and punctuation flag.
nlp = spacy.load("en_core_web_lg")

In [80]:
###Functions

#Function to get wordlist of a textfile
def get_book_wordlist(textfile):
    """Return the cleaned list of lowercase lemmas found in a text file.

    Args:
        textfile: Name of a UTF-8 encoded file inside the ``data/`` folder.

    Returns:
        A list of lowercase lemma strings in document order, without stop
        words, punctuation tokens, or empty/whitespace-only entries.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the file even when reading/decoding fails.
    with open(f'data/{textfile}', "r", encoding="utf8") as f:
        text = f.read()

    doc = nlp(text)
    # Keep the lowercase lemma of every token that is neither a stop word
    # nor punctuation.
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    ]

    # Joining and re-splitting on whitespace drops lemmas that are empty or
    # whitespace-only and breaks any lemma containing whitespace into
    # separate words.
    wordlist = " ".join(lemmas).split()

    return wordlist

#Function to get the term frequency of a list of words
def get_term_frequency(wordlist):
    """Return the relative frequency of every distinct word in ``wordlist``.

    Args:
        wordlist: Iterable of hashable terms (normally strings).

    Returns:
        A dict mapping each distinct term to ``count / total number of
        terms``. The values sum to 1 for a non-empty input; an empty input
        yields an empty dict.
    """
    word_frequency = Counter(wordlist)
    # Normalise by the number of tokens, not by the number of distinct
    # words: dividing by the vocabulary size does not give a frequency.
    # Summing the counts also works when ``wordlist`` is a one-shot iterator.
    total_words = sum(word_frequency.values())
    return {term: count / total_words for term, count in word_frequency.items()}

def get_tfidf(document_list, top_n=100):
    """Return the highest-scoring tf-idf keywords for each document.

    Args:
        document_list: Iterable of file names; each one is passed to
            ``get_book_wordlist``.
        top_n: Maximum number of keywords kept per document (default 100).

    Returns:
        A list with one entry per document, in input order. Each entry is a
        list of at most ``top_n`` words sorted by descending tf-idf score;
        words with equal scores keep their order of first appearance in the
        document. Words that occur in every document score 0 because their
        inverse document frequency is ``log(1)``.
    """
    # Use the argument rather than a module-level variable so the function
    # works for any caller-supplied collection of documents.
    vocabularies = []
    term_frequency_list = []
    for document in document_list:
        wordlist = get_book_wordlist(document)
        # A set gives O(1) membership and one entry per distinct word.
        vocabularies.append(set(wordlist))
        term_frequency_list.append(get_term_frequency(wordlist))

    # Document frequency: in how many documents does each word occur?
    document_frequency = Counter(
        word for vocabulary in vocabularies for word in vocabulary
    )

    # Inverse document frequency for each word.
    amount_documents = len(vocabularies)
    inverse_document_frequency = {
        word: math.log(amount_documents / count)
        for word, count in document_frequency.items()
    }

    tfidf_list = []
    for term_frequency in term_frequency_list:
        tfidf = {
            word: frequency * inverse_document_frequency[word]
            for word, frequency in term_frequency.items()
        }
        # sorted() is stable, so ties keep the dict's insertion order.
        ranked_words = sorted(tfidf, key=tfidf.get, reverse=True)
        tfidf_list.append(ranked_words[:top_n])
    return tfidf_list

In [81]:
# Texts to analyse; get_book_wordlist() looks each name up in the data/ folder.
books = [
    "Frankenstein.txt",
    "Great_Gatsby.txt",
    "The_Picture_of_Dorian_Gray.txt",
]
# Print the top tf-idf keywords, one list per book in the order given above.
print(get_tfidf(books))


[['clerval', 'endeavour', 'mountain', 'justine', 'felix', 'geneva', 'elizabeth', 'bestow', 'murderer', 'frankenstein', 'wretch', 'william', 'safie', 'fear', 'surround', 'depart', 'revenge', 'cottager', 'misery', 'overcome', 'm.', 'destruction', 'being', 'beloved', 'cottage', 'creator', 'alas', 'task', 'voyage', 'progress', 'dæmon', 'horror', 'acquaint', 'departure', 'thy', 'countenance', 'destroy', 'behold', 'ingolstadt', 'quit', 'hovel', 'visit', 'perceive', 'wood', 'agitation', 'sledge', 'fatigue', 'torment', 'amiable', 'wretchedness', 'benevolent', 'tranquillity', 'protector', 'condemn', 'disposition', 'frightful', 'exertion', 'contemplate', 'summit', 'magistrate', 'abhor', 'kirwin', 'gentle', 'pursue', 'allow', 'fiend', 'misfortune', 'monster', 'undertaking', 'apply', 'mont', 'labour', 'peace', 'enemy', 'agony', 'mankind', 'sustain', 'sympathise', 'permit', 'accordingly', 'compassion', 'enjoyment', 'specie', 'trial', 'reflection', 'crime', 'native', 'enterprise', 'region', 'gentlen