In [1]:
## Josh Blaz -- LOTR
## CS401 -- NLP

import nltk
import re
import urllib.request
import lxml.html as lh
import io
import requests
import os
import glob

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import Counter 
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import wordnet as wn
import mglearn 

#NOTE: Sentiment140 Polarity values: 0: negative, 2: neutral, 4: positive

#### NOTE: Elvish text is translated awkwardly into the .txt format    
</br>
##### IE:   
</br>
►M MPR -F+MTRX MP ft PPtK P&RMPht: P. t. The last Two runes are the initials of Thror and Thrain.**  
</br>
#### Same with some of the intros:
</br>
“THE LORD OF THE RINGS” 

Pjrt Thttt 

THE RETURN 
OF THE KING 


In [2]:
## Get all tokens

## --- The Silmarillion ---
silm_file = open('silmarillion.txt', 'r')
silm = silm_file.read() 
silm_raw = silm[43190:-5436] ## save raw files for later
silm = silm.lower() ## Make all words lowercase 
silm = silm[43190:-5436] ## remove HTML jargon
silm = nltk.word_tokenize(silm) ## tokenize

## --- The Hobbit ---
hobbit_file = open('hobbit.txt', 'r')
hobbit = hobbit_file.read()
hobbit_raw = hobbit[43212:-8543]
hobbit = hobbit.lower()
hobbit = hobbit[43212:-8543]
hobbit = nltk.word_tokenize(hobbit)

## --- The Fellowship of the Ring ---
fellowship_file = open('fellowship.txt', 'r')
fellowship = fellowship_file.read()
fellowship_raw = fellowship[43242:-5436]
fellowship = fellowship.lower()
fellowship = fellowship[43242:-5436]
fellowship = nltk.word_tokenize(fellowship)

## --- The Two Towers ---
twotowers_file = open('twotowers.txt', 'r')
twotowers = twotowers_file.read()
twotowers_raw = twotowers[43302:-19245]
twotowers = twotowers.lower()
twotowers = twotowers[43302:-19245]
twotowers = nltk.word_tokenize(twotowers)

## --- The Return of the King ---
ret_file = open('return.txt', 'r')
ret = ret_file.read()
ret_raw = ret[43252:-5434]
ret = ret.lower()
ret = ret[43252:-5434]
ret = nltk.word_tokenize(ret)

tokenlist = [silm, hobbit, fellowship, twotowers, ret]

raw_texts = [silm_raw, hobbit_raw, fellowship_raw, twotowers_raw, ret_raw] #Text files to use with LDA topic modeling


entirety = [] ## This is all tokens combined
for i in range(len(tokenlist)):
    for word in tokenlist[i]:
        entirety.append(word)
        

In [3]:
# Used Chapterize to split books into chapters 
## https://github.com/JonathanReeve/chapterize
### Chapterize didn't work 100% perfectly, so I had to go through and the prologues back in when it cut them out

# These are lists containing strings of every chapter for each book
silm_chapters = []
hobbit_chapters = []
fellowship_chapters = []
twotowers_chapters = []
return_chapters = []

# Paths to directories storing book chapters
list_of_paths = ['/Users/blaz/Desktop/LOTR/silmarillion-chapters', '/Users/blaz/Desktop/LOTR/hobbit-chapters',\
                '/Users/blaz/Desktop/LOTR/fellowship-chapters', '/Users/blaz/Desktop/LOTR/twotowers-chapters',\
                '/Users/blaz/Desktop/LOTR/return-chapters']

for path in list_of_paths: # iterate through the list of folder paths for each book
    for file in sorted(glob.glob(os.path.join(path,'*.txt'))): # This gives us a sorted list of the files in each directory                                                         
        f = open(file, 'r') # open and read file               # allowing us to read in the chapters in order.
        txt = f.read()
        ## determine which path we're using and append it to the correct book chapter list
        if path == '/Users/blaz/Desktop/LOTR/silmarillion-chapters': 
            # Because of an issue with 'glob', I had to create a copy of the final chapter in The Silmarillion
            silm_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/hobbit-chapters':
            hobbit_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/fellowship-chapters': 
            fellowship_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/twotowers-chapters': 
            twotowers_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/return-chapters': 
            return_chapters.append(txt)
        

In [4]:
# Store chapter names for use in dataframes later

silm_chapter_names = ["Ainundalë", "Valaquenta", "Of the Beginning of Days", "Of Aulë and Yavanna" , "Of the Coming of the Elves and the Captivity of Melkor",\
                     "Of Thingol and Melian", "Of Eldamar and the Princes of the Eldalië", "Of Fëanor and the Unchaining of Melkor", "Of the Silmarils and the Unrest of the Noldor",\
                     "Of the Darkening of Valinor", "Of the Flight of the Noldor", "Of the Sindar", "Of the Sun and Moon and the Hiding of Valinor", "Of Men", "Of the Return of the Noldor",\
                     "Of Beleriand and its Realms", "Of the Noldor in Beleriand", "Of Maeglin", "Of the Coming of Men into the West", "Of the Ruin of Beleriand and the Fall of Fingolfin", "Of Beren and Lúthien",\
                     "Of the Fifth Battle: Nirnaeth Arnoediad", "Of Túrin Turambar", "Of the Ruin of Doriath", "Of Tuor and the Fall of Gondolin", "Of the Voyage of Eärendil and the War of Wrath", \
                     "Akallabêth: The Downfall of Númenor", "Of the Rings of Power and the Third Age"]

hobbit_chapter_names = ["An Unexpected Party", "Roast Mutton", "A Short Rest", "Over Hill and Under Hill", "Riddles In The Dark", \
                       "Out Of The Frying-Pan Into The Fire", "Queer Lodgings", "Flies And Spiders", "Barrels Out Of Bond", "A Warm Welcome", \
                       "On The Doorstep", "Inside Information", "Not At Home", "Fire And Water", "The Gathering Of The Clouds", "A Thief In The Night", \
                       "The Clouds Burst", "The Return Journey", "The Last Stage"]

fellowship_chapter_names = ["Concerning Hobbits", "Concerning Pipeweed", "Of the Ordering of the Shire", "Note on the Shire Records", "A Long-expected Party", "The Shadow of the Past", \
                           "Three is Company", "A Short Cut to Mushrooms", "A Conspiracy Unmasked", "The Old Forest", "In the House of Tom Bombadil", "Fog on the Barrow-downs", "At the Sign of the Prancing Pony",\
                           "Strider", "A Knife in the Dark", "Flight to the Ford", "Many Meetings", "The Council of Elrond", "The Ring goes South", "A Journey in the Dark", "The Bridge of Khazad-dûm", \
                           "Lothlórien", "The Mirror of Galadriel", "Farewell to Lórien", "The Great River", "The Breaking of the Fellowship"]

twotowers_chapter_names = ["The Departure of Boromir", "The Riders of Rohan", "The Uruk-hai", "Treebeard", "The White Rider", "The King of the Golden Hall", "Helm's Deep", "The Road to Isengard", "Flotsam and Jetsam", \
                          "The Voice of Saruman", "The Palantír", "The Taming of Smeagol", "The Passage of the Marshes", "The Black Gate is Closed", "Of Herbs and Stewed Rabbit", "The Window of the West", "The Forbidden Pool", \
                          "Journey to the Cross-roads", "The Stairs to Cirith Ungol", "Shelob's Lair", "The Choices of Master Samwise"]

return_chapter_names = ["Minas Tirith", "The Passing of the Grey Company", "The Muster of Rohan", "The Siege of Gondor", "The Ride of the Rohirrim", "The Battle of the Pelennor Fields", "The Pyre of Denethor",\
                       "The Houses of Healing", "The Last Debate", "The Black Gate Opens", "The Tower of Cirith Ungol", "The Land of Shadow", "Mount Doom", "The Field of Cormallen", "The Steward and the King", \
                       "Many Partings", "Homeward Bound", "Scouring of the Shire", "The Grey Havens"]

chapter_name_list = [silm_chapter_names, hobbit_chapter_names, fellowship_chapter_names, twotowers_chapter_names, return_chapter_names]

# Sentiment Analysis


### ** 1. Segment all chapters into page-sized objects    **    
</br>
### ** 2. Send all segments to Sentiment140 API by chapter    **   
</br>
### ** 3. Calculate polarity averages and polarity lists. **   
</br>
### ** 4. Store API polarity ratings and export to csv**
</br>
### ** 5. Plot all polarities + Averages**       

In [5]:
"""
Function that segments given chapter into n-sized segments to be sent to the API.
Typically using n=2940, as this is the #chars in my copy of Fellowship of the Ring.

Parameters - chapter - chapter of a book to be broken into segments
           - n - length that we segment the text with
       
Returns a list of (string) segments of the chapter.
"""
def Segmenter(chapter, n):
    segments = []
    # start and end indices for segmenting the text
    start = 0
    end = n
    while end < len(chapter) + n:
        segments.append(chapter[start:end])
        start = end
        end = end + n
    return segments #segments of input chapter

In [6]:
# Goal:
# Create lists of lists for segments of each chapter of each book, append to them using "Segmenter" function, 
# storing them like this will allow for iterative querying of the API server


# Lists of Lists of Lists storing all segments of all chapters for each book
# [[chapter1 segment 0-2500, chap1, segmenet 2500-5000]... [chapter2 segment0-2500, ...]...]
silm_segments = []
hobbit_segments = []
fellowship_segments = []
twotowers_segments = []
return_segments = []

# List containing the lists storing each books' chapters
list_of_books = [silm_chapters, hobbit_chapters, fellowship_chapters, twotowers_chapters, return_chapters]
# List allowing us to access the segment lists
list_of_segments = [silm_segments, hobbit_segments, fellowship_segments, twotowers_segments, return_segments]

for i in range(len(list_of_books)):
    for chapter in list_of_books[i]: # Segment entire chapter using Segmenter function, with 2940 character cuts
        list_of_segments[i].append(Segmenter(chapter,2940))


In [7]:
"""
Function that sends segments of 1 chapter through the Sentiment140 API.
In order to do so, it creates and appends segments to a JSON file, then posts the JSON queries to the API server
using requests module (using an HTTP Post)

Parameters - chapter_segments - segments of an entire chapter of a book

Returns a list of polarities for segments of the chapter, as well as the polarity average for the chapter

Note: Maximum of 700,000 characters per API request, though this shouldn't be a problem
"""

def Polarity(chapter_segments): # segments of a single chapter
    request = {'data':[]}
    polarityList = []
    counter = 0
    for segment in chapter_segments: # Fill JSON
        request['data'].append({'text':segment})
    r = requests.post('http://www.sentiment140.com/api/bulkClassifyJson?appid=blaz_j1@denison.edu', json=request)
    jso = r.json()
    for i in range(len(request['data'])-1):
        polarityList.append(jso['data'][i]['polarity'])
    
    polarityTotal = 0
    for value in polarityList:
        polarityTotal = polarityTotal + value
    
    polarityAVG = polarityTotal/len(polarityList)
    return polarityList, polarityAVG

In [8]:
# store all chapter polarity averages along with all polarity ratings for each chapter
silm_polarity_avg = []
hobbit_polarity_avg = []
fellowship_polarity_avg = []
twotowers_polarity_avg = []
return_polarity_avg = []

silm_polarity_lists = []
hobbit_polarity_lists = []
fellowship_polarity_lists = []
twotowers_polarity_lists = []
return_polarity_lists = []
### Need to get chapter names in

for x in range(len(list_of_books)):
    book = list_of_books[x]
    segs = list_of_segments[x]
    
    for i in range(len(book)):
        if x == 0:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            silm_polarity_lists.append(temp1)
            silm_polarity_avg.append(temp2)
        if x == 1:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            hobbit_polarity_lists.append(temp1)
            hobbit_polarity_avg.append(temp2)
        if x == 2:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            fellowship_polarity_lists.append(temp1)
            fellowship_polarity_avg.append(temp2)
        if x == 3:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            twotowers_polarity_lists.append(temp1)
            twotowers_polarity_avg.append(temp2)
        if x == 4:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            return_polarity_lists.append(temp1)
            return_polarity_avg.append(temp2)

all_polarity_avgs = [silm_polarity_avg, hobbit_polarity_avg, fellowship_polarity_avg, twotowers_polarity_avg, return_polarity_avg]

all_polarity_lists = [silm_polarity_lists, hobbit_polarity_lists, fellowship_polarity_lists, twotowers_polarity_lists, return_polarity_lists]  
# chapter 3 of return of the king is super dark

In [9]:
# Commented this all out so I don't reset my excel work every time
"""
# Converting Polarity AVG data into pandas dataframes
## These CSVs store all average chapter polarities for each book
silm_df = pd.DataFrame(silm_polarity_avg, index = silm_chapter_names, columns = ["Polarity"])
silm_df = silm_df.rename_axis("--- The Silmarillion ---")
#silm_df.to_csv("silm_df.csv")

hobbit_df = pd.DataFrame(hobbit_polarity_avg, index = hobbit_chapter_names, columns = ["Polarity"])
hobbit_df = hobbit_df.rename_axis("--- The Hobbit ---")
#hobbit_df.to_csv("hobbit_df.csv")

fellowship_df = pd.DataFrame(fellowship_polarity_avg, index = fellowship_chapter_names, columns = ["Polarity"])
fellowship_df = fellowship_df.rename_axis("--- The Fellowship of the Ring ---")
# Prologue chapters have weird polarities - have solid values because they're shorter
#fellowship_df.to_csv("fellowship_df.csv")

twotowers_df = pd.DataFrame(twotowers_polarity_avg, index = twotowers_chapter_names, columns = ["Polarity"])
twotowers_df = twotowers_df.rename_axis("--- The Two Towers ---")
#twotowers_df.to_csv("twotowers_df.csv")

return_df = pd.DataFrame(return_polarity_avg, index = return_chapter_names, columns = ["Polarity"])
return_df = return_df.rename_axis("--- The Return of the King ---")
#return_df.to_csv("return_df.csv")

# Dataframe of all Books overlaid
books_df = [silm_df, hobbit_df, fellowship_df, twotowers_df, return_df]
full_df = pd.concat(books_df)

# Export to CSV
full_df.to_csv("full_df.csv")


# Exported to Excel as well, for simpler plots
excel = pd.ExcelWriter('LOTR1.xlsx')
silm_df.to_excel(excel, 'The Silmarillion')
hobbit_df.to_excel(excel, 'The Hobbit')
fellowship_df.to_excel(excel, 'Fellowship of the Rings')
twotowers_df.to_excel(excel, 'The Two Towers')
return_df.to_excel(excel, 'The Return of the King')
full_df.to_excel(excel, 'Combined')
excel.save()
"""

'\n# Converting Polarity AVG data into pandas dataframes\n## These CSVs store all average chapter polarities for each book\nsilm_df = pd.DataFrame(silm_polarity_avg, index = silm_chapter_names, columns = ["Polarity"])\nsilm_df = silm_df.rename_axis("--- The Silmarillion ---")\n#silm_df.to_csv("silm_df.csv")\n\nhobbit_df = pd.DataFrame(hobbit_polarity_avg, index = hobbit_chapter_names, columns = ["Polarity"])\nhobbit_df = hobbit_df.rename_axis("--- The Hobbit ---")\n#hobbit_df.to_csv("hobbit_df.csv")\n\nfellowship_df = pd.DataFrame(fellowship_polarity_avg, index = fellowship_chapter_names, columns = ["Polarity"])\nfellowship_df = fellowship_df.rename_axis("--- The Fellowship of the Ring ---")\n# Prologue chapters have weird polarities - have solid values because they\'re shorter\n#fellowship_df.to_csv("fellowship_df.csv")\n\ntwotowers_df = pd.DataFrame(twotowers_polarity_avg, index = twotowers_chapter_names, columns = ["Polarity"])\ntwotowers_df = twotowers_df.rename_axis("--- The Two T

In [10]:
# Again, commented out data exports
"""
# Converting Polarity List data into pandas dataframes
## These CSVs store all polarity ratings for each book, rather than average chapter polarity ratings
### These dataframes are absolutely unusable, the data is just too hard to viz
silm_all_pol = []
for i in range(len(silm_polarity_lists)):
    for polarity in silm_polarity_lists[i]:
        silm_all_pol.append(polarity)
silm_all_pol = pd.DataFrame(silm_all_pol, columns = ["Polarity"])

hobbit_all_pol = []
for i in range(len(hobbit_polarity_lists)):
    for polarity in hobbit_polarity_lists[i]:
        hobbit_all_pol.append(polarity)
hobbit_all_pol = pd.DataFrame(hobbit_all_pol, columns = ["Polarity"])

fellowship_all_pol = []
for i in range(len(fellowship_polarity_lists)):
    for polarity in fellowship_polarity_lists[i]:
        fellowship_all_pol.append(polarity)
fellowship_all_pol = pd.DataFrame(fellowship_all_pol, columns = ["Polarity"])

twotowers_all_pol = []
for i in range(len(twotowers_polarity_lists)):
    for polarity in twotowers_polarity_lists[i]:
        twotowers_all_pol.append(polarity)
twotowers_all_pol = pd.DataFrame(twotowers_all_pol, columns = ["Polarity"])

return_all_pol = []
for i in range(len(return_polarity_lists)):
    for polarity in return_polarity_lists[i]:
        return_all_pol.append(polarity)
return_all_pol = pd.DataFrame(return_all_pol, columns = ["Polarity"])

all_pol_list = [silm_all_pol, hobbit_all_pol, fellowship_all_pol, twotowers_all_pol, return_all_pol]
all_pol = pd.concat(all_pol_list)

excel2 = pd.ExcelWriter('LOTR2.xlsx')
silm_all_pol.to_excel(excel2, 'The Silmarillion')
hobbit_all_pol.to_excel(excel2, 'The Hobbit')
fellowship_all_pol.to_excel(excel2, 'Fellowship of the Rings')
twotowers_all_pol.to_excel(excel2, 'The Two Towers')
return_all_pol.to_excel(excel2, 'The Return of the King')
all_pol.to_excel(excel2, 'Combined')

excel2.save()
"""

'\n# Converting Polarity List data into pandas dataframes\n## These CSVs store all polarity ratings for each book, rather than average chapter polarity ratings\n### These dataframes are absolutely unusable, the data is just too hard to viz\nsilm_all_pol = []\nfor i in range(len(silm_polarity_lists)):\n    for polarity in silm_polarity_lists[i]:\n        silm_all_pol.append(polarity)\nsilm_all_pol = pd.DataFrame(silm_all_pol, columns = ["Polarity"])\n\nhobbit_all_pol = []\nfor i in range(len(hobbit_polarity_lists)):\n    for polarity in hobbit_polarity_lists[i]:\n        hobbit_all_pol.append(polarity)\nhobbit_all_pol = pd.DataFrame(hobbit_all_pol, columns = ["Polarity"])\n\nfellowship_all_pol = []\nfor i in range(len(fellowship_polarity_lists)):\n    for polarity in fellowship_polarity_lists[i]:\n        fellowship_all_pol.append(polarity)\nfellowship_all_pol = pd.DataFrame(fellowship_all_pol, columns = ["Polarity"])\n\ntwotowers_all_pol = []\nfor i in range(len(twotowers_polarity_list

In [11]:
"""
Function that tokenizes and cleans the words of every chapter in a book.

Parameters - book - a book.

Returns a List of Lists storing a tokenized list for every chapter in a book.
"""

def Tokenize(book):
    punctuation = ".,;!?:`'()’■''" ## including other symbols
    token_list = []
    for chapter in book:
        temp = []
        words = nltk.word_tokenize(chapter)
        for word in words:
            word = word.lower()
            if word not in punctuation and not word.isnumeric(): 
                temp.append(word)
        token_list.append(temp)
        
    return token_list


#tokens = (Tokenize(silm_chapters))
#print(tokens[0]) ## Tokens of first chapter of The Silmarillion

In [12]:
"""
Function that returns the n most common words for every chapter in a book.
This is accomplished by using 'Counter' in the 'Collections' module.

Parameters - book - a tokenized list of lists of all chapters of a book
          - n - number of most common words in the chapter
          
Returns a List of Lists of the n most common words of every chapter in the book.
"""

def MostCommon(book, n): 
    stop_words = set(stopwords.words('english')) 
    
    # list of common character names
    names = ["gandalf", "merry", "pippin", "frodo", "sam", "aragorn", "faramir", "denethor", "gimli",\
             "legolas", "strider", "boromir", "jowyn", "jomer", "beregond", "gollum", "bilbo", "thorin"] 
    
    
    tolkien_stop = ["men","great", "'s", "said", "went", "he", "would", "many", "one", "he", "came", "yet", "even", "shall", \
                   "upon", "days", "looked", "n't", "back", "could", "'ll", "'ve", "come", "still", "'i", "yield" ]
    
    
    # Have to get rid of a lot of words, I call these "tolkien stop words", the silmarillion is full of these,
    # in LOTR, 'great' and 'men' appear very often
    
    common_words = []
    for chapter in book:
        temp = []
        for word in chapter:
            if word.isalpha() and word not in stop_words and word not in names and word not in tolkien_stop:
                temp.append(word)
                  
        common_words.append(Counter(temp).most_common(n))
    return common_words

In [13]:
## Find top 5 NON-STOP words per chapter

silm_chapters_tokenized = []
hobbit_chapters_tokenized = []
fellowship_chapters_tokenized = []
twotowers_chapters_tokenized = []
return_chapters_tokenized = []

silm_chapters_common = []
hobbit_chapters_common = []
fellowship_chapters_common = []
twotowers_chapters_common = []
return_chapters_common = []

for i in range(len(list_of_books)):
    if i == 0:
        silm_chapters_tokenized = Tokenize(list_of_books[i])
        silm_chapters_common = MostCommon(silm_chapters_tokenized, 5)
    if i == 1:
        hobbit_chapters_tokenized = Tokenize(list_of_books[i])
        hobbit_chapters_common = MostCommon(hobbit_chapters_tokenized, 5)
    if i == 2:
        fellowship_chapters_tokenized = Tokenize(list_of_books[i])
        fellowship_chapters_common = MostCommon(fellowship_chapters_tokenized, 5)
    if i == 3:
        twotowers_chapters_tokenized = Tokenize(list_of_books[i])
        twotowers_chapters_common = MostCommon(twotowers_chapters_tokenized, 5)
    if i == 4:
        return_chapters_tokenized = Tokenize(list_of_books[i])
        return_chapters_common = MostCommon(return_chapters_tokenized, 5)

In [14]:
## Common words in Full-Negative chapters

print(fellowship_chapter_names[14])
print(fellowship_chapters_common[14])

print(twotowers_chapter_names[0])
print(twotowers_chapters_common[0])

print(return_chapter_names[4])
print(return_chapters_common[4])

print(return_chapter_names[5])
print(return_chapters_common[5])

print(return_chapter_names[6])
print(return_chapters_common[6])

print(return_chapter_names[9])
print(return_chapters_common[9])

print(return_chapter_names[12])
print(return_chapters_common[12])


A Knife in the Dark
[('last', 23), ('us', 23), ('away', 22), ('left', 22), ('road', 22)]
The Departure of Boromir
[('ores', 24), ('away', 12), ('long', 11), ('boat', 11), ('wind', 11)]
The Ride of the Rohirrim
[('wild', 25), ('king', 21), ('road', 17), ('like', 15), ('away', 14)]
The Battle of the Pelennor Fields
[('king', 27), ('like', 17), ('fell', 17), ('black', 16), ('city', 14)]
The Pyre of Denethor
[('lord', 17), ('stood', 14), ('city', 14), ('door', 12), ('away', 11)]
The Black Gate Opens
[('mordor', 17), ('black', 17), ('sauron', 16), ('last', 14), ('away', 13)]
Mount Doom
[('mountain', 27), ('master', 24), ('away', 24), ('dark', 24), ('last', 24)]


In [15]:
# Hobbit
# 8 Chapters with polarity below 0.5

print(hobbit_chapter_names[4])
print(hobbit_chapters_common[4])

print(hobbit_chapter_names[10])
print(hobbit_chapters_common[10])

print(hobbit_chapter_names[12])
print(hobbit_chapters_common[12])

print(hobbit_chapter_names[13])
print(hobbit_chapters_common[13])


Riddles In The Dark
[('yes', 25), ('way', 24), ('got', 23), ('goblins', 23), ('dark', 22)]
On The Doorstep
[('mountain', 14), ('day', 12), ('dwarves', 11), ('valley', 10), ('river', 9)]
Not At Home
[('light', 23), ('dwarves', 17), ('smaug', 12), ('long', 12), ('door', 11)]
Fire And Water
[('bard', 21), ('master', 17), ('town', 16), ('dragon', 16), ('lake', 14)]


In [16]:
## Create list of Full-Negative Chapters

"""
A Knife in the Dark
The Departure of Boromir
The Ride of the Rohirrim
The Battle of the Pelennor Fields
The Pyre of Denethor
The Black Gate Opens
Mount Doom
"""

negative_chapters = []

negative_chapters.append(fellowship_chapters[14])
negative_chapters.append(twotowers_chapters[0])
negative_chapters.append(return_chapters[4])
negative_chapters.append(return_chapters[5])
negative_chapters.append(return_chapters[6])
negative_chapters.append(return_chapters[9])
negative_chapters.append(return_chapters[12])

# Topic Modeling

LDA - Latent Dirichlet Allocation 
- tokenize
- remove stop words
- lemmatize tokens
- vectorize
- model


In [53]:
"""
Function that lemmatizes a word.

Parameters - word - word to be lemmatized

Returns the lemmatized version of the word.
"""
def lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
print(lemma("hills"))

hill


In [54]:
"""
This function prepares tokens to be sent through the LDA process. In doing so, it needs to make sure the tokens are
words, they aren't in the 'Tolkien Stop Words' and that they are lemmas.
The Latent Dirichlet Allocation function will remove stop words and ignore punctuation.

Parameters - tokens - tokens of a chapter to lematize.

Returns a list of tokens that are ready to be joined back together for Latent Dirichlet Allocation.
"""
def LDA_prepare(tokens):
    ret = []
    lemmatized = []
    tolkien_stop = ["men","great", "'s", "said", "went", "he", "would", "many", "one", "he", "came", "yet", "even", "shall", \
                   "upon", "days", "looked", "n't", "back", "could", "'ll", "'ve", "come", "still", "gate", "'i" ]
    
    names = ["gandalf", "merry", "pippin", "frodo", "sam", "aragorn", "faramir", "denethor", "gimli",\
        "legolas", "strider", "boromir", "jowyn", "jomer", "beregond", "gollum", "bilbo", "thorin"] 
    
    for word in tokens:
        word = word.lower()
        if word.isalpha() and word not in tolkien_stop and word not in names:
            ret.append(word)
            
    for word in ret:
        lemmatized.append(lemma(word))
        
    return lemmatized

In [38]:
"""
Function that prepares a chapter for LDA and then performs LDA.

Parameters - chapter - chapter to perform LDA on
           - topics - number of topics in the LDA model
           
Prints the topics found in the model.
"""
def LDA_function(chapter, topics):
    
    # Initialize Vectorizer and LDA model 
    vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
    lda = LatentDirichletAllocation(n_components=topics, learning_method="batch",
                                    max_iter=200, random_state=1)
    
    # Tokenize
    tokens = nltk.word_tokenize(chapter)
    # Prepare for LDA
    tokens = LDA_prepare(tokens)
    
    # Join back together
    join = [''.join(filter(str.isalpha, word)) for word in tokens]
    #Vectorize
    vec = vect.fit_transform(join)
    document_topics = lda.fit_transform(vec)
    
    print("lda.components_.shape: {}".format(lda.components_.shape))
    lda.components_.shape: (10, 10000)

    sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

    feature_names = np.array(vect.get_feature_names())

    mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)
    
LDA_function(negative_chapters[5], 100)

lda.components_.shape: (100, 1039)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
fell          young         young         young         young         
young         fires         fires         fires         fires         
foolish       friend        friend        friend        friend        
freshened     freshened     freshened     freshened     freshened     
fourth        fourth        fourth        fourth        fourth        
foul          foul          foul          foul          foul          
forward       forward       forward       forward       forward       
fortune       fortune       fortune       fortune       fortune       
forth         forth         forth         forth         forth         
forlorn       forlorn       forlorn       forlorn       forlorn       


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------    

In [55]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=500, random_state=1)

Knife = negative_chapters[0]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 1313)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
black         round         house         make          saw           
eyes          weathertop    run           foot          old           
stand         sign          thought       time          hobbit        
ask           slowly        hand          hope          day           
enemy         felt          fell          dark          shadow        
fear          soon          ha            ring          answer        
near          turn          sky           line          star          
tall          ground        look          open          right         
path          looking       suddenly      stone         elves         
hollow        hair          north         slope         use           


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------     

In [56]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[0]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 1313)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
black         round         house         make          saw           
eyes          weathertop    run           foot          old           
stand         sign          fell          time          day           
ask           slowly        hand          hope          hobbit        
enemy         soon          thought       dark          shadow        
fear          felt          sky           ring          answer        
near          turn          ha            line          star          
tall          looking       suddenly      open          right         
path          ground        north         stone         elves         
cut           hair          look          grey          use           


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------     

In [44]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[1]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 676)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
did           left          ores          wind          news          
follow        minas         long          use           white         
towers        rauros        boat          time          hobbits       
parth         strange       taken         water         hand          
golden        let           river         high          grey          
red           beneath       way           end           ask           
choice        set           turned        far           longer        
save          heard         fallen        read          like          
dead          clear         slain         anduin        sea           
answered      bent          roaring       earth         ground        


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      

In [45]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[2]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 859)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
road          wild          ride          king          man           
thjoden       ghvn          horses        old           elfhelm       
dark          gondor        time          long          darkness      
suddenly      dernhelm      day           hills         morning       
spoke         taken         set           company       stone         
felt          hand          light         tall          walls         
night         dread         hope          seen          black         
wind          doubt         voice         wall          hear          
late          answered      began         nearer        hour          
years         ago           held          left          leading       


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      

In [46]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[3]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 1005)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
king          eyes          stood         like          battle        
fell          lord          field         away          foes          
left          gondor        turned        lay           face          
beast         thjoden       far           rode          clear         
snowmane      new           sea           knights       borne         
hope          dark          shadow        long          walls         
knew          ships         heard         river         standard      
hosts         world         hour          red           wrath         
corsairs      cold          thy           wept          filled        
foe           saw           drove         day           lifted        


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------     

In [47]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[4]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 699)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
set           light         away          son           servants      
heart         guard         house         healing       eyes          
turned        cried         enemy         thy           despair       
power         black         way           death         grey          
passed        brought       battle        hands         far           
table         dark          place         madness       tower         
bier          swiftly       fear          spoke         lies          
saw           shadowfax     hastened      mind          pass          
closed        drew          know          face          taken         
fields        dead          left          dnnen         broke         


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      

In [48]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[5]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 1039)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
black         mordor        hills         away          enemy         
day           far           long          west          ores          
army          cried         tower         heard         destroyed     
fear          pass          messenger     did           maybe         
forth         north         set           soon          assault       
left          wish          laughed       red           way           
tall          banner        minas         east          scouts        
south         grey          land          began         blew          
mind          saw           march         anduin        watched       
nazgyl        silence       remained      enemies       time          


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------     

In [49]:
vect = CountVectorizer(max_features=10000, max_df=.15, stop_words='english')
    
lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                    max_iter=50, random_state=1)

Knife = negative_chapters[6]
KT = nltk.word_tokenize(Knife)
KT = LDA_prepare(KT)
join = [''.join(filter(str.isalpha, word)) for word in KT]
vec = vect.fit_transform(join)
document_topics = lda.fit_transform(vec)

print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)

lda.components_.shape: (10, 1363)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
away          mountain      dark          eyes          thing         
day           saw           like          strength      dreadful      
hand          turned        fell          heart         feet          
long          road          fear          end           burden        
stood         night         thought       path          plain         
doom          voice         took          little        lying         
gave          knew          miles         lay           worn          
high          wild          clouds        rose          mordor        
moment        set           north         west          coming        
grew          power         effort        gone          tearing       


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------     

# Word Search

In [80]:
## Calculate all frequencies

silm_freq = {}
hobbit_freq = {}
fellowship_freq = {}
twotowers_freq = {}
return_freq = {}

freq_dicts = [silm_freq, hobbit_freq, fellowship_freq, twotowers_freq, return_freq]

for i in range(len(tokenlist)):
    for word in tokenlist[i]:
        word = word.lower()
        if word not in punctuation and not word.isnumeric() and word.isalpha():
            if word in freq_dicts[i]:
                freq_dicts[i][word] += 1
            else:
                freq_dicts[i][word] = 1

entirety_dict = {}

for word in entirety:
    word = word.lower()
    if word not in punctuation and not word.isnumeric() and word.isalpha():
        if word in entirety_dict:
            entirety_dict[word] += 1
        else:
            entirety_dict[word] = 1
            

In [83]:
"""
Function that returns the frequency of a given word in each book:

Parameters - word - input word

Returns the frequency of the input word in each book.
"""

#NOTE:: For whatever reason, the words "orc" and "orcs" were converted to "ore" and "ores"
## I suppose this is the drawback of pulling 5 books off the internet


def WordSearch(word):
    word = word.lower() 
    for i in range(len(tokenlist)):
        if i == 0:
            if word in freq_dicts[i]:
                print("------------------------------------------------------")
                print("The Silmarillion:", freq_dicts[i][word], "Out of 148,914 Words")
                print("------------------------------------------------------")
            else:
                print("------------------------------------------------------")
                print("The Silmarillion: 0 Out of 148,914 Words")   
                print("------------------------------------------------------")
        if i == 1:
            if word in freq_dicts[i]:
                print("The Hobbit:", freq_dicts[i][word], "Out of 96,180 Words")
                print("------------------------------------------------------")
            else:
                print("The Hobbit: 0 Out of 96,180 Words")
                print("------------------------------------------------------")
        if i == 2:
            if word in freq_dicts[i]:
                print("The Fellowship of the Ring:", freq_dicts[i][word], "Out of 182,858 Words")
                print("------------------------------------------------------")
            else:
                print("The Fellowship of the Ring: 0 Out of 182,858 Words")
                print("------------------------------------------------------")
        if i == 3:
            if word in freq_dicts[i]:
                print("The Two Towers:", freq_dicts[i][word], "Out of 155,947 Words")
                print("------------------------------------------------------")
            else:
                print("The Two Towers: 0 Out of 155,947 Words")
                print("------------------------------------------------------")
        if i == 4:
            if word in freq_dicts[i]:
                print("The Return of the King:", freq_dicts[i][word], "Out of 132,059 Words")
                print("------------------------------------------------------")
            else:
                print("The Return of the King: 0 Out of 132,059 Words")
                print("------------------------------------------------------")
    if word in entirety_dict:
        print("Entire Corpus:", entirety_dict[word])
        print("------------------------------------------------------")
                
WordSearch("ores")


------------------------------------------------------
The Silmarillion: 130 Out of 148,914 Words
------------------------------------------------------
The Hobbit: 2 Out of 96,180 Words
------------------------------------------------------
The Fellowship of the Ring: 54 Out of 182,858 Words
------------------------------------------------------
The Two Towers: 220 Out of 155,947 Words
------------------------------------------------------
The Return of the King: 61 Out of 132,059 Words
------------------------------------------------------
Entire Corpus: 467
------------------------------------------------------


# Word Clouds

In [119]:
## Lists of words to go to word clouds
silm_cloud = []
hobbit_cloud = []
fellowship_cloud = []
twotowers_cloud = []
return_cloud = []
total_list = []

cloudlist = [silm_cloud, hobbit_cloud, fellowship_cloud, twotowers_cloud, return_cloud]

names = ["gandalf", "merry", "pippin", "frodo", "sam", "aragorn", "faramir", "denethor", "gimli",\
        "legolas", "strider", "boromir", "jowyn", "jomer", "beregond", "gollum", "bilbo", "thorin"] 

for i in range(len(tokenlist)):
    for word in tokenlist[i]:
        word=word.lower()
        if word not in punctuation and word not in stop_words and word.isalpha() and word not in tolkien_stop and word not in names:
            if freq_dicts[i][word] > 5: ## Only words that occur 5 times or more
                cloudlist[i].append(word)
                total_list.append(word)
## Dataframes to send to Tableau

silm_cloud_df = pd.DataFrame(cloudlist[0], columns = ["The Silmarillion"])
hobbit_cloud_df = pd.DataFrame(cloudlist[1], columns = ["The Hobbit"])
fellowship_cloud_df = pd.DataFrame(cloudlist[2], columns = ["The Fellowship of the Ring"])
twotowers_cloud_df = pd.DataFrame(cloudlist[3], columns = ["The Two Towers"])
return_cloud_df = pd.DataFrame(cloudlist[4], columns = ["The Return of the King"])
combined = pd.DataFrame(total_list)


cloud = pd.ExcelWriter('cloud.xlsx')

silm_cloud_df.to_excel(cloud, 'The Silmarillion')
hobbit_cloud_df.to_excel(cloud, 'The Hobbit')
fellowship_cloud_df.to_excel(cloud, 'Fellowship of the Rings')
twotowers_cloud_df.to_excel(cloud, 'The Two Towers')
return_cloud_df.to_excel(cloud, 'The Return of the King')
combined.to_excel(cloud, 'Combined')



cloud.save()



In [216]:
# Find common words within Negative Chapters
negative_tokens = []
negative_words = []

tolkien_stop = ["men","great", "'s", "said", "went", "he", "would", "many", "one", "he", "came", "yet", "even", "shall", \
                "upon", "days", "looked", "n't", "back", "could", "'ll", "'ve", "come", "still", "gate", "'i" ]
names = ["gandalf", "merry", "pippin", "frodo", "sam", "aragorn", "faramir", "denethor", "gimli",\
        "legolas", "strider", "boromir", "jowyn", "jomer", "beregond", "gollum", "bilbo", "thorin"] 
punctuation = ".,;!?:`'()’■''" ## including other symbols

for chapter in negative_chapters:
    negative_tokens.append(nltk.word_tokenize(chapter))
  
for i in range(len(negative_tokens)):
    for word in negative_tokens[i]:
        word = word.lower()
        if word.isalpha() and word not in stop_words and word not in tolkien_stop and word not in names:
            negative_words.append(word)

            
neg_df = pd.DataFrame(negative_words, columns = ["Negative Chapter Words"])                 
neg = pd.ExcelWriter('negative.xlsx')
neg_df.to_excel(neg, 'Sheet 1')
neg.save()

15956


In [219]:
### Positive Word Cloud Exports



positive_chapters = [silm_chapters[15], fellowship_chapters[15], fellowship_chapters[10], fellowship_chapters[23], \
                    return_chapters[14], return_chapters[15], fellowship_chapters[1], silm_chapters[1], silm_chapters[2],\
                    silm_chapters[3], silm_chapters[7]]

positive_tokens = []
positive_words = []

for chapter in positive_chapters:
    positive_tokens.append(nltk.word_tokenize(chapter))
  
for i in range(len(positive_tokens)):
    for word in positive_tokens[i]:
        word = word.lower()
        if word.isalpha() and word not in stop_words and word not in tolkien_stop and word not in names:
            positive_words.append(word)

            
pos_df = pd.DataFrame(positive_words, columns = ["Positive Chapter Words"])                 
pos = pd.ExcelWriter('positive.xlsx')
pos_df.to_excel(pos, 'Sheet 1')
pos.save()


20405


In [214]:
# Word cloud with tolkien stop words
from collections import Counter 

wordlist = []

for word in entirety:
    word = word.lower()
    if word.isalpha() and word not in stop_words:
            wordlist.append(word)


print(len(wordlist))
countlist = []
wordlist = []

temp = Counter(wordlist).most_common(15000)
print(temp)

for word in temp:
    wordlist.append(temp[0])
    countlist.append(temp[1])
    

    
    
count_df = pd.DataFrame(countlist, columns = ['Count'])
word_df = pd.DataFrame(wordlist, columns = ['Word'])

tol = pd.ExcelWriter('tolkien_stop.xlsx')

count_df.to_excel(tol, 'Sheet 1')
word_df.to_excel(tol, 'Sheet 2')
tol.save()


337155
[]
