In [1]:
## Josh Blaz -- LOTR
## CS401 -- NLP

import nltk
import re
import urllib.request
import lxml.html as lh
import io
import requests
import os
import glob

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from nltk.corpus import stopwords
from collections import Counter 

from sklearn.decomposition import LatentDirichletAllocation

#NOTE: Sentiment140 Polarity values: 0: negative, 2: neutral, 4: positive

#### **NOTE: Elvish text is translated awkwardly into the .txt format    
</br>
##### IE:   
</br>
#### ►M MPR -F+MTRX MP ft PPtK P&RMPht: P. t. The last Two runes are the initials of Thror and Thrain.**

In [2]:
# Used Chapterize to split books into chapters 
## https://github.com/JonathanReeve/chapterize
### Chapterize didn't work 100% perfectly, so I had to go through and the prologues back in when it cut them out

# These are lists containing strings of every chapter for each book
silm_chapters = []
hobbit_chapters = []
fellowship_chapters = []
twotowers_chapters = []
return_chapters = []

# Paths to directories storing book chapters
list_of_paths = ['/Users/blaz/Desktop/LOTR/silmarillion-chapters', '/Users/blaz/Desktop/LOTR/hobbit-chapters',\
                '/Users/blaz/Desktop/LOTR/fellowship-chapters', '/Users/blaz/Desktop/LOTR/twotowers-chapters',\
                '/Users/blaz/Desktop/LOTR/return-chapters']

for path in list_of_paths: # iterate through the list of folder paths for each book
    for file in sorted(glob.glob(os.path.join(path,'*.txt'))): # This gives us a sorted list of the files in each directory                                                         
        f = open(file, 'r') # open and read file               # allowing us to read in the chapters in order.
        txt = f.read()
        ## determine which path we're using and append it to the correct book chapter list
        if path == '/Users/blaz/Desktop/LOTR/silmarillion-chapters': 
            # Because of an issue with 'glob', I had to create a copy of the final chapter in The Silmarillion
            silm_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/hobbit-chapters':
            hobbit_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/fellowship-chapters': 
            fellowship_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/twotowers-chapters': 
            twotowers_chapters.append(txt)
        elif path == '/Users/blaz/Desktop/LOTR/return-chapters': 
            return_chapters.append(txt)

In [3]:
# Store chapter names for use in dataframes later

silm_chapter_names = ["Ainundalë", "Valaquenta", "Of the Beginning of Days", "Of Aulë and Yavanna" , "Of the Coming of the Elves and the Captivity of Melkor",\
                     "Of Thingol and Melian", "Of Eldamar and the Princes of the Eldalië", "Of Fëanor and the Unchaining of Melkor", "Of the Silmarils and the Unrest of the Noldor",\
                     "Of the Darkening of Valinor", "Of the Flight of the Noldor", "Of the Sindar", "Of the Sun and Moon and the Hiding of Valinor", "Of Men", "Of the Return of the Noldor",\
                     "Of Beleriad and its Realms", "Of the Noldor in Beleriad", "Of Maeglin", "Of the Coming of Men into the West", "Of the Ruin of Beleriad and the Fall of Fingolfin", "Of Beren and Lúthien",\
                     "Of the Fifth Battle: Nirnaeth Arnoediad", "Of Túrin Turambar", "Of the Ruin of Doriath", "Of Tuor and the Fall of Gondolin", "Of the Voyage of Eärendil and the War of Wrath", \
                     "Akallabêth: The Downfall of Númenor", "Of the Rings of Power and the Third Age"]

hobbit_chapter_names = ["An Unexpected Party", "Roast Mutton", "A Short Rest", "Over Hill and Under Hill", "Riddles In The Dark", \
                       "Out Of The Frying-Pan Into The Fire", "Queer Lodgings", "Flies And Spiders", "Barrels Out Of Bond", "A Warm Welcome", \
                       "On The Doorstep", "Inside Information", "Not At Home", "Fire And Water", "The Gathering Of The Clouds", "A Thief In The Night", \
                       "The Clouds Burst", "The Return Journey", "The Last Stage"]

fellowship_chapter_names = ["Concerning Hobbits", "Concerning Pipeweed", "Of the Ordering of the Shire", "Note on the Shire Records", "A Long-expected Party", "The Shadow of the Past", \
                           "Three is Company", "A Short Cut to Mushrooms", "A Conspiracy Unmasked", "The Old Forest", "In the House of Tom Bombadil", "Fog on the Barrow-downs", "At the Sign of the Prancing Pony",\
                           "Strider", "A Knife in the Dark", "Flight to the Ford", "Many Meetings", "The Council of Elrond", "The Ring goes South", "A Journey in the Dark", "The Bridge of Khazad-dûm", \
                           "Lothlórien", "The Mirror of Galadriel", "Farewell to Lórien", "The Great River", "The Breaking of the Fellowship"]

twotowers_chapter_names = ["The Departure of Boromir", "The Riders of Rohan", "The Uruk-hai", "Treebeard", "The White Rider", "The King of the Golden Hall", "Helm's Deep", "The Road to Isengard", "Flotsam and Jetsam", \
                          "The Voice of Saruman", "The Palantír", "The Taming of Smeagol", "The Passage of the Marshes", "The Black Gate is Closed", "Of Herbs and Stewed Rabbit", "The Window of the West", "The Forbidden Pool", \
                          "Journey to the Cross-roads", "The Stairs to Cirith Ungol", "Shelob's Lair", "The Choices of Master Samwise"]

return_chapter_names = ["Minas Tirith", "The Passing of the Grey Company", "The Muster of Rohan", "The Siege of Gondor", "The Ride of Rohirrim", "The Battle of the Pelennor Fields", "The Pyre of Denethor",\
                       "The Houses of Healing", "The Last Debate", "The Black Gate Opens", "The Tower of Cirith Ungol", "The Land of Shadow", "Mount Doom", "The Field of Cormallen", "The Steward and the King", \
                       "Many Partings", "Homeward Bound", "Scouring of the Shire", "The Grey Havens"]


# Sentiment Analysis

## **Steps**
</br>
### ** 1. Segment all chapters into page-sized objects    **    
</br>
### ** 2. Send all segments to Sentiment140 API by chapter    **   
</br>
### ** 3. Calculate polarity averages and polarity lists. **   
</br>
### ** 4. Store API polarity ratings and send export to csv to plot**    

In [4]:
"""
Function that segments given chapter into a page-sized (2940 character) segments to be sent to the API.

Parameters - chapter - chapter of a book to be broken into segments
           - cut - length that we segment the text with
       
Returns a list of (string) segments of the chapter.
"""
def Segmenter(chapter, cut):
    segments = []
    # start and end indices for segmenting the text
    start = 0
    end = cut
    while end < len(chapter) + cut:
        segments.append(chapter[start:end])
        start = end
        end = end + cut
    return segments #segments of input chapter

**This function allows us to split the chapters of each book into segments to send in our HTTP-Post JSON requests.   
</br>
I chose 2940 as the length because this is the exact number of characters per page (including spaces) in my copy of Fellowship of the Ring. **

In [5]:
# Lists of Lists of Lists storing all segments of all chapters for each book
# [[chapter1 segment 0-2500, chap1, segmenet 2500-5000]... [chapter2 segment0-2500, ...]...]
silm_segments = []
hobbit_segments = []
fellowship_segments = []
twotowers_segments = []
return_segments = []

# List containing the lists storing each books' chapters
list_of_books = [silm_chapters, hobbit_chapters, fellowship_chapters, twotowers_chapters, return_chapters]
# List allowing us to access the segment lists
list_of_segments = [silm_segments, hobbit_segments, fellowship_segments, twotowers_segments, return_segments]

for i in range(len(list_of_books)):
    for chapter in list_of_books[i]: # Segment entire chapter using Segmenter function, with 2940 character cuts
        list_of_segments[i].append(Segmenter(chapter,2940))


**In the cell above I create lists of lists for segments of each chapter of each corpus or book, and append to them using my "Segmenter"
function, storing them neatly like this will allow me to iteratively query the API server. **

In [6]:
"""
Function that sends segments of 1 chapter through the Sentiment140 API.
In order to do so, it creates and appends segments to a JSON file, then posts the JSON queries to the API server
using requests module (using an HTTP Post)

Parameters - chapter_segments - segments of an entire chapter of a book

Returns a list of polarities for segments of the chapter, as well as the polarity average for the chapter

Note: Maximum of 700,000 characters per API request, though this shouldn't be a problem
"""

def Polarity(chapter_segments): # segments of a single chapter
    request = {'data':[]}
    polarityList = []
    counter = 0
    for segment in chapter_segments: # Fill JSON
        request['data'].append({'text':segment})
    r = requests.post('http://www.sentiment140.com/api/bulkClassifyJson?appid=blaz_j1@denison.edu', json=request)
    jso = r.json()
    for i in range(len(request['data'])-1):
        polarityList.append(jso['data'][i]['polarity'])
    
    polarityTotal = 0
    for value in polarityList:
        polarityTotal = polarityTotal + value
    
    polarityAVG = polarityTotal/len(polarityList)
    #print(polarityList)
    return polarityList, polarityAVG

In [7]:
# This function takes about a minute to run

# store all averages, then store chapter avg, also overall average
silm_polarity_avg = []
hobbit_polarity_avg = []
fellowship_polarity_avg = []
twotowers_polarity_avg = []
return_polarity_avg = []

silm_polarity_lists = []
hobbit_polarity_lists = []
fellowship_polarity_lists = []
twotowers_polarity_lists = []
return_polarity_lists = []
### Need to get chapter names in

for x in range(len(list_of_books)):
    book = list_of_books[x]
    segs = list_of_segments[x]
    
    for i in range(len(book)):
        if x == 0:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            silm_polarity_lists.append(temp1)
            silm_polarity_avg.append(temp2)
        if x == 1:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            hobbit_polarity_lists.append(temp1)
            hobbit_polarity_avg.append(temp2)
        if x == 2:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            fellowship_polarity_lists.append(temp1)
            fellowship_polarity_avg.append(temp2)
        if x == 3:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            twotowers_polarity_lists.append(temp1)
            twotowers_polarity_avg.append(temp2)
        if x == 4:
            temp1 = []
            temp2 = 0.0
            temp1,temp2 = Polarity(segs[i])
            return_polarity_lists.append(temp1)
            return_polarity_avg.append(temp2)

all_polarity_avgs = [silm_polarity_avg, hobbit_polarity_avg, fellowship_polarity_avg, twotowers_polarity_avg, return_polarity_avg]

all_polarity_lists = [silm_polarity_lists, hobbit_polarity_lists, fellowship_polarity_lists, twotowers_polarity_lists, return_polarity_lists]  
# chapter 3 of return of the king is super dark

silm
hobbit
fellowship
two towers
return of the king


In [8]:
# Converting AVG data into pandas dataframes

silm_df = pd.DataFrame(silm_polarity_avg, index = silm_chapter_names, columns = ["Polarity"])
silm_df = silm_df.rename_axis("--- The Silmarillion ---")
#silm_df.to_csv("silm_df.csv")

hobbit_df = pd.DataFrame(hobbit_polarity_avg, index = hobbit_chapter_names, columns = ["Polarity"])
hobbit_df = hobbit_df.rename_axis("--- The Hobbit ---")
#hobbit_df.to_csv("hobbit_df.csv")

fellowship_df = pd.DataFrame(fellowship_polarity_avg, index = fellowship_chapter_names, columns = ["Polarity"])
fellowship_df = fellowship_df.rename_axis("--- The Fellowship of the Ring ---")
# Prologue chapters have weird polarities - have solid values because they're shorter
#fellowship_df.to_csv("fellowship_df.csv")

twotowers_df = pd.DataFrame(twotowers_polarity_avg, index = twotowers_chapter_names, columns = ["Polarity"])
twotowers_df = twotowers_df.rename_axis("--- The Two Towers ---")
#twotowers_df.to_csv("twotowers_df.csv")

return_df = pd.DataFrame(return_polarity_avg, index = return_chapter_names, columns = ["Polarity"])
return_df = return_df.rename_axis("--- The Return of the King ---")
#return_df.to_csv("return_df.csv")
# Really dark novel

# Dataframe of all Books overlaid
books_df = [silm_df, hobbit_df, fellowship_df, twotowers_df, return_df]
#full_df = pd.concat(books_df)

# Export to CSV
#full_df.to_csv("full_df.csv")

"""
# Export to Excel
pd.read_excel('file.xlsx')
full_df.to_csv("full_df.csv")
pd.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1')
"""

# Add another column for topic, once topic modeling is complete

'\n# Export to Excel\npd.read_excel(\'file.xlsx\')\nfull_df.to_csv("full_df.csv")\npd.to_excel(\'dir/myDataFrame.xlsx\', sheet_name=\'Sheet1\')\n'

In [99]:
silm_pl_df = pd.DataFrame(silm_polarity_lists, index = silm_chapter_names)
#print(silm_pl_df)

print(silm_pl_df[2])
#silm_pl_df.to_csv("silm_pl_df_new.csv")


# Need help plotting this

Ainundalë                                                 0.0
Valaquenta                                                4.0
Of the Beginning of Days                                  4.0
Of Aulë and Yavanna                                       0.0
Of the Coming of the Elves and the Captivity of Melkor    4.0
Of Thingol and Melian                                     NaN
Of Eldamar and the Princes of the Eldalië                 4.0
Of Fëanor and the Unchaining of Melkor                    0.0
Of the Silmarils and the Unrest of the Noldor             0.0
Of the Darkening of Valinor                               0.0
Of the Flight of the Noldor                               0.0
Of the Sindar                                             0.0
Of the Sun and Moon and the Hiding of Valinor             4.0
Of Men                                                    NaN
Of the Return of the Noldor                               2.0
Of Beleriad and its Realms                                4.0
Of the N

In [10]:
"""
Function that tokenizes the words of every chapter in a book.

Parameters - book - a book.

Returns a List of Lists storing a tokenized list for every chapter in a book.
"""

def Tokenize(book):
    punctuation = ".,;!?:`'()"
    token_list = []
    for chapter in book:
        temp = []
        words = nltk.word_tokenize(chapter)
        for word in words:
            word = word.lower()
            if word not in punctuation and not word.isnumeric(): # remove punctuation
                temp.append(word)
        token_list.append(temp)
        
    return token_list


#tokens = (Tokenize(silm_chapters))
#print(tokens[0]) ## Tokens of first chapter of The Silmarillion

In [32]:
"""
Function that returns the n most common words for every chapter in a book.
This is accomplished by using 'Counter' in the 'Collections' module.

Parameter - book - a tokenized list of lists of all chapters of a book
          - n - number of most common words in the chapter
          
Returns a List of Lists of the n most common words of every chapter in the book.
"""

def MostCommon(book, n):
    from collections import Counter 
    stop_words = set(stopwords.words('english')) 
    
    names = ["gandalf", "merry", "pippin", "frodo", "sam", "aragorn", "faramir", "denethor", "gimli", "legolas"] # list of common character names
    
    tolkien_stop = ["men","great", "'s", "said", "went", "he", "would", "many", "one", "he", "came", "yet", "even", "shall", \
                   "upon", "days", "looked", "n't", "back", "could", "'ll", "'ve", "come", "still", "gate", "'i" ]
    # Have to get rid of a lot of words, I call these "tolkien stop words", the silmarillion is full of these,
    # in LOTR, 'great' and 'men' appear very often
    
    common_words = []
    for chapter in book:
        temp = []
        for word in chapter:
            if word not in stop_words and word not in names and word not in tolkien_stop:
                temp.append(word)
                  
        common_words.append(Counter(temp).most_common(10))
    return common_words

In [35]:
## Find top 5 NON-STOP words per chapter

silm_chapters_tokenized = []
hobbit_chapters_tokenized = []
fellowship_chapters_tokenized = []
twotowers_chapters_tokenized = []
return_chapters_tokenized = []


silm_chapters_common = []
hobbit_chapters_common = []
fellowship_chapters_common = []
twotowers_chapters_common = []
return_chapters_common = []



for i in range(len(list_of_books)):
    if i == 0:
        silm_chapters_tokenized = Tokenize(list_of_books[i])
        silm_chapters_common = MostCommon(silm_chapters_tokenized, 5)
    if i == 1:
        hobbit_chapters_tokenized = Tokenize(list_of_books[i])
        hobbit_chapters_common = MostCommon(hobbit_chapters_tokenized, 5)
    if i == 2:
        fellowship_chapters_tokenized = Tokenize(list_of_books[i])
        fellowship_chapters_common = MostCommon(fellowship_chapters_tokenized, 5)
    if i == 3:
        twotowers_chapters_tokenized = Tokenize(list_of_books[i])
        twotowers_chapters_common = MostCommon(twotowers_chapters_tokenized, 5)
    if i == 4:
        return_chapters_tokenized = Tokenize(list_of_books[i])
        return_chapters_common = MostCommon(return_chapters_tokenized, 5)


# Try leaving tolkien words in and show most common words across entire series

# Topic Modeling

LDA - Latent Dirichlet Allocation   
</br>
-- remove words that are %15 of the corpus

In [75]:
#import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import mglearn

In [62]:
# Trying out Topic Modeling on Return of the King first (entire text)

ret = open('return.txt', 'r')

vect = CountVectorizer(max_features=10000, max_df=.15)
return_of_the_king = vect.fit_transform(ret)
# Count Vectorizer removes all of the words that appear in at least 15% of the text

lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=0)

document_topics = lda.fit_transform(ret)


In [105]:
## Topic modeling on certain chapters of LOTR
path= '/Users/blaz/Desktop/LOTR/return-chapters'

for file in sorted(glob.glob(os.path.join(path,'*.txt'))):
    if file == "/Users/blaz/Desktop/LOTR/return-chapters/13.txt":
        break
        
#print(file)

MountDoomFile = open("/Users/blaz/Desktop/LOTR/return-chapters/13.txt", 'r')

vect = CountVectorizer(max_features=10000, max_df=.15)
MountDoom = vect.fit_transform(MountDoomFile)
# Count Vectorizer removes all of the words that appear in at least 15% of the text

lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
                                max_iter=25, random_state=0)

document_topics = lda.fit_transform(MountDoom)


print("lda.components_.shape: {}".format(lda.components_.shape))
lda.components_.shape: (10, 10000)

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(4), feature_names=feature_names, 
                           sorting=sorting, topics_per_chunk=5, n_words=10)


lda.components_.shape: (10, 1618)
topic 0       topic 1       topic 2       topic 3       
--------      --------      --------      --------      
was           him           that          it            
on            with          it            said          
as            now           sam           with          
at            frodo         had           last          
all           at            for           its           
frodo         it            could         them          
with          in            on            sam           
sam           for           in            at            
for           but           as            my            
now           be            go            down          




# Testing

In [None]:
### Example JSON request
#NOTE: Sentiment140 Polarity values: 0: negative, 2: neutral, 4: positive

d = {'data':[{'text':'the titanic was ok'}, {'text':'this sucks'}]}
d['data'].append({'text':"Happy day!"})

r = requests.post('http://www.sentiment140.com/api/bulkClassifyJson?appid=blaz_j1@denison.edu', json=d)
js = r.json()

print(js['data'])

In [None]:
### Example of accessing the polarities

for i in range(len(d['data'])):
    print("Text:", js['data'][i]['text'], "\nPolarity:", js['data'][i]['polarity'])

In [None]:
## Segment list indexing examples

#print(hobbit_segments[15]) ## -- chapter
#print(hobbit_segments[0][0]) ## -- segments of chapter

In [None]:
#Working with 'glob'

path= '/Users/blaz/Desktop/LOTR/silmarillion-chapters'
silm_chapters = []

for file in sorted(glob.glob(os.path.join(path,'*.txt'))):
    print(file)
    f = open(file, 'r')
    txt = f.read()
    silm_chapters.append(txt)
    
print(len(silm_chapters))
print(silm_chapters[len(silm_chapters)-1])

In [None]:
# Using 'Counter'

biglist = []
for chapter in return_chapters_common:
    for word in chapter:
        biglist.append(word)
        
Counter(biglist).most_common(10)   