In [361]:
## Josh Blaz -- LOTR
import nltk
import re
import urllib.request
import lxml.html as lh
import io
import requests
import os
import glob

#NOTE: Sentiment140 Polarity values: 0: negative, 2: neutral, 4: positive

**NOTE: Elvish text is translated awkwardly into the .txt format    
</br>
IE:   
</br>
►M MPR -F+MTRX MP ft PPtK P&RMPht: P. t. The last Two runes are the initials of Thror and Thrain.**

**More notes on the corpora:**   
</br>
There are some slight discrepancies between the corpora that I used and those that LOTRproject used.  
</br>
For whatever reason, the version of The Silmarillion that I used is missing the first two chapters of Book 1:
Ainulindalë and Valaquenta.  
</br>
Also, my copy of The Fellowship of the Ring is missing the prologue written by J.R.R. Tolkien: "Concerning Hobbits", 
"Concerning Pipe-weed", "Of the Ordering of the Shire", and "Note on the Shire Records".

In [390]:
# Used Chapterize to split books into chapters
## https://github.com/JonathanReeve/chapterize
### The editions of the books I chose are slightly different from the editions chosen by LOTR project; the editions I
### used have chapters that are merged together as sections of one another.

# These are lists containing the text of every chapter for each book
silm_chapters = []
hobbit_chapters = []
fellowship_chapters = []
twotowers_chapters = []
return_chapters = []

# Paths to directories storing book chapters
list_of_paths = ['/Users/blaz/Desktop/LOTR/silmarillion-chapters', '/Users/blaz/Desktop/LOTR/hobbit-chapters',
                 '/Users/blaz/Desktop/LOTR/fellowship-chapters', '/Users/blaz/Desktop/LOTR/twotowers-chapters',
                 '/Users/blaz/Desktop/LOTR/return-chapters']

# Chapter lists in the same order as list_of_paths, so each directory can be
# paired with the list that receives its chapters.
chapter_lists = [silm_chapters, hobbit_chapters, fellowship_chapters, twotowers_chapters, return_chapters]

for path, chapters in zip(list_of_paths, chapter_lists):
    # sorted() orders the file names lexicographically so chapters are read in order.
    # NOTE(review): this assumes the chapter file names sort in reading order
    # (e.g. zero-padded numbers) -- confirm against the Chapterize output.
    for chapter_file in sorted(glob.glob(os.path.join(path, '*.txt'))):
        # The context manager closes each file as soon as it has been read
        with open(chapter_file, 'r') as f:
            chapters.append(f.read())

# Number of Fellowship chapters loaded, followed by the text of the last one
# NOTE(review): an earlier comment here said 26, but the recorded output is 22 -- confirm the expected count.
print(len(fellowship_chapters))
print(fellowship_chapters[-1])

22



Aragorn led them to the right arm of the River. Here upon its western 
side under the shadow of Tol Brandir a green lawn ran down to the water from 
the feet of Amon Hen. Behind it rose the first gentle slopes of the hill 
clad with trees, and trees marched away westward along the curving shores of 
the lake. A little spring fell tumbling down and fed the grass. 

'Here we will rest tonight,' said Aragorn. 'This is the lawn of Parth 
Galen: a fair place in the summer days of old. Let us hope that no evil has 
yet come here.' 

They drew up their boats on the green banks, and beside them they made 
their camp. They set a watch, but had no sight nor sound of their enemies. 

If Gollum had contrived to follow them, he remained unseen and unheard. 
Nonetheless as the night wore on Aragorn grew uneasy, tossing often in his 
sleep and waking. In the small hours he got up and came to Frodo, whose turn 
it was to watch. 

'Why are you waking? ' asked Frodo. 'It is not your watch.' 

'I d

# Sentiment Analysis

**Now, for each book, we have lists containing all of the chapters for that book.  
</br>
This was accomplished by iterating through the directories that store the chapters for each book using the "glob" and "os" modules.**

In [371]:
"""
Function that segments given chapter into a 2500 character segments to be sent to the API.

Params - chapter is a chapter of a book to be broken into segments

Returns a list of (string) segments of the chapter.
"""
def Segmenter(chapter, cut):
    segments = []
    # start and end indices for segmenting the text
    start = 0
    end = cut
    while end < len(chapter) + cut:
        segments.append(chapter[start:end])
        start = end
        end = end + cut
    
    return segments #segments of input chapter

**This function allows us to split the chapters of each book into segments to send in our HTTP-Post JSON requests.   
</br>
I chose 2940 as the length because this is the exact number of characters per page (including spaces) in my copy of Fellowship of the Ring.**

In [376]:
# Number of characters per segment passed to Segmenter
SEGMENT_LENGTH = 2940

# List containing the lists storing each book's chapters
list_of_books = [silm_chapters, hobbit_chapters, fellowship_chapters, twotowers_chapters, return_chapters]

# Lists of lists storing all segments of all chapters for each book:
# [[chapter1 segment 0, chapter1 segment 1, ...], [chapter2 segment 0, ...], ...]
# Each list is rebuilt from scratch, so every chapter is segmented exactly once
# and re-running this cell does not append duplicates.
silm_segments = [Segmenter(chapter, SEGMENT_LENGTH) for chapter in silm_chapters]
hobbit_segments = [Segmenter(chapter, SEGMENT_LENGTH) for chapter in hobbit_chapters]
fellowship_segments = [Segmenter(chapter, SEGMENT_LENGTH) for chapter in fellowship_chapters]
twotowers_segments = [Segmenter(chapter, SEGMENT_LENGTH) for chapter in twotowers_chapters]
return_segments = [Segmenter(chapter, SEGMENT_LENGTH) for chapter in return_chapters]

# List allowing us to access the segment lists, in the same order as list_of_books
list_of_segments = [silm_segments, hobbit_segments, fellowship_segments, twotowers_segments, return_segments]
            

**In the cell above I create lists of lists for the segments of each chapter of each corpus or book, and append to them using my "Segmenter"
function. Storing them neatly like this will allow me to iteratively query the API server.**

In [373]:
"""
Function that sends segments of 1 chapter through the Sentiment140 API.
In order to do so, it creates a JSON file and adds these segments to the file, then it sends an HTTP post to the 
API using the requests module.

Returns 

Note: Maximum of 700,000 characters per API request, though this shouldn't be a problem
"""

def Polarity(chapter_segments): # segments of a single chapter
    request = {'data':[]}
    polarityList = []
    counter = 0
    for segment in chapter_segments: # Fill JSON
        request['data'].append({'text':segment})
    r = requests.post('http://www.sentiment140.com/api/bulkClassifyJson?appid=blaz_j1@denison.edu', json=request)
    jso = r.json()
    for i in range(len(request['data'])-1):
        polarityList.append(jso['data'][i]['polarity'])
    
    polarityTotal = 0
    for value in polarityList:
        polarityTotal = polarityTotal + value
    
    polarityAVG = polarityTotal/len(polarityList)
    #print(polarityList)
    return polarityList, polarityAVG

In [374]:
# This cell takes about a minute to run: it calls Polarity once per chapter

# Per-chapter average polarity for each book
silm_polarity_avg = []
hobbit_polarity_avg = []
fellowship_polarity_avg = []
twotowers_polarity_avg = []
return_polarity_avg = []

# Per-chapter lists of segment polarities for each book
silm_polarity_lists = []
hobbit_polarity_lists = []
fellowship_polarity_lists = []
twotowers_polarity_lists = []
return_polarity_lists = []
# TODO: need to get chapter names in

# Result lists in the same order as list_of_books / list_of_segments
list_of_polarity_lists = [silm_polarity_lists, hobbit_polarity_lists, fellowship_polarity_lists,
                          twotowers_polarity_lists, return_polarity_lists]
list_of_polarity_avgs = [silm_polarity_avg, hobbit_polarity_avg, fellowship_polarity_avg,
                         twotowers_polarity_avg, return_polarity_avg]

for book, segs, polarity_lists, polarity_avgs in zip(list_of_books, list_of_segments,
                                                     list_of_polarity_lists, list_of_polarity_avgs):
    # Only the first len(book) entries are used, so each chapter is classified
    # once even if the segment list holds more entries than the book has chapters.
    for chapter_segments in segs[:len(book)]:
        chapter_polarities, chapter_avg = Polarity(chapter_segments)
        polarity_lists.append(chapter_polarities)
        polarity_avgs.append(chapter_avg)

# chapter 3 of return of the king is super dark
            
# chapter 3 of return of the king is super dark

In [380]:
# Segment polarities and chapter averages for The Return of the King
print(return_polarity_lists)
print(return_polarity_avg)
# Number of polarity values recorded for each chapter
for chapter_polarities in return_polarity_lists:
    segment_count = len(chapter_polarities)
    print(segment_count)

[[0, 0, 4, 4, 4, 0, 0, 0, 4, 0, 4, 0, 4, 4, 4, 0, 0, 4, 0, 0, 4, 4, 0], [0, 0, 4, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 4, 0, 0, 0, 0, 4, 0, 4, 4, 0], [4, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 2], [4, 0, 0, 0, 0, 4, 0, 4, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 4, 0, 0, 4, 4, 4, 0], [0, 0, 4, 0, 4, 0, 4, 4, 4, 4, 4, 4, 0], [4, 4, 2, 4, 2, 0, 2, 4, 0, 4, 0, 0, 4], [0, 4, 0, 0, 4, 2, 0], [0, 0, 2, 0, 0, 0, 4, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0, 2, 0], [0, 0, 0, 4, 4, 2, 4, 4]]
[1.9130434782608696, 0.6666666666666666, 1.3333333333333333, 0.7619047619047619, 0.0, 0.0, 0.0, 2.0, 1.7777777777777777, 0.0, 0.47058823529411764, 0.13333333333333333, 0.0, 2.0, 2.4615384615384617, 2.3076923076923075, 1.4285714285714286, 0.7, 2.25]
23
1

# Topic Modeling

# Testing

In [350]:
### Example JSON request
#NOTE: Sentiment140 Polarity values: 0: negative, 2: neutral, 4: positive

# Payload with one {'text': ...} entry per sample sentence
d = {'data': [{'text': sample} for sample in ('the titanic was ok', 'this sucks', "Happy day!")]}

# Post the payload to the bulk-classification endpoint and decode the JSON reply
r = requests.post('http://www.sentiment140.com/api/bulkClassifyJson?appid=blaz_j1@denison.edu', json=d)
js = r.json()

print(js['data'])

[{'text': 'the titanic was ok', 'polarity': 2, 'meta': {'language': 'en'}}, {'text': 'this sucks', 'polarity': 0, 'meta': {'language': 'en'}}, {'text': 'Happy day!', 'polarity': 4, 'meta': {'language': 'en'}}]


In [351]:
### Example of accessing the polarities

# Walk the response by the index of each entry that was sent in the request
for i in range(len(d['data'])):
    classified = js['data'][i]
    print("Text:", classified['text'], "\nPolarity:", classified['polarity'])

Text: the titanic was ok 
Polarity: 2
Text: this sucks 
Polarity: 0
Text: Happy day! 
Polarity: 4


In [None]:
## Segment list indexing examples

#print(hobbit_segments[15]) ## -- chapter
#print(hobbit_segments[0][0]) ## -- segments of chapter