In [1]:

import re
import pandas as pd
import numpy as np
from collections import defaultdict
from nltk import tokenize


from nltk.sentiment.vader import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt
import matplotlib
plt.style.use('fivethirtyeight')
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from tqdm import tqdm_notebook as tqdm
from tqdm import trange


import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import re


pd.set_option('display.max_rows', 500)



In [5]:
# Sanity check of VADER on a fixed set of example sentences. Each sentence is
# printed left-aligned and padded with '-' to 65 characters, followed by the
# dict returned by polarity_scores().
sentences = ["VADER is smart, handsome, and funny.",  # positive sentence example
             "VADER is smart, handsome, and funny!",  # punctuation emphasis handled correctly (sentiment intensity adjusted)
             "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted)
             "VADER is VERY SMART, handsome, and FUNNY.",  # emphasis for ALLCAPS handled
             "VADER is VERY SMART, handsome, and FUNNY!!!", # combination of signals - VADER appropriately adjusts intensity
             "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!", # booster words & punctuation make this close to ceiling for score
             "VADER is not smart, handsome, nor funny.",  # negation sentence example
             "The book was good.",  # positive sentence
             "At least it isn't a horrible book.",  # negated negative sentence with contraction
             "The book was only kind of good.", # qualified positive sentence is handled correctly (intensity adjusted)
             "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence
             "Today SUX!",  # negative slang with capitalization emphasis
             "Today only kinda sux! But I'll get by, lol", # mixed sentiment example with slang and contrastive conjunction "but"
             "Make sure you :) or :D today!",  # emoticons handled
             "Catch utf-8 emoji such as such as 💘 and 💋 and 😁",  # emojis handled
             "Not bad at all"  # Capitalized negation
             ]

# One analyzer instance is reused for every sentence.
analyzer = SentimentIntensityAnalyzer()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    print("{:-<65} {}".format(sentence, str(vs)))

VADER is smart, handsome, and funny.----------------------------- {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}
VADER is smart, handsome, and funny!----------------------------- {'neg': 0.0, 'neu': 0.248, 'pos': 0.752, 'compound': 0.8439}
VADER is very smart, handsome, and funny.------------------------ {'neg': 0.0, 'neu': 0.299, 'pos': 0.701, 'compound': 0.8545}
VADER is VERY SMART, handsome, and FUNNY.------------------------ {'neg': 0.0, 'neu': 0.246, 'pos': 0.754, 'compound': 0.9227}
VADER is VERY SMART, handsome, and FUNNY!!!---------------------- {'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'compound': 0.9342}
VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!--------- {'neg': 0.0, 'neu': 0.294, 'pos': 0.706, 'compound': 0.9469}
VADER is not smart, handsome, nor funny.------------------------- {'neg': 0.646, 'neu': 0.354, 'pos': 0.0, 'compound': -0.7424}
The book was good.----------------------------------------------- {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'co

In [8]:
def getcorpus(url, timeout=30):
    """Download ``url`` and return its text reduced to letters, apostrophes and spaces.

    The response body is parsed with BeautifulSoup's ``html.parser`` and the
    extracted text has every run of characters other than ASCII letters,
    apostrophes and spaces replaced by a single space.

    Parameters
    ----------
    url : str
        Address of the document to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout,
        ``requests.get`` may block indefinitely on an unresponsive host.

    Returns
    -------
    str
        The cleaned text of the document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status; otherwise an error page
        would be returned as if it were the book text.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on dead links instead of analysing the error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    words = soup.get_text()
    return re.sub("[^a-zA-Z' ]+", " ", words)



# Download the cleaned text of books 2-7, one module-level string per book.
HP2 = getcorpus("http://www.pauladaunt.com/books/Children's/Harry_Potter1-4-1/J.%20K.%20Rowling%20-%20Harry%20Potter%202%20-%20The%20Chamber%20Of%20Secrets.txt")

HP3 = getcorpus("http://www.pauladaunt.com/books/Children's/Harry_Potter1-4/J.%20K.%20Rowling%20-%20Harry%20Potter%203%20-%20Prisoner%20of%20Azkaban.txt")

HP4 = getcorpus("https://cdn.preterhuman.net/texts/literature/books_by_title/N%20-%20S/Rowlings%20Goblet%20of%20Fire.txt")

HP5 = getcorpus("https://raw.githubusercontent.com/bobdeng/owlreader/master/ERead/assets/books/Harry%20Potter%20and%20the%20Order%20of%20the%20Phoenix.txt")

# Use the raw.githubusercontent.com host (as for HP5 and HP7), not the
# github.com/.../blob/... viewer page: the viewer serves an HTML page, so the
# corpus would contain GitHub's page text instead of the book.
HP6 = getcorpus("https://raw.githubusercontent.com/bobdeng/owlreader/master/ERead/assets/books/Harry%20Potter%20and%20The%20Half-Blood%20Prince.txt")

HP7 = getcorpus("https://raw.githubusercontent.com/bobdeng/owlreader/master/ERead/assets/books/Harry%20Potter%20and%20the%20Deathly%20Hallows%20.txt")



In [35]:
########## process individual weirdness
# Book 1 comes from a different source and gets its own clean-up steps.
# NOTE(review): this is a github.com/.../blob/... URL, which presumably serves
# GitHub's HTML viewer page rather than the raw text file — confirm what
# getcorpus() actually returns here.
HP1 = getcorpus("https://github.com/formcept/whiteboard/blob/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt")
# Delete every occurrence of this string (presumably the running page header).
HP1 =HP1.replace('Page     Harry Potter and the Philosophers Stone   J K  Rowling', '') 


# Drop the first 1904 characters.
# NOTE(review): magic offset, presumably front matter; it is only valid for the
# exact text returned by the URL above and must be re-measured if that changes.
HP1 = HP1[1904:]
# NOTE(review): same replacement as before the slice, repeated.
HP1 =HP1.replace('Page     Harry Potter and the Philosophers Stone   J K  Rowling', '') 


In [128]:
def getcorpus_local(local):
    """Read ``<local>.txt`` and return its text without newlines or apostrophes.

    ``local`` is the file path minus the ``.txt`` extension, which is appended
    here. The file is parsed with BeautifulSoup's ``html.parser``; the
    extracted text then has newline characters and apostrophes removed. All
    other punctuation is left in place.
    """
    path = str(local) + '.txt'
    with open(path) as handle:
        raw_text = BeautifulSoup(handle, "html.parser").get_text()
        # Newlines are deleted outright (not replaced by a space).
        joined = re.sub("\\n", "", raw_text)
        # Apostrophes are deleted as well, e.g. "didn't" becomes "didnt".
        return re.sub("\\'", "", joined)
    
    
# Load book one from "Harry Potter and the Sorcerer.txt" (getcorpus_local
# appends the ".txt"); the path is relative to the current working directory.
HPOne = getcorpus_local("Harry Potter and the Sorcerer")

In [130]:
# Collect the word that follows each "CHAPTER " in the book (e.g. "ONE").
# The lookbehind keeps "CHAPTER " itself out of the match.
chapternumbers = re.findall(r"(?<=CHAPTER )(\w+)", HPOne)
chapternumbers

['ONE',
 'TWO',
 'THREE',
 'FOUR',
 'FIVE',
 'SIX',
 'SEVEN',
 'EIGHT',
 'NINE',
 'TEN',
 'ELEVEN',
 'TWELVE',
 'THIRTEEN',
 'FOURTEEN',
 'FIFTEEN',
 'SIXTEEN',
 'SEVENTEEN']

In [131]:
# Capture the upper-case title that follows each "CHAPTER <NUMBER>": between
# one and eight all-caps (A-Z only) words, each followed by whitespace or the
# end of the text. A word with any other character (e.g. a hyphen) ends the title.
# NOTE(review): the two "." after the chapter number skip two characters before
# the capture starts; judging by the displayed values ('HE BOY WHO LIVED',
# 'IAGON ALLEY') this swallows the first letter of every title. Later cells
# consume these values, so check them before changing the pattern.
chapternames = re.findall(r"\sCHAPTER\s\w+..((?:[A-Z]+(?:\s+|$)){1,8})", HPOne)
chapternames

['HE BOY WHO LIVED ',
 'HE VANISHING GLASS ',
 'HE LETTERS FROM NO ONE ',
 'HE KEEPER OF THE KEYS ',
 'IAGON ALLEY ',
 'HE JOURNEY FROM PLATFORM NINE AND ',
 'HE SORTING HAT ',
 'HE POTIONS MASTER ',
 'HE MIDNIGHT DUEL ',
 'ALLOWEEN ',
 'UIDDITCH ',
 'HE MIRROR OF ERISED ',
 'ICOLAS FLAMEL ',
 'ORBERT THE NORWEGIAN RIDGEBACK ',
 'HE FORIBIDDEN FOREST ',
 'HROUGH THE TRAPDOOR ',
 'HE MAN WITH TWO FACES ']

In [132]:
# Split the book on the literal "CHAPTER". Element 0 is whatever precedes the
# first occurrence; each later element starts right after a "CHAPTER".
# Any "CHAPTER" inside the running text would also produce a split.
chaptertext = HPOne.split("CHAPTER")


In [133]:
# Strip the "<NUMBER> <TITLE>" heading from the front of every chapter chunk.
chaptertextformatted = []
for num in range(len(chapternames)):
    # Only the length of this string is used. The two-space joiner presumably
    # stands for the separator plus one title letter that is absent from
    # chapternames[num] — NOTE(review): confirm against the chapternames values.
    heading = chapternumbers[num] + str("  ") + chapternames[num].rstrip()
    # chaptertext[0] is the text before the first chapter, hence num + 1;
    # the slice skips the single character that follows "CHAPTER".
    intext = chaptertext[num+1][1:len(heading)+1]
    #print(intext)
    # maxsplit=1: cut only at the heading itself, so a later repetition of the
    # same text inside the chapter does not truncate the chapter body.
    chaptertextformatted.append(chaptertext[num+1].split(intext, 1)[1])


In [135]:
# Show the first 100 characters of each chapter body as a quick visual check
# that the headings were stripped.
for thing in chaptertextformatted:
    print("Sentance One :", thing[:100])

Sentance One :  Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly norm
Sentance One :  Nearly ten years had passed since the Dursleys had woken up to find their nephew on the front step,
Sentance One :  The escape of the Brazilian boa constrictor earned Harry his longest-ever punishment. By the time h
Sentance One :  BOOM. They knocked again. Dudley jerked awake. "Wheres the cannon?" he said stupidly. There was a c
Sentance One :  Harry woke early the next morning. Although he could tell it was daylight, he kept his eyes shut ti
Sentance One :  THREE-QUARTERS Harrys last month with the Dursleys wasnt fun. True, Dudley was now so scared of Har
Sentance One :  The door swung open at once. A tall, black-haired witch in emerald-green robes stood there. She had
Sentance One :  There, look." "Where?" "Next to the tall kid with the red hair." "Wearing the glasses?" "Did you se
Sentance One :  Harry had never believed he would meet a boy he hated mo

In [136]:
def listOfTuples(l1, l2):
    """Pair the items of ``l1`` and ``l2`` position by position.

    Returns a list of ``(x, y)`` tuples. Pairing stops at the end of the
    shorter input, so surplus items of the longer one are ignored.
    """
    return list(zip(l1, l2))


# One (chapter title, chapter text) pair per chapter, in book order.
namestuple = listOfTuples(chapternames, chaptertextformatted)

In [137]:
# Build the "CHAPTER <NUMBER>" labels, one per entry in chapternames.
chapternamelistformatted = [
    'CHAPTER '+ chapternumbers[num]
    for num in range(len(chapternames))
]

In [138]:
# Map each "CHAPTER <NUMBER>" label to its (chapter title, chapter text) tuple.
BookOne =dict(zip(chapternamelistformatted, namestuple))

# Spot check: display the text (tuple element 1) of chapter two.
BookOne["CHAPTER TWO"][1]



In [66]:
# VADER analyzer instance used by the per-chapter scoring cells.
analyzer = SentimentIntensityAnalyzer()

In [139]:
# Inspection cell: split every chapter into sentences with NLTK and print the
# complete sentence list of each chapter.
# NOTE(review): `sentiments` is initialised here but never read in this cell.
for chapternum in BookOne:
    #print(chapternum, BookOne[chapternum][1][0:100])
    text =BookOne[chapternum][1]  # tuple element 1 is the chapter text
    sentence_list = tokenize.sent_tokenize(text)
    sentiments = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}
    print(sentence_list)

[' Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.', 'They were the last people youd expect to be involved in anything strange or mysterious, because they just didnt hold with such nonsense.', 'Mr. Dursley was the director of a firm called Grunnings, which made drills.', 'He was a big, beefy man with hardly any neck, although he did have a very large mustache.', 'Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.', 'The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.', 'The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it.', 'They didnt think they could bear it if anyone found out about the Potters.', 'Mrs. Potter was Mrs. Dursleys sister, but they hadnt

[' THREE-QUARTERS Harrys last month with the Dursleys wasnt fun.', 'True, Dudley was now so scared of Harry he wouldnt stay in the same room, while Aunt Petunia and Uncle Vernon didnt shut Harry in his cupboard, force him to do anything, or shout at him -- in fact, they didnt speak to him at all.', 'Half terrified, half furious, they acted as though any chair with Harry in it were empty.', 'Although this was an improvement in many ways, it did become a bit depressing after a while.', 'Harry kept to his room, with his new owl for company.', 'He had decided to call her Hedwig, a name he had found in A History of Magic.', 'His school books were very interesting.', 'He lay on his bed reading late into the night, Hedwig swooping in and out of the open window as she pleased.', 'It was lucky that Aunt Petunia didnt come in to vacuum anymore, because Hedwig kept bringing back dead mice.', 'Every night before he went to sleep, Harry ticked off another day on the piece of paper he had pinned to 

[' Christmas was coming.', 'One morning in mid-December, Hogwarts woke to find itself covered in several feet of snow.', 'The lake froze solid and the Weasley twins were punished for bewitching several snowballs so that they followed Quirrell around, bouncing off the back of his turban.', 'The few owls that managed to battle their way through the stormy sky to deliver mail had to be nursed back to health by Hagrid before they could fly off again.', 'No one could wait for the holidays to start.', 'While the Gryffindor common room and the Great Hall had roaring fires, the drafty corridors had become icy and a bitter wind rattled the windows in the classrooms.', 'Worst of all were Professor Snapes classes down in the dungeons, where their breath rose in a mist before them and they kept as close as possible to their hot cauldrons.', '"I do feel so sorry," said Draco Malfoy, one Potions class, "for all those people who have to stay at Hogwarts for Christmas because theyre not wanted at home



In [98]:
# Score every chapter: run VADER on each sentence, average the four scores
# over the chapter's sentences, and store the averages as a third element of
# the chapter's tuple -> (title, text, sentiments).
for chapter in BookOne:
    text = BookOne[chapter][1]
    sentence_list = tokenize.sent_tokenize(text)
    sentiments = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}

    # Accumulate per-sentence scores under the same keys VADER returns.
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        for key in sentiments:
            sentiments[key] += vs[key]

    # Turn the sums into means. A chapter that yields no sentences keeps its
    # all-zero scores instead of raising ZeroDivisionError.
    if sentence_list:
        for key in sentiments:
            sentiments[key] = sentiments[key] / len(sentence_list)

    # Re-reading elements 0 and 1 keeps this cell safe to run more than once:
    # an existing third element is simply replaced.
    BookOne[chapter] = (BookOne[chapter][0], BookOne[chapter][1], sentiments)



In [109]:
# Print every key of BookOne, one per line.
for chapter in BookOne:
    print(chapter)

CHAPTER ONE
CHAPTER TWO
CHAPTER THREE
CHAPTER FOUR
CHAPTER FIVE
CHAPTER SIX
CHAPTER SEVEN
CHAPTER EIGHT
CHAPTER NINE
CHAPTER TEN
CHAPTER ELEVEN
CHAPTER TWELVE
CHAPTER THIRTEEN
CHAPTER FOURTEEN
CHAPTER FIFTEEN
CHAPTER SIXTEEN
CHAPTER SEVENTEEN


In [74]:
# Per-chapter mean compound score, in the dictionary's iteration order.
# Element 2 of each BookOne value is the sentiments dict.
compound_sentiments = [entry[2]['compound'] for entry in BookOne.values()]

In [91]:

# Print each chapter label followed, on the next line, by its full text.
for chapter in BookOne:
    print(chapter, BookOne[chapter][1], sep="\n")

CHAPTER ONE
  Mr  and Mrs  Dursley  of number four  Privet Drive  were proud to say that they were perfectly normal  thank you very much  They were the last people you'd expect to be involved in anything strange or mysterious  because they just didn't hold with such nonsense   Mr  Dursley was the director of a firm called Grunnings  which made drills  He was a big  beefy man with hardly any neck  although he did have a very large mustache  Mrs  Dursley was thin and blonde and had nearly twice the usual amount of neck  which came in very useful as she spent so much of her time craning over garden fences  spying on the neighbors  The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere   The Dursleys had everything they wanted  but they also had a secret  and their greatest fear was that somebody would discover it  They didn't think they could bear it if anyone found out about the Potters  Mrs  Potter was Mrs  Dursley's sister  but they hadn't met f

In [89]:
# Iterate over the chapters with a tqdm progress bar, printing each BookOne
# entry and splitting its text into sentences.
# NOTE(review): `book` is not defined anywhere in the visible part of this
# notebook; on a fresh kernel this line presumably raises NameError — confirm
# where it is meant to come from.
# NOTE(review): `sentence_list` and `sentiments` are computed but never used
# in this cell.
for chapter in tqdm(BookOne, postfix=book):
        print('  ', BookOne[chapter])
        # Remove newline characters before sentence tokenisation.
        text = BookOne[chapter][1].replace('\n', '')
        sentence_list = tokenize.sent_tokenize(text)
        sentiments = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))

   ('THE BOY WHO LIVED  ', "  Mr  and Mrs  Dursley  of number four  Privet Drive  were proud to say that they were perfectly normal  thank you very much  They were the last people you'd expect to be involved in anything strange or mysterious  because they just didn't hold with such nonsense   Mr  Dursley was the director of a firm called Grunnings  which made drills  He was a big  beefy man with hardly any neck  although he did have a very large mustache  Mrs  Dursley was thin and blonde and had nearly twice the usual amount of neck  which came in very useful as she spent so much of her time craning over garden fences  spying on the neighbors  The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere   The Dursleys had everything they wanted  but they also had a secret  and their greatest fear was that somebody would discover it  They didn't think they could bear it if anyone found out about the Potters  Mrs  Potter was Mrs  Dursley's sister  but t

"  Mr  and Mrs  Dursley  of number four  Privet Drive  were proud to say that they were perfectly normal  thank you very much  They were the last people you'd expect to be involved in anything strange or mysterious  because they just didn't hold with such nonsense   Mr  Dursley was the director of a firm called Grunnings  which made drills  He was a big  beefy man with hardly any neck  although he did have a very large mustache  Mrs  Dursley was thin and blonde and had nearly twice the usual amount of neck  which came in very useful as she spent so much of her time craning over garden fences  spying on the neighbors  The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere   The Dursleys had everything they wanted  but they also had a secret  and their greatest fear was that somebody would discover it  They didn't think they could bear it if anyone found out about the Potters  Mrs  Potter was Mrs  Dursley's sister  but they hadn't met for several 