# Sentiment analysis

#### Importing the libraries for sentiment analysis

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer # imports WordNetLemmatizer
from nltk.corpus import wordnet as wn # imports Wordnet corpus
from nltk.corpus import sentiwordnet as swn # imports SentiWordNet
from nltk.sentiment.vader import SentimentIntensityAnalyzer #imports the Vader module for analyzing sentiment



#### Importing the libraries for opening up the file and common processing tasks 

In [2]:
import nltk
from IPython.display import display
import string
from string import punctuation
import urllib2
import os
import re
import collections
from collections import OrderedDict

### Preprocessing the text

In [3]:
url = "https://www.gutenberg.org/files/140/140-0.txt" #assigns a link to a variable
data = urllib2.urlopen(url.encode('utf-8')).read() #opens up the link/file
data = data.replace("\r\n", " ")
data = re.sub('^(.*?)START OF THIS PROJECT GUTENBERG EBOOK', '', data) #removes Project Gutenberg metadata
data = re.sub('End of the Project Gutenberg(.*?)$', '', data)#removes Project Gutenberg metadata
data = unicode(data, errors="ignore") # ignores encoding errors
data[:10000]
#Alternative texts to use
#https://www.gutenberg.org/files/140/140-0.txt The Jungle by Upton Sinclair
# https://www.gutenberg.org/files/1342/1342-0.txt Pride and Prejudice by Jane Austen
#https://www.gutenberg.org/files/543/543-0.txt The Main Street by Sinclair Lewis

u' THE JUNGLE ***     Produced by David Meltzer, Christy Phillips, Scott Coulter, Leroy Smith and David Widger       THE JUNGLE  by Upton Sinclair   (1906)      Chapter 1   It was four oclock when the ceremony was over and the carriages began to arrive. There had been a crowd following all the way, owing to the exuberance of Marija Berczynskas. The occasion rested heavily upon Marijas broad shoulders--it was her task to see that all things went in due form, and after the best home traditions; and, flying wildly hither and thither, bowling every one out of the way, and scolding and exhorting all day with her tremendous voice, Marija was too eager to see that others conformed to the proprieties to consider them herself. She had left the church last of all, and, desiring to arrive first at the hall, had issued orders to the coachman to drive faster. When that personage had developed a will of his own in the matter, Marija had flung up the window of the carriage, and, leaning out, proceede

In [4]:
sentences = nltk.sent_tokenize(data) #sentence tokenization

In [5]:
sentences[0:150]

[u' THE JUNGLE ***     Produced by David Meltzer, Christy Phillips, Scott Coulter, Leroy Smith and David Widger       THE JUNGLE  by Upton Sinclair   (1906)      Chapter 1   It was four oclock when the ceremony was over and the carriages began to arrive.',
 u'There had been a crowd following all the way, owing to the exuberance of Marija Berczynskas.',
 u'The occasion rested heavily upon Marijas broad shoulders--it was her task to see that all things went in due form, and after the best home traditions; and, flying wildly hither and thither, bowling every one out of the way, and scolding and exhorting all day with her tremendous voice, Marija was too eager to see that others conformed to the proprieties to consider them herself.',
 u'She had left the church last of all, and, desiring to arrive first at the hall, had issued orders to the coachman to drive faster.',
 u'When that personage had developed a will of his own in the matter, Marija had flung up the window of the carriage, and, 

### Assigning sentiment scores to sentences

In [12]:
nltk.download("vader_lexicon")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/harish/nltk_data...


OSError: [Errno 28] No space left on device: '/home/harish/nltk_data/sentiment'

In [7]:
sid = SentimentIntensityAnalyzer() # calls SentimentIntensitAnalyzer function
for sentence in sentences[100:150]: # for a sentence in a list of sentences
        display (sentence) #display sentence
        ss = sid.polarity_scores(sentence) #and assign a sentiment polarity score
        for k in sorted(ss):        
            display ('{0}: {1}, '.format(k, ss[k])) #format the output

LookupError: 
**********************************************************************
  Resource
  u'sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt'
  not found.  Please use the NLTK Downloader to obtain the
  resource:  >>> nltk.download()
  Searched in:
    - '/home/harish/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - u''
**********************************************************************

### Going a step deeper with sentiment analysis 

In [13]:
# For a sentence that contains, for example, a particular named entity (e.g. location, person, organization) the following code establishes the polarity of adjectives ('JJ) or adverbs ('RB') in the sentence. 
lemmatizer = WordNetLemmatizer()
adj = [] # opens up an empty list to collect all the adjectives 
for i in sentences: # for each sentence in a list of sentences
    if 'Chicago' in i: #if the sentence contains a particular named entity
        for word, tag in nltk.pos_tag(nltk.word_tokenize(i)): #tokenize the sentence and assign part of speech for each word
            if tag=="JJ": #check if the word is an adjective or adverb "RB" or superlative "JJS", or comparative "JJR"
                try:
                    word_lemma = lemmatizer.lemmatize(word) # lemmatize the word
                    print (word_lemma)
                    adj.append(str(word)) #add it to the adj list
                    synset=list(swn.senti_synsets(word_lemma, "a"))[0] #get the most likely synset for that adjective, superlative, comparative or adverb and its sentiment scoring
                    print (synset)
                except:
                    pass

much
<much.a.01: PosScore=0.0 NegScore=0.0>
far-off
<faraway.s.01: PosScore=0.0 NegScore=0.0>
rear
<rear.s.01: PosScore=0.0 NegScore=0.0>
hungry
<hungry.a.01: PosScore=0.125 NegScore=0.125>
hard
<difficult.a.01: PosScore=0.0 NegScore=0.75>
green
<green.s.01: PosScore=0.0 NegScore=0.125>
snow-clad
<snow-clad.s.01: PosScore=0.0 NegScore=0.25>
flesh
important
<important.a.01: PosScore=0.875 NegScore=0.0>
second
<second.s.01: PosScore=0.0 NegScore=0.0>
rich
<rich.a.01: PosScore=0.0 NegScore=0.0>
big
<large.a.01: PosScore=0.25 NegScore=0.125>
black
<black.a.01: PosScore=0.0 NegScore=0.375>
unable
<unable.a.01: PosScore=0.0 NegScore=0.75>
perplexed
<perplexed.a.01: PosScore=0.0 NegScore=0.75>
good
<good.a.01: PosScore=0.75 NegScore=0.0>
informal
<informal.a.01: PosScore=0.0 NegScore=0.25>
cursory
<casual.s.05: PosScore=0.125 NegScore=0.75>
southern
<southern.a.01: PosScore=0.0 NegScore=0.0>
great
<great.s.01: PosScore=0.0 NegScore=0.0>
open
<open.a.01: PosScore=0.0 NegScore=0.375>
wide
<wide

In [None]:
adj

#### Extracting most common adjectives or adverbs that are found in the vicinity of a particular named entity

In [None]:
adj_di = collections.Counter(adj) #counting all the extracted adjectives and putting them in a dictionary structure
adj_di.most_common()

#### WordNet

In [None]:
wn.synsets('chicago')

In [None]:
print(wn.synset('chicago.n.01').definition())

In [None]:
chicago = wn.synset('chicago.n.01')

In [None]:
chicago.hypernyms()