In [1]:
import pandas as pd

### Sentiment Prediction Strategy 1: Word based scoring 
AFINN is a list of English words rated for valence with an integer
between minus five (negative) and plus five (positive). The words have
been manually labeled by Finn Årup Nielsen in 2009-2011. The file
is tab-separated. There are two versions:

AFINN-111: Newest version with 2477 words and phrases.

AFINN-96: 1468 unique words and phrases on 1480 lines. Note that there
are 1480 lines, as some words are listed twice. The word list in not
entirely in alphabetic ordering.  

An evaluation of the word list is available in:

Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903

Valence, as used in psychology, especially in discussing emotions, means the intrinsic attractiveness/"good"-ness (positive valence) or averseness/"bad"-ness (negative valence) of an event, object, or situation. ... For example, emotions popularly referred to as "negative", such as anger and fear, have negative valence.
    - source Wikipedia

In [1]:
pwd

'C:\\Users\\Administrator\\Desktop\\Python\\NLP\\Day 3'

In [2]:
affin = pd.read_csv( 'AFINN-111.txt' , sep = '\t'  , header = None ) # AFINN-111

In [3]:
affin.head()

Unnamed: 0,0,1
0,abandon,-2
1,abandoned,-2
2,abandons,-2
3,abducted,-2
4,abduction,-2


In [4]:
affin.columns = ['words' , 'score']

In [5]:
affin.head()

Unnamed: 0,words,score
0,abandon,-2
1,abandoned,-2
2,abandons,-2
3,abducted,-2
4,abduction,-2


In [6]:
affin['wordlen'] = affin['words'].apply( lambda x : len(x.split()) )

In [9]:
affin.query( 'score == 0 ')

Unnamed: 0,words,score,wordlen
2073,some kind,0,2


In [3]:
# pip install textblob 

In [5]:
from textblob import TextBlob

In [15]:
term_score = dict( affin[['words','score']].values )
# term_score

In [18]:
term_score['abandon']

-2

In [22]:
term_score.get( 'abandom' , 0)

0

In [20]:
print ( term_score.get( 'good' , 0) )
print ( term_score.get( 'not' , 0) )

3
0


In [30]:
affin.query( 'wordlen > 1')

Unnamed: 0,words,score,wordlen
339,can't stand,-3,2
353,cashing in,-2,2
500,cool stuff,3,2
761,does not work,-3,3
763,dont like,-2,2
978,fed up,-3,2
1108,green wash,-3,2
1109,green washing,-3,2
1516,messing up,-2,2
1605,no fun,-3,2


In [44]:
twin_words = affin.query( 'wordlen > 1')['words'].tolist()
# twin_words = list( enumerate( twin_words ))
print( twin_words )

["can't stand", 'cashing in', 'cool stuff', 'does not work', 'dont like', 'fed up', 'green wash', 'green washing', 'messing up', 'no fun', 'not good', 'not working', 'right direction', 'screwed up', 'some kind']


In [42]:
twin_words_u =  {  tuple(words.split()) : ind  for ind , words  in twin_words }
print( twin_words_u )


{("can't", 'stand'): 0, ('cashing', 'in'): 1, ('cool', 'stuff'): 2, ('does', 'not', 'work'): 3, ('dont', 'like'): 4, ('fed', 'up'): 5, ('green', 'wash'): 6, ('green', 'washing'): 7, ('messing', 'up'): 8, ('no', 'fun'): 9, ('not', 'good'): 10, ('not', 'working'): 11, ('right', 'direction'): 12, ('screwed', 'up'): 13, ('some', 'kind'): 14}


In [43]:
twin_words_u.keys()

dict_keys([("can't", 'stand'), ('cashing', 'in'), ('cool', 'stuff'), ('does', 'not', 'work'), ('dont', 'like'), ('fed', 'up'), ('green', 'wash'), ('green', 'washing'), ('messing', 'up'), ('no', 'fun'), ('not', 'good'), ('not', 'working'), ('right', 'direction'), ('screwed', 'up'), ('some', 'kind')])

In [45]:
print( twin_words )

["can't stand", 'cashing in', 'cool stuff', 'does not work', 'dont like', 'fed up', 'green wash', 'green washing', 'messing up', 'no fun', 'not good', 'not working', 'right direction', 'screwed up', 'some kind']


In [49]:
txt.replace( 'not good' , '_''not good'.split() )

'nlp is n'

In [66]:
t = 'does not good'
for i in ['does not work', 'dont like']:
    if i in t :
        print( True )
    else :
        print( False )

False
False


In [53]:
txt = 'nlp is not good'

sum(  [ term_score.get(i,0) for i in txt.split() ]  ) # score for each word getting added up

3

In [69]:
txt = 'nlp is does not work :-)) '

for i in twin_words:
    if i in txt :
        txt = txt.replace( i , '_'.join(i.split()) )


print(txt)

print( sum( [term_score.get(i.replace('_',' ') , 0 ) for i in txt.split()] ) )




nlp is does_not_work :-)) 
-3


In [35]:
import nltk

In [61]:
from nltk.tokenize import word_tokenize 

In [62]:
def get_sentiment( txt ):
    d = {}
    score = 0 
    tokens = word_tokenize( txt.lower() )
    for terms in tokens:
        score += term_score.get( terms , 0 )
        d[terms] = term_score.get( terms , 0 )
    return score , d

In [100]:
print ( term_score.get( 'not good' , 0) )

-2


In [101]:
txt = 'The hotel is not good and the food is also pathetic'
s , d = get_sentiment(txt)
print(d)
print(s)

{'the': 0, 'hotel': 0, 'is': 0, 'not': 0, 'good': 3, 'and': 0, 'food': 0, 'also': 0, 'pathetic': -2}
1


# Sentiment Prediction Strategy 2 : vader 

In [56]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [57]:
# VADER ( Valence Aware Dictionary and sEntiment Reasoner )

In [58]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [59]:
analyser = SentimentIntensityAnalyzer()

In [60]:
# Polarity_scores methof on the analyser instance gives the :
    
#     1 ) probability of the positive , negative , neutral sentiments
#     2 ) For each of the probabilities it adds a weitage to arrive at a compound score
    
#     For instance the weitage for the -v2 probability can be some negative number 

In [7]:
analysis = TextBlob('the food is great')


In [8]:
analysis.sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [10]:
analysis.sentiment_assessments?

In [69]:
print( analyser.polarity_scores( 'the food is great' ) )
print( analyser.polarity_scores( 'the food is great!!' ) )
print( analyser.polarity_scores( 'the food is great :-)' ) )

{'neg': 0.0, 'neu': 0.423, 'pos': 0.577, 'compound': 0.6249}
{'neg': 0.0, 'neu': 0.39, 'pos': 0.61, 'compound': 0.6892}
{'neg': 0.0, 'neu': 0.319, 'pos': 0.681, 'compound': 0.7506}


In [70]:
# It is very sensitive to the smileys and punctuations 

In [89]:
print( analyser.polarity_scores( 'the food is not great' ) )
print( analyser.polarity_scores( 'the food is not great !!!!!!' ) )
print( analyser.polarity_scores( 'the food is not great :-(' ) )
print()
print( analyser.polarity_scores( 'the food is terrible' ) )
print( analyser.polarity_scores( 'the food is terrible !!!!!!' ) )
print( analyser.polarity_scores( 'the food is terrible :-(' ) )
print()
print( analyser.polarity_scores( 'the food is TERRIBLE' ) )

{'neg': 0.452, 'neu': 0.548, 'pos': 0.0, 'compound': -0.5096}
{'neg': 0.472, 'neu': 0.528, 'pos': 0.0, 'compound': -0.6664}
{'neg': 0.35, 'neu': 0.425, 'pos': 0.224, 'compound': -0.2924}

{'neg': 0.508, 'neu': 0.492, 'pos': 0.0, 'compound': -0.4767}
{'neg': 0.516, 'neu': 0.484, 'pos': 0.0, 'compound': -0.6449}
{'neg': 0.651, 'neu': 0.349, 'pos': 0.0, 'compound': -0.6808}

{'neg': 0.561, 'neu': 0.439, 'pos': 0.0, 'compound': -0.5904}


In [90]:
print( analyser.polarity_scores( 'just heard the news' ) )
print( analyser.polarity_scores( 'just heard the news :-)' ) )
print( analyser.polarity_scores( 'just heard the news :-(' ) )
print()
print( analyser.polarity_scores( 'just heard the news smh' ) )
print( analyser.polarity_scores( 'just heard the news <3' ) )

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 0.635, 'pos': 0.365, 'compound': 0.3182}
{'neg': 0.385, 'neu': 0.615, 'pos': 0.0, 'compound': -0.3612}

{'neg': 0.365, 'neu': 0.635, 'pos': 0.0, 'compound': -0.3182}
{'neg': 0.0, 'neu': 0.58, 'pos': 0.42, 'compound': 0.4404}


In [87]:
( 0.365 - 0.3182 ) / 0.635

0.07370078740157482

In [88]:
( 0.3612 - 0.385 ) / 0.615

-0.0386991869918699

In [96]:
print( analyser.polarity_scores( 'just heard the news <3' )['compound'] )

0.4404


In [97]:
def get_vander_sentiment(sent) : 
    return analyser.polarity_scores(sent)['compound']

In [98]:
get_vander_sentiment( 'this food is pathetic but the hotel is amazing')

0.5927