# Sentiment Analysis 
### Lexicon based SA analysis
I tried out some things, and in the end you will find the calculation of a sentiment score of a sentence. Did this using the sentiwordnet lexicon, but maybe other lexicons are nice to use too? The stuff here can easily be applied to our liar dataset. 

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

In [2]:
# Download the sentiwordnet data (only needed once...)
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/joycedenhertog/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [3]:
print(list(swn.senti_synsets('slow')))

[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [4]:
breakdown = swn.senti_synset('breakdown.n.03')
print(breakdown)

<breakdown.n.03: PosScore=0.0 NegScore=0.25>


In [5]:
breakdown.pos_score()

0.0

In [6]:
print(swn.senti_synsets('breakdown'))
for synset in swn.senti_synsets('breakdown'):
    print(synset)

<filter object at 0x1a195e8f60>
<dislocation.n.02: PosScore=0.0 NegScore=0.0>
<breakdown.n.02: PosScore=0.125 NegScore=0.5>
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
<breakdown.n.04: PosScore=0.0 NegScore=0.0>


In [7]:
def senti_score(word):
    synsets =  swn.senti_synsets(word)
    positive = 0
    negative = 0
    total_synsets = 0
    
    for synset in synsets:
        positive = positive + synset.pos_score()
        negative = negative + synset.neg_score()
        total_synsets += 1 
        
    positive = positive / total_synsets
    negative = negative / total_synsets
    return (positive, negative)

In [8]:
for s in swn.senti_synsets(","):
    print(s)

In [9]:
senti_score('hello')

(0.0, 0.0)

In [10]:
sentence = ["this", "is", "a", "sad", "sentence", "sad", "sad"]

In [11]:
def senti_score_sentence(sentence):
    positive = 0
    negative = 0
    length = 0
    
    for word in sentence:
        if not wn.synsets(word): # if it doesn't have synsets you can't calculate the sentiment score..
            continue
        positive = positive + senti_score(word)[0]
        negative = negative + senti_score(word)[1]
        length += 1
    
    positive = positive / length
    negative = negative / length

    return (positive, negative)

In [12]:
senti_score_sentence(sentence)

(0.028617216117216113, 0.34249084249084244)

# SA of the Liar Dataset
Here, we're applying the sentiment analysis that we tested above to the Liar dataset, by calculating a positive and a negative score for each of the statements. We're saving those results in a new csv file. 

In [13]:
import pandas as pd

# create df of all the data
df_liar = pd.read_csv("train.tsv", encoding="utf8", sep="\t", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context"])

df_liar.head(3)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver


In [14]:
def pos_senti_score_sentence(sentence):
    positive = 0
    length = 0
    
    for word in sentence:
        if not wn.synsets(word): # if it doesn't have synsets you can't calculate the sentiment score..
            continue
        positive = positive + senti_score(word)[0]
        length += 1
    
    positive = positive / length

    return positive

In [17]:
def neg_senti_score_sentence(sentence):
    negative = 0
    length = 0
    
    for word in sentence:
        if not wn.synsets(word): # if it doesn't have synsets you can't calculate the sentiment score..
            continue
        negative = negative + senti_score(word)[1]
        length += 1
    
    negative = negative / length

    return negative

In [18]:
df_liar["pos-sentiment"] = df_liar["text"].apply(pos_senti_score_sentence) 

In [19]:
df_liar["neg-sentiment"] = df_liar["text"].apply(neg_senti_score_sentence) 

In [20]:
# let's see whether it worked
df_liar.head(3)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context,pos-sentiment,neg-sentiment
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0.007972,0.012908
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,0.011481,0.014223
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,0.009258,0.011969


In [21]:
# saving the results in a new csv file
df_liar.to_csv("sentiment_train.csv",index=False)

In [24]:
# now for the training data 
df_liar_test = pd.read_csv("test.tsv", encoding="utf8", sep="\t", names=["id", "truth-value", 
                                                                     "text", "topic", "name", "job", 
                                                                     "state", "politics", "count1", "count2", 
                                                                     "count3", "count4", "count5", "context"])
df_liar_test.head(3)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context
0,11972.json,True,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview
1,11685.json,False,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference
2,11096.json,False,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.


In [25]:
df_liar_test["pos-sentiment"] = df_liar["text"].apply(pos_senti_score_sentence) 

In [26]:
df_liar_test["neg-sentiment"] = df_liar["text"].apply(neg_senti_score_sentence) 

In [27]:
# let's see whether it worked
df_liar_test.head(3)

Unnamed: 0,id,truth-value,text,topic,name,job,state,politics,count1,count2,count3,count4,count5,context,pos-sentiment,neg-sentiment
0,11972.json,True,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview,0.007972,0.012908
1,11685.json,False,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference,0.011481,0.014223
2,11096.json,False,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.,0.009258,0.011969


In [29]:
# saving the results in a new csv file
df_liar_test.to_csv("sentiment_test.csv",index=False)