In [2]:
# This notebook shows an example of sentiment analysis using the VADER sentiment analyzer package.
# The package can be downloaded from NLTK or from the vaderSentiment package.



In [3]:
# This works with Python 2.0 and later versions
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jade/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [4]:
# This works with python 3.0
# You can also download from vaderSentiment package
#! pip install vaderSentiment
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [5]:
# Demo inputs for the analyzer, one string per line. The entries vary in
# punctuation, capitalisation, intensifier words, negation, emoticons and
# slang; one entry is the empty string.
sentences = [
    "VADER is smart, handsome, and funny.",
    "VADER is smart, handsome, and funny!",
    "VADER is very smart, handsome, and funny.",
    "VADER is VERY SMART, handsome, and FUNNY.",
    "VADER is VERY SMART, handsome, and FUNNY!!!",
    "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",
    "The book was good.",
    "The book was kind of good.",
    "The plot was good, but the characters are uncompelling and the dialog is not great.",
    "A really bad, horrible book.",
    "At least it isn't a horrible book.",
    ":) and :D",
    "",
    "Today sux",
    "Today sux!",
    "Today SUX!",
    "Today kinda sux! But I'll get by, lol",
]

In [6]:
len(sentences)

17

In [7]:
# Score every demo sentence and dump its polarity scores: the sentence first,
# then each score name and its value on separate lines.
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    print(sentence)
    ss = sid.polarity_scores(sentence)  # dictionary: score name -> value
    for label in ss:
        print(label)
        print(ss[label])
    

VADER is smart, handsome, and funny.
neg
0.0
neu
0.254
pos
0.746
compound
0.8316
VADER is smart, handsome, and funny!
neg
0.0
neu
0.248
pos
0.752
compound
0.8439
VADER is very smart, handsome, and funny.
neg
0.0
neu
0.299
pos
0.701
compound
0.8545
VADER is VERY SMART, handsome, and FUNNY.
neg
0.0
neu
0.246
pos
0.754
compound
0.9227
VADER is VERY SMART, handsome, and FUNNY!!!
neg
0.0
neu
0.233
pos
0.767
compound
0.9342
VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
neg
0.0
neu
0.294
pos
0.706
compound
0.9469
The book was good.
neg
0.0
neu
0.508
pos
0.492
compound
0.4404
The book was kind of good.
neg
0.0
neu
0.657
pos
0.343
compound
0.3832
The plot was good, but the characters are uncompelling and the dialog is not great.
neg
0.327
neu
0.579
pos
0.094
compound
-0.7042
A really bad, horrible book.
neg
0.791
neu
0.209
pos
0.0
compound
-0.8211
At least it isn't a horrible book.
neg
0.0
neu
0.637
pos
0.363
compound
0.431
:) and :D
neg
0.0
neu
0.124
pos
0.876
compound
0.7925



In [8]:
# The code below shows how to extract a particular value from the scores dictionary

In [9]:
sentences[0]

'VADER is smart, handsome, and funny.'

In [10]:
output=sid.polarity_scores(sentences[0])

In [11]:
print(output)

{'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}


In [12]:
print(output['compound'])

0.8316


In [13]:
# Another format of printing: after each sentence, list its scores as
# "name: value, " lines ordered alphabetically by score name, then emit an
# empty line to separate it from the next sentence.

sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    # Keys are unique, so sorting the (name, value) pairs orders by name only.
    for name, score in sorted(ss.items()):
        print('{0}: {1}, '.format(name, score))
    print()  # extra blank line between sentences
    

VADER is smart, handsome, and funny.
compound: 0.8316, 
neg: 0.0, 
neu: 0.254, 
pos: 0.746, 

VADER is smart, handsome, and funny!
compound: 0.8439, 
neg: 0.0, 
neu: 0.248, 
pos: 0.752, 

VADER is very smart, handsome, and funny.
compound: 0.8545, 
neg: 0.0, 
neu: 0.299, 
pos: 0.701, 

VADER is VERY SMART, handsome, and FUNNY.
compound: 0.9227, 
neg: 0.0, 
neu: 0.246, 
pos: 0.754, 

VADER is VERY SMART, handsome, and FUNNY!!!
compound: 0.9342, 
neg: 0.0, 
neu: 0.233, 
pos: 0.767, 

VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
compound: 0.9469, 
neg: 0.0, 
neu: 0.294, 
pos: 0.706, 

The book was good.
compound: 0.4404, 
neg: 0.0, 
neu: 0.508, 
pos: 0.492, 

The book was kind of good.
compound: 0.3832, 
neg: 0.0, 
neu: 0.657, 
pos: 0.343, 

The plot was good, but the characters are uncompelling and the dialog is not great.
compound: -0.7042, 
neg: 0.327, 
neu: 0.579, 
pos: 0.094, 

A really bad, horrible book.
compound: -0.8211, 
neg: 0.791, 
neu: 0.209, 
pos: 0.0, 

At 

In [14]:
sorted(ss)

['compound', 'neg', 'neu', 'pos']

The pos, neu, and neg scores are the proportions of the text that fall into each category, so they should add up to 1 (or very close to it, because of floating-point rounding). 
These are the most useful metrics if you want multidimensional measures of sentiment for a given sentence.

The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the 
rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). 
This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence.
Calling it a 'normalized, weighted composite score' is accurate.

In [15]:
# We can set standardized thresholds for classifying sentences as either positive, neutral, or negative. 
# Typical threshold values (used in the literature cited on nltk page) are:

# positive sentiment: compound score >= 0.05
# neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
# negative sentiment: compound score <= -0.05

In [16]:
# Prepare storage for the results table: one empty list per future column.
# The generator yields a fresh list for every name, so the six lists are
# independent objects.
#   tweet        - sentence text
#   vs_compound  - compound score
#   vs_pos       - positive score
#   vs_neu       - neutral score
#   vs_neg       - negative score
#   vs_sentiment - sentiment slot for each sentence
tweet, vs_compound, vs_pos, vs_neu, vs_neg, vs_sentiment = ([] for _ in range(6))

In [17]:
# Fill the result columns, one entry per sentence.
# Each sentence is scored once and that single result feeds all four score
# columns. The lists are rebuilt from scratch (not appended to), so running
# this cell again cannot add the same rows a second time.
scores = [sid.polarity_scores(sentence) for sentence in sentences]

tweet = list(sentences)
vs_compound = [score['compound'] for score in scores]
vs_pos = [score['pos'] for score in scores]
vs_neu = [score['neu'] for score in scores]
vs_neg = [score['neg'] for score in scores]
# NOTE(review): this column receives the sentence text, not a sentiment label.
# That looks like a placeholder; confirm the intent before relying on it.
vs_sentiment = list(sentences)

In [18]:
vs_compound

[0.8316,
 0.8439,
 0.8545,
 0.9227,
 0.9342,
 0.9469,
 0.4404,
 0.3832,
 -0.7042,
 -0.8211,
 0.431,
 0.7925,
 0.0,
 -0.3612,
 -0.4199,
 -0.5461,
 0.2228]

In [19]:
len(tweet)

17

In [20]:
len(vs_compound)

17

In [21]:
len(vs_pos)

17

In [22]:
from pandas import Series, DataFrame

# Assemble the per-sentence lists into a single table; every keyword below
# becomes a column header of the resulting frame.
twitter_df = DataFrame(dict(
    Tweet=tweet,
    Compound=vs_compound,
    Positive=vs_pos,
    Neutral=vs_neu,
    Negative=vs_neg,
))

In [23]:
twitter_df

Unnamed: 0,Tweet,Compound,Positive,Neutral,Negative
0,"VADER is smart, handsome, and funny.",0.8316,0.746,0.254,0.0
1,"VADER is smart, handsome, and funny!",0.8439,0.752,0.248,0.0
2,"VADER is very smart, handsome, and funny.",0.8545,0.701,0.299,0.0
3,"VADER is VERY SMART, handsome, and FUNNY.",0.9227,0.754,0.246,0.0
4,"VADER is VERY SMART, handsome, and FUNNY!!!",0.9342,0.767,0.233,0.0
5,"VADER is VERY SMART, really handsome, and INCR...",0.9469,0.706,0.294,0.0
6,The book was good.,0.4404,0.492,0.508,0.0
7,The book was kind of good.,0.3832,0.343,0.657,0.0
8,"The plot was good, but the characters are unco...",-0.7042,0.094,0.579,0.327
9,"A really bad, horrible book.",-0.8211,0.0,0.209,0.791


In [24]:
# Label every compound score with the standard VADER cut-offs:
#   compound >= 0.05           -> 'positive'
#   -0.05 < compound < 0.05    -> 'neutral'
#   compound <= -0.05          -> 'negative'
# Both outer bounds are inclusive, so a score of exactly 0.05 is positive
# and a score of exactly -0.05 is negative.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05

# One label per row of the 'Compound' column, in row order.
sentiment = []
for compound in twitter_df['Compound']:
    if compound >= POSITIVE_THRESHOLD:
        sentiment.append('positive')
    elif compound > NEGATIVE_THRESHOLD:
        sentiment.append('neutral')
    else:
        sentiment.append('negative')

In [25]:
sentiment

['positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'positive']

In [26]:
# Adding the sentiment column to the DataFrame
twitter_df['sentiment']=sentiment

In [37]:
twitter_df

Unnamed: 0,Tweet,Compound,Positive,Neutral,Negative,sentiment
0,"VADER is smart, handsome, and funny.",0.8316,0.746,0.254,0.0,positive
1,"VADER is smart, handsome, and funny!",0.8439,0.752,0.248,0.0,positive
2,"VADER is very smart, handsome, and funny.",0.8545,0.701,0.299,0.0,positive
3,"VADER is VERY SMART, handsome, and FUNNY.",0.9227,0.754,0.246,0.0,positive
4,"VADER is VERY SMART, handsome, and FUNNY!!!",0.9342,0.767,0.233,0.0,positive
5,"VADER is VERY SMART, really handsome, and INCR...",0.9469,0.706,0.294,0.0,positive
6,The book was good.,0.4404,0.492,0.508,0.0,positive
7,The book was kind of good.,0.3832,0.343,0.657,0.0,positive
8,"The plot was good, but the characters are unco...",-0.7042,0.094,0.579,0.327,negative
9,"A really bad, horrible book.",-0.8211,0.0,0.209,0.791,negative


In [38]:
from __future__ import print_function
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import metrics
import pandas as pd
import numpy as np

In [39]:
# Learn a bag-of-words vocabulary from the tweet texts and print how many
# distinct tokens it contains. CountVectorizer produces plain token counts;
# TfidfVectorizer is imported here as well but is not used in this cell.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
bow_transformer = CountVectorizer()
bow_transformer.fit(twitter_df['Tweet'])
print(len(bow_transformer.vocabulary_))

36


In [40]:
# Transform the whole tweet corpus into a sparse bag-of-words count matrix
twitter_bow=bow_transformer.transform(twitter_df['Tweet'])

In [84]:
# Separating out columns
twitter_df_score=twitter_df[['Positive','Neutral','Negative']]


In [85]:
# Separating out columns
twitter_df_label=twitter_df['sentiment']

In [86]:
# Checking work
twitter_df_label

0     positive
1     positive
2     positive
3     positive
4     positive
5     positive
6     positive
7     positive
8     negative
9     negative
10    positive
11    positive
12     neutral
13    negative
14    negative
15    negative
16    positive
Name: sentiment, dtype: object

In [87]:
# Checking work
twitter_df_score

Unnamed: 0,Positive,Neutral,Negative
0,0.746,0.254,0.0
1,0.752,0.248,0.0
2,0.701,0.299,0.0
3,0.754,0.246,0.0
4,0.767,0.233,0.0
5,0.706,0.294,0.0
6,0.492,0.508,0.0
7,0.343,0.657,0.0
8,0.094,0.579,0.327
9,0.0,0.209,0.791


In [88]:
# logistic regression
from sklearn.linear_model import LogisticRegression 

from sklearn.model_selection import train_test_split



In [89]:
# Split data into train and test
X_train, X_test, Y_train, Y_Test=train_test_split(twitter_df_score, twitter_df_label, test_size=0.3, random_state=0)

In [90]:
# Construct the classifier
logreg=LogisticRegression()

In [91]:
# Fit the classifier
logreg.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [94]:
y_pred=logreg.predict(X_test)

In [95]:
print('Accuracy of logistic regression classifieron test sets   ')
print(logreg.score(X_test, Y_Test))

Accuracy of logistic regression classifieron test sets   
0.5


In [99]:
from sklearn.metrics import classification_report
print(classification_report(Y_Test, y_pred))

             precision    recall  f1-score   support

   negative       0.00      0.00      0.00         3
   positive       0.50      1.00      0.67         3

avg / total       0.25      0.50      0.33         6



  'precision', 'predicted', average, warn_for)


In [103]:
print(Y_Test)

1     positive
6     positive
8     negative
9     negative
13    negative
4     positive
Name: sentiment, dtype: object


In [104]:
print(Y_train)

2     positive
14    negative
10    positive
7     positive
16    positive
11    positive
3     positive
0     positive
5     positive
15    negative
12     neutral
Name: sentiment, dtype: object


In [100]:
# Use Naïve Bayes and logistic classifier to classify the sentences based on sentiment scores
from sklearn.naive_bayes import MultinomialNB

In [102]:
nbclf = MultinomialNB()
nbclf.fit(X_train, Y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [106]:
# Use SVM classifier. Vary the parameter C in this model and plot the accuracies for different C values.
from sklearn.svm import SVC, LinearSVC

In [108]:
# fit the model
clf=SVC(kernel='linear', C=1000)
clf.fit(X_train, Y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [109]:
y_pred1=clf.predict(X_test)

In [110]:
print(y_pred1)

['positive' 'positive' 'positive' 'negative' 'negative' 'positive']


In [None]:
# Compare the accuracies from logistic, NaiveBayes and SVM by plotting a bar graph.