In [867]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import pandas as pd
import numpy as np
# Load the sentiment CSV into a DataFrame; the path is relative to the
# notebook's working directory.
tweets_raw = pd.read_csv("data/Sentiment.csv")

In [868]:
# Keep only the label and the tweet text, in that order: the feature
# extractors read each row positionally (index 0 = label, index 1 = text).
sentiments = tweets_raw[['sentiment', 'text']]
# One frame per sentiment class.
sentiments_pos = sentiments.loc[sentiments['sentiment'] == 'Positive']
sentiments_neg = sentiments.loc[sentiments['sentiment'] == 'Negative']

import re
# nltk has to be imported before nltk.download() is called; without this the
# cell raises NameError on a fresh kernel (Restart & Run All).
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, held in a set for fast membership tests in the
# feature extractors.
stopset = set(stopwords.words('english'))
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def word_feats(tweet_full, stop_words=None):
    """Turn one labelled tweet into a ``(features, label)`` pair.

    The text is tokenised with a regex; tokens shorter than three characters
    and stop words are dropped, and the survivors are lower-cased.  Every
    surviving token becomes a ``token -> True`` feature, as do the (at most)
    three bigrams of surviving tokens ranked best by chi-square association.

    Args:
        tweet_full: Indexable pair where index 0 is the label and index 1 is
            the raw tweet text.
        stop_words: Optional collection of lower-case words to discard.  When
            omitted, the module-level ``stopset`` is used.

    Returns:
        tuple: ``(features, label)`` where ``features`` is a dict mapping each
        kept token (str) and each selected bigram (tuple of two str) to True.
    """
    if stop_words is None:
        stop_words = stopset
    label = tweet_full[0]
    tokens = re.findall(r"[\w']+|[.,!?;]", tweet_full[1])

    # The length test is applied to the raw token and the stop-word test to
    # its lower-cased form; each token is lower-cased only once.
    words_filtered = []
    for token in tokens:
        if len(token) < 3:
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            words_filtered.append(lowered)

    bigram_finder = BigramCollocationFinder.from_words(words_filtered)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 3)
    features = {ngram: True for ngram in itertools.chain(words_filtered, bigrams)}
    return (features, label)

def word_feats_without_bigrams(tweet_full, stop_words=None):
    """Turn one labelled tweet into a unigram-only ``(features, label)`` pair.

    The text is tokenised with a regex; tokens shorter than three characters
    and stop words are dropped, and the survivors are lower-cased.  Every
    surviving token becomes a ``token -> True`` feature.

    Args:
        tweet_full: Indexable pair where index 0 is the label and index 1 is
            the raw tweet text.
        stop_words: Optional collection of lower-case words to discard.  When
            omitted, the module-level ``stopset`` is used.

    Returns:
        tuple: ``(features, label)`` where ``features`` is a dict mapping each
        kept token (str) to True.
    """
    if stop_words is None:
        stop_words = stopset
    label = tweet_full[0]
    tokens = re.findall(r"[\w']+|[.,!?;]", tweet_full[1])

    # The length test is applied to the raw token and the stop-word test to
    # its lower-cased form; each token is lower-cased only once.
    features = {}
    for token in tokens:
        if len(token) < 3:
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            features[lowered] = True
    return (features, label)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jgzuke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [869]:
# Build one (features, label) pair per tweet, kept in a separate list per class.
neg_features = [word_feats(tweet_full) for tweet_full in sentiments_neg.values]
pos_features = [word_feats(tweet_full) for tweet_full in sentiments_pos.values]

# The first TEST_SIZE_PER_CLASS examples of each class are held out for
# evaluation.  Training uses the examples after that, capped at the size of
# the smaller class so both classes contribute the same number of examples
# (this replaces the hard-coded 2236, which was len(pos_features)).
TEST_SIZE_PER_CLASS = 400
balanced_class_size = min(len(neg_features), len(pos_features))
sentiments_test = neg_features[:TEST_SIZE_PER_CLASS] + pos_features[:TEST_SIZE_PER_CLASS]
sentiments_train = (
    neg_features[TEST_SIZE_PER_CLASS:balanced_class_size]
    + pos_features[TEST_SIZE_PER_CLASS:balanced_class_size]
)
print(len(neg_features))
print(len(pos_features))

8493
2236


In [870]:
import nltk
from nltk import NaiveBayesClassifier
# Fit a Naive Bayes classifier on the (features, label) training pairs.
clf = NaiveBayesClassifier.train(sentiments_train)
# Print the features that most strongly separate the labels.
clf.show_most_informative_features()

Most Informative Features
              gopdebates = True           Positi : Negati =    337.0 : 1.0
('gopdebate', 'gopdebates') = True           Positi : Negati =     87.8 : 1.0
            rwsurfergirl = True           Positi : Negati =     68.6 : 1.0
       ('bush', 'rubio') = True           Positi : Negati =     48.3 : 1.0
                    band = True           Positi : Negati =     48.3 : 1.0
          donniewahlberg = True           Positi : Negati =     37.7 : 1.0
                   nails = True           Positi : Negati =     33.0 : 1.0
                together = True           Positi : Negati =     29.8 : 1.0
       ('cruz', 'trump') = True           Positi : Negati =     29.4 : 1.0
                 ratings = True           Positi : Negati =     24.4 : 1.0


In [871]:
import collections
# For every label, collect the indices of the held-out examples that carry it
# (reference_sets) and of those the classifier assigns to it (test_sets).
reference_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)
for index, (featureset, gold_label) in enumerate(sentiments_test):
    reference_sets[gold_label].add(index)
    test_sets[clf.classify(featureset)].add(index)

In [872]:
from nltk.metrics.scores import precision
from nltk.metrics.scores import recall
# Fraction of held-out examples whose predicted label matches the true label.
accuracy = nltk.classify.util.accuracy(clf, sentiments_test)
# Per-label precision and recall, computed from the index sets of true
# (reference) and predicted (test) members.  The functions imported above are
# called directly instead of through the nltk.precision / nltk.recall
# re-exports, so the imports are no longer dead.
pos_precision = precision(reference_sets['Positive'], test_sets['Positive'])
pos_recall = recall(reference_sets['Positive'], test_sets['Positive'])
neg_precision = precision(reference_sets['Negative'], test_sets['Negative'])
neg_recall = recall(reference_sets['Negative'], test_sets['Negative'])

In [873]:
# Report the evaluation metrics, one per line; sep="" keeps each label and
# its value adjacent exactly as string concatenation would.
print("accuracy: ", accuracy, sep="")
print("pos_precision: ", pos_precision, sep="")
print("pos_recall: ", pos_recall, sep="")
print("neg_precision: ", neg_precision, sep="")
print("neg_recall: ", neg_recall, sep="")

accuracy: 0.72375
pos_precision: 0.8231046931407943
pos_recall: 0.57
neg_precision: 0.6711281070745698
neg_recall: 0.8775
