In [993]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk import NaiveBayesClassifier
# Load the raw tweet data; later cells read its 'sentiment' and 'text' columns.
tweets_raw = pd.read_csv("data/Sentiment.csv")

In [994]:
# Keep only the label and the tweet text, in that column order: the feature
# extractors below index each row positionally ([0] = label, [1] = text).
sentiments = tweets_raw.loc[:,['sentiment', 'text']]
# Only rows labelled 'Positive' or 'Negative' are used from here on.
sentiments_pos = sentiments[sentiments.sentiment == 'Positive']
sentiments_neg = sentiments[sentiments.sentiment == 'Negative']

import re
# The stopword corpus must be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# A set gives constant-time membership checks in the token filters.
stopset = set(stopwords.words('english'))
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Global accumulators of filtered words per label. They are appended to as a
# side effect of word_feats() and best_word_feats(), and later consumed by the
# word-scoring cell.
pos_words = []
neg_words = []

def word_feats(tweet_full):
    """Turn one (label, text) row into a (featureset, label) pair.

    The text is tokenised into words/apostrophes and single punctuation
    marks, lower-cased, and stripped of tokens shorter than three characters
    and of English stopwords. The surviving tokens plus the three
    highest-scoring chi-square bigrams built from them become the keys of
    the feature dict (every value is True).

    Side effect: the surviving tokens are appended to the global
    ``pos_words`` list when the label is 'Positive', otherwise to
    ``neg_words``.
    """
    label = tweet_full[0]
    text = tweet_full[1]

    kept = []
    for token in re.findall(r"[\w']+|[.,!?;]", text):
        lowered = token.lower()
        if len(token) >= 3 and lowered not in stopset:
            kept.append(lowered)

    # Record the tokens in the per-label accumulator.
    accumulator = pos_words if label == 'Positive' else neg_words
    accumulator.extend(kept)

    finder = BigramCollocationFinder.from_words(kept)
    top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 3)

    features = dict.fromkeys(itertools.chain(kept, top_bigrams), True)
    return (features, label)

def word_feats_without_bigrams(tweet_full):
    """Turn one (label, text) row into a unigram-only (featureset, label) pair.

    Tokens are lower-cased; tokens shorter than three characters and English
    stopwords are dropped. Each remaining token maps to True in the
    returned feature dict. Unlike word_feats(), no bigrams are added and the
    global word accumulators are not touched.
    """
    label = tweet_full[0]
    features = {}
    for token in re.findall(r"[\w']+|[.,!?;]", tweet_full[1]):
        lowered = token.lower()
        if len(token) >= 3 and lowered not in stopset:
            features[lowered] = True
    return (features, label)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jgzuke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [995]:
# word_feats() appends every filtered token to the global pos_words/neg_words
# lists. Empty them in place first so that re-running this cell does not
# duplicate the words and skew the frequency counts computed from them later.
del pos_words[:]
del neg_words[:]

# Build (featureset, label) pairs for every negative and positive tweet.
neg_features = [word_feats(tweet_full) for tweet_full in sentiments_neg.values]
pos_features = [word_feats(tweet_full) for tweet_full in sentiments_pos.values]

# Hold out the first 400 tweets of each class for testing; train on the rest.
sentiments_test = neg_features[:400] + pos_features[:400]
sentiments_train = neg_features[400:] + pos_features[400:]
print (len(neg_features))
print (len(pos_features))

8493
2236


In [996]:
import collections
from nltk.metrics.scores import precision
from nltk.metrics.scores import recall
def train_and_test(train, test):
    """Train a Naive Bayes classifier on ``train`` and report metrics on ``test``.

    Args:
        train: sequence of (featureset, label) pairs used to fit the classifier.
        test: sequence of (featureset, label) pairs used for evaluation.

    Prints the classifier's most informative features, the accuracy on
    ``test``, and precision/recall for the 'Positive' and 'Negative' labels.
    Returns None.
    """
    clf = NaiveBayesClassifier.train(train)
    # show_most_informative_features() prints its own table and returns None,
    # so wrapping it in print() only added a stray "None" line to the output.
    clf.show_most_informative_features()

    # Index sets per label: the true labels and the predicted labels.
    reference_sets = collections.defaultdict(set)
    test_sets = collections.defaultdict(set)
    for i, features in enumerate(test):
        label = features[1]
        reference_sets[label].add(i)
        predicted = clf.classify(features[0])
        test_sets[predicted].add(i)

    # Score against the `test` argument (not the global sentiments_test) so the
    # accuracy always refers to the same data as the precision/recall below.
    print("accuracy: " + str(nltk.classify.util.accuracy(clf, test)))
    print("pos_precision: " + str(nltk.precision(reference_sets['Positive'], test_sets['Positive'])))
    print("pos_recall: " + str(nltk.recall(reference_sets['Positive'], test_sets['Positive'])))
    print("neg_precision: " + str(nltk.precision(reference_sets['Negative'], test_sets['Negative'])))
    print("neg_recall: " + str(nltk.recall(reference_sets['Negative'], test_sets['Negative'])))

In [997]:
# Baseline run: features come from word_feats() (filtered unigrams + top bigrams).
train_and_test(sentiments_train, sentiments_test)

Most Informative Features
          donniewahlberg = True           Positi : Negati =     99.6 : 1.0
('brought', 'gopdebate') = True           Positi : Negati =     74.9 : 1.0
               lrihendry = True           Positi : Negati =     73.9 : 1.0
                libertyu = True           Positi : Negati =     54.3 : 1.0
  ('candidates', "i've") = True           Positi : Negati =     54.3 : 1.0
  ('favorite', 'things') = True           Positi : Negati =     54.3 : 1.0
           wilberforce91 = True           Positi : Negati =     51.4 : 1.0
            kimguilfoyle = True           Positi : Negati =     42.6 : 1.0
                 forward = True           Positi : Negati =     40.2 : 1.0
               favorites = True           Positi : Negati =     39.7 : 1.0
None
accuracy: 0.74875
pos_precision: 0.7696476964769647
pos_recall: 0.71
neg_precision: 0.7308584686774942
neg_recall: 0.7875


In [998]:
from nltk.probability import FreqDist, ConditionalFreqDist
# Overall and per-label word frequencies, built from the accumulators that
# the feature extractors filled.
# NOTE(review): pos_words/neg_words were collected from every tweet, including
# the ones later held out for testing, so the word selection below has seen
# test-set labels.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for label, words in (('Positive', pos_words), ('Negative', neg_words)):
    for word in words:
        word_fd[word] += 1
        label_word_fd[label][word] += 1

pos_word_count = label_word_fd['Positive'].N()
neg_word_count = label_word_fd['Negative'].N()
total_word_count = pos_word_count + neg_word_count

# Score each word by summing its chi-square association with each label.
word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(
        label_word_fd['Positive'][word], (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(
        label_word_fd['Negative'][word], (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# The 2500 words with the highest combined chi-square score, best first.
bestwords = sorted(word_scores, key=word_scores.get, reverse=True)[:2500]
# Same vocabulary as a frozenset: membership is tested for every token of every
# tweet, and scanning the 2500-element list each time is needlessly slow.
bestwords_set = frozenset(bestwords)

def best_word_feats(tweet_full):
    """Turn one (label, text) row into a (featureset, label) pair using ``bestwords``.

    Unigram features are the lower-cased tokens that are at least three
    characters long, are not English stopwords, and belong to ``bestwords``.
    Bigram features are the three highest-scoring chi-square bigrams built
    from *all* lower-cased tokens (stopwords and punctuation included).
    Every feature maps to True.

    Side effect: the selected unigrams are appended to the global
    ``pos_words`` list when the label is 'Positive', otherwise to
    ``neg_words``.
    """
    label = tweet_full[0]
    words = re.findall(r"[\w']+|[.,!?;]", tweet_full[1])
    words_all = [e.lower() for e in words]
    # Short-circuit `and` skips the later checks once a cheaper one fails; the
    # result is the same as the bitwise `&` chain on booleans.
    words_filtered = [
        e for e in words_all
        if len(e) >= 3 and e not in stopset and e in bestwords_set
    ]

    # NOTE(review): this keeps growing the global accumulators, exactly as the
    # original did; re-running the scoring cell afterwards will count these
    # words again. Confirm whether this side effect is still wanted here.
    if label == 'Positive':
        pos_words.extend(words_filtered)
    else:
        neg_words.extend(words_filtered)

    bigram_finder = BigramCollocationFinder.from_words(words_all)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 3)
    return (dict([(ngram, True) for ngram in itertools.chain(words_filtered, bigrams)]), label)

# Re-extract features with the chi-square-selected vocabulary (negatives first,
# then positives), replacing the baseline feature lists.
neg_features = list(map(best_word_feats, sentiments_neg.values))
pos_features = list(map(best_word_feats, sentiments_pos.values))

# Same split as the baseline: the first 400 of each class are held out.
sentiments_train = neg_features[400:] + pos_features[400:]
sentiments_test = neg_features[:400] + pos_features[:400]

train_and_test(sentiments_train, sentiments_test)

Most Informative Features
           ('and', 'am') = True           Positi : Negati =    174.8 : 1.0
          donniewahlberg = True           Positi : Negati =     99.6 : 1.0
       ('be', 'brought') = True           Positi : Negati =     74.9 : 1.0
               lrihendry = True           Positi : Negati =     73.9 : 1.0
         ('.', 'thanks') = True           Positi : Negati =     54.3 : 1.0
                libertyu = True           Positi : Negati =     54.3 : 1.0
           wilberforce91 = True           Positi : Negati =     51.4 : 1.0
            kimguilfoyle = True           Positi : Negati =     42.6 : 1.0
     ('as', 'president') = True           Positi : Negati =     41.3 : 1.0
                 forward = True           Positi : Negati =     40.2 : 1.0
None
accuracy: 0.7925
pos_precision: 0.8046875
pos_recall: 0.7725
neg_precision: 0.78125
neg_recall: 0.8125
