In [364]:
%matplotlib inline
import pandas as pd
import numpy as np
tweets = pd.read_csv("data/Sentiment.csv")

In [365]:
tweets_clean = tweets[(tweets.relevant_yn_confidence > 0.8) & (tweets.candidate_confidence > 0.8) & (tweets.sentiment_confidence > 0.8) & (tweets.subject_matter_confidence > 0.8)]
tweets_clean = tweets_clean.drop(['id', 'candidate_confidence', 'relevant_yn', 'relevant_yn_confidence', 'sentiment_confidence', 'subject_matter_confidence', 'candidate_gold', 'relevant_yn_gold', 'sentiment_gold', 'subject_matter_gold', 'tweet_coord', 'tweet_id', 'user_timezone'], axis=1)
tweets_clean.head()

Unnamed: 0,candidate,sentiment,subject_matter,name,retweet_count,text,tweet_created,tweet_location
7,No candidate mentioned,Neutral,None of the above,RaulAReyes,0,Going on #MSNBC Live with @ThomasARoberts arou...,2015-08-07 09:54:44 -0700,New York NY
10,Donald Trump,Negative,None of the above,jnjsmom,0,@JGreenDC @realDonaldTrump In all fairness #Bi...,2015-08-07 09:54:42 -0700,"Peoria, IL"
14,Scott Walker,Positive,None of the above,In_Related_News,215,RT @pattonoswalt: I loved Scott Walker as Mark...,2015-08-07 09:54:42 -0700,"San Diego, California"
20,Ted Cruz,Positive,None of the above,rickymcghee,6,"RT @ChuckNellis: Cruz has class &amp; truth, t...",2015-08-07 09:54:39 -0700,"fripp island,sc/ southeast ga"
26,Scott Walker,Negative,Abortion,mch7576,19,RT @TheBaxterBean: Scott Walker's Abortion Ban...,2015-08-07 09:54:38 -0700,USA


In [366]:
print (tweets_clean.columns)

Index(['candidate', 'sentiment', 'subject_matter', 'name', 'retweet_count',
       'text', 'tweet_created', 'tweet_location'],
      dtype='object')


In [367]:
candidates = tweets_clean['candidate'].unique()
candidates = [x for x in candidates if ((x != 'No candidate mentioned') & (len(tweets_clean[tweets_clean.candidate == x]) > 50))]

In [368]:
import seaborn as sb
candidate_info = pd.DataFrame(columns=['percent_positive'], index=candidates)
for candidate in candidates:
    sentiments = tweets_clean[(tweets_clean.candidate == candidate)]['sentiment']
    pos_count = len(sentiments[sentiments == 'Positive'])
    neg_count = len(sentiments[sentiments == 'Negative'])
    candidate_info.loc[candidate,'percent_positive'] = pos_count / (pos_count + neg_count)
candidate_info

Unnamed: 0,percent_positive
Donald Trump,0.211091
Scott Walker,0.123077
Ted Cruz,0.695035
Jeb Bush,0.028169
John Kasich,0.76087
Chris Christie,0.151515
Ben Carson,0.640449
Rand Paul,0.263158
Mike Huckabee,0.314286
Marco Rubio,0.565217


## Sentiment

In [369]:
sentiments = tweets.loc[:,['sentiment', 'text']]
sentiments = sentiments[sentiments.sentiment != 'Neutral']
sentiments_test = sentiments[:1500]
sentiments = sentiments[1500:]

tweets_full = sentiments.values
tweets = []
for tweet_full in tweets_full:
    words_filtered = [e.lower() for e in tweet_full[1].split() if len(e) >= 3]
    tweets.append((words_filtered, tweet_full[0]))

In [370]:
import nltk
from nltk import NaiveBayesClassifier
all_words = []
for (words, label) in tweets:
    all_words.extend(words)
word_frequency = nltk.FreqDist(all_words)
word_list = word_frequency.keys()

In [371]:
def features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_list:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

In [372]:
training_set = nltk.classify.apply_features(features, tweets)

In [373]:
clf = NaiveBayesClassifier.train(training_set)

In [377]:
print (clf.show_most_informative_features(5))

Most Informative Features
         contains(next.) = True           Positi : Negati =    154.7 : 1.0
contains(@donniewahlberg:) = True           Positi : Negati =    139.4 : 1.0
   contains(@lrihendry:) = True           Positi : Negati =     82.7 : 1.0
     contains(@libertyu) = True           Positi : Negati =     47.3 : 1.0
contains(@wilberforce91:) = True           Positi : Negati =     44.7 : 1.0
None


In [375]:
tweets_full_test = sentiments_test.values
tweets_test = []
for tweet_full in tweets_full_test:
    words_filtered = [e.lower() for e in tweet_full[1].split() if len(e) >= 3]
    tweets_test.append((words_filtered, tweet_full[0]))
    
test_set = nltk.classify.apply_features(features, tweets_test)

In [391]:
import collections
reference_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)
for i, (features, label) in enumerate(test_set):
    reference_sets[label].add(i)
    predicted = classifier.classify(features)
    test_sets[predicted].add(i)

In [382]:
import nltk.metrics
accuracy = nltk.classify.util.accuracy(classifier, test_set)

AttributeError: module 'nltk.translate.metrics' has no attribute 'precision'

In [395]:
from nltk.metrics.scores import precision
from nltk.metrics.scores import recall
pos_precision = nltk.precision(reference_sets['Positive'], test_sets['Positive'])
pos_recall = nltk.recall(reference_sets['Positive'], test_sets['Positive'])
neg_precision = nltk.precision(reference_sets['Negative'], test_sets['Negative'])
neg_recall = nltk.recall(reference_sets['Negative'], test_sets['Negative'])

In [399]:
print ("accuracy: " + str(accuracy))
print ("pos_precision: " + str(pos_precision))
print ("pos_recall: " + str(pos_recall))
print ("neg_precision: " + str(neg_precision))
print ("neg_recall: " + str(neg_recall))

accuracy: 0.7746666666666666
pos_precision: 0.391304347826087
pos_recall: 0.054878048780487805
neg_precision: 0.7867950481430537
neg_recall: 0.9761092150170648
