# Twitter Sentiment Analysis [Natural Language Processing (NLP)]

In [1]:
#Sentiment Analysis is also referred as Opinion Mining. It’s mostly used in social media and customer reviews data.
#In this article, we will use the NLTK’s twitter_samples corpus as our labeled training data. 
#The twitter_samples corpus contains 2K movie reviews with sentiment polarity classification.
#The twitter_samples corpus contains 3 files.
#1) negative_tweets.json: contains 5k negative tweets
#2) positive_tweets.json: contains 5k positive tweets
#3) tweets.20150430-223406.json: contains 20k posi tive and negative tweets"""

In [2]:
import nltk
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
print (twitter_samples.fileids())

pos_tweets = twitter_samples.strings('positive_tweets.json')
print (len(pos_tweets)) # Output: 5000
neg_tweets = twitter_samples.strings('negative_tweets.json')
print (len(neg_tweets)) # Output: 5000
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')
print (len(all_tweets)) # Output: 20000
for tweet in pos_tweets[:5]:
 print (tweet)

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
5000
5000
20000
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


# Tokenize Tweets

In [3]:
#NLTK has a TweetTokenizer module that does a good job in tokenizing (splitting text into a list of words) tweets.
#Three different parameters can be passed while calling the TweetTokenizer class. They are:
#preserve_case: if False then it converts tweet to lowercase and vice-versa.
#strip_handles: if True then it removes twitter handles from the tweet and vice-versa.
#reduce_len: if True then it reduces the length of words in the tweet like hurrayyyy, yipppiieeee,etc. and vice-versa.

In [4]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
for tweet in pos_tweets[:5]:
   print (tweet_tokenizer.tokenize(tweet))

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


# Cleaning Tweet

In [5]:
#define a function named clean_tweets which does following...
#– Remove stock market tickers like $GE
#– Remove retweet text “RT”
#– Remove hyperlinks
#– Remove hashtags ( only the hashtag # and not the word)
#– Remove stop words like a, and, the, is, are, etc.
#– Remove emoticons like :), :D, :(, :-), etc.
#– Remove punctuation like full-stop, comm a, exclamation sign, etc.
#– Convert words to Stem/Base words using Porter Stemming Algori thm. E.g. words like ‘working’,
#‘works’, and ‘worked’ will be converted to their base/stem word “work”."""

In [6]:
import nltk
nltk.download("stopwords")
import string
import re
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
from nltk.tokenize import TweetTokenizer


def clean_tweets(tweet):
# remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
# remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
# remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
# remove hashtags
# only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
# tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
                 #word not in emoticons and # remove emoticons
                 word not in string.punctuation): # remove punctuation
#tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
    return tweets_clean


custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
# print cleaned tweet
print (clean_tweets(custom_tweet))
print (pos_tweets[5])
print (clean_tweets(pos_tweets[5]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['hello', 'great', 'day', ':)', 'good', 'morn']
@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM
['one', 'irresist', ':)', 'flipkartfashionfriday']


# Feature Extraction

In [7]:
#We define a simple bag_of_words function that extracts unigram features from the tweets.

In [8]:
# feature extractor function
def bag_of_words(tweet):
    words = clean_tweets(tweet)
    words_dictionary = dict([word, True] for word in words)
    return words_dictionary
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
print (bag_of_words(custom_tweet))

# positive tweets feature set
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 'pos'))
    
# negative tweets feature set
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), 'neg'))
print (len(pos_tweets_set), len(neg_tweets_set)) # Output: (5000, 5000)

{'hello': True, 'great': True, 'day': True, ':)': True, 'good': True, 'morn': True}
5000 5000


# Create Train and Test Set

In [9]:
#There are 5000 positive tweets set and 5000 negative tweets set. We take 20% (i.e. 1000) of positive
#tweets and 20% (i.e. 1000) of negative tweets as the test set. The remaining negative and positive
#tweets will be taken as the training set.

In [10]:
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle
shuffle(pos_tweets_set)
shuffle(neg_tweets_set)
test_set = pos_tweets_set[:1000] + neg_tweets_set[:1000]
train_set = pos_tweets_set[1000:] + neg_tweets_set[1000:]
print(len(test_set), len(train_set)) # Output: (2000, 8000)

2000 8000


# Training Classifier and Calculating Accuracy

In [11]:
#We train Naive Bayes Classifier using the training set and calculate the classification accuracy of the
#trained classifier using the test set.

In [12]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) # Output: 0.765
print (classifier.show_most_informative_features(10))

0.9915
Most Informative Features
                      :) = True              pos : neg    =   1101.8 : 1.0
                     via = True              pos : neg    =     41.0 : 1.0
                   arriv = True              pos : neg    =     23.8 : 1.0
                  commun = True              pos : neg    =     19.0 : 1.0
                     x15 = True              neg : pos    =     18.3 : 1.0
                     sad = True              neg : pos    =     18.1 : 1.0
                    damn = True              neg : pos    =     17.0 : 1.0
               goodnight = True              pos : neg    =     15.7 : 1.0
                     ugh = True              neg : pos    =     14.3 : 1.0
                opportun = True              pos : neg    =     14.3 : 1.0
None


# Testing Classifier with Custom Tweet

In [13]:
#We provide custom tweet and check the classification output of the trained classifier.
#The classifier correctly predicts both negative and positive tweets provided.

In [14]:
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
print (classifier.classify(custom_tweet_set)) # Output: neg
# Negative tweet correctly classified as negative
# probability result
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: neg
print (prob_result.prob("neg")) # Output: 0.941844352481
print (prob_result.prob("pos")) # Output: 0.0581556475194

custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)
print (classifier.classify(custom_tweet_set)) # Output: pos
# Positive tweet correctly classified as positive
# probability result
prob_result = classifier.prob_classify(custom_tweet_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: pos
print (prob_result.prob("neg")) # Output: 0.00131055449755
print (prob_result.prob("pos")) # Output: 0.998689445502

neg
<ProbDist with 2 samples>
neg
0.9312784824258974
0.06872151757410054
pos
<ProbDist with 2 samples>
pos
0.001076669004200699
0.9989233309958002


# Precision, Recall & F1-Score

In [15]:
from collections import defaultdict
actual_set = defaultdict(set)
predicted_set = defaultdict(set)
actual_set_cm = []
predicted_set_cm = []
for index, (feature, actual_label) in enumerate(test_set):
    actual_set[actual_label].add(index)
    actual_set_cm.append(actual_label)
predicted_label = classifier.classify(feature)
predicted_set[predicted_label].add(index)
predicted_set_cm.append(predicted_label)
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix
print("pos precision:",precision(actual_set['pos'], predicted_set['pos'])) 
print("pos recall:",recall(actual_set['pos'], predicted_set['pos'])) 
print("pos F-measure:", f_measure(actual_set['pos'], predicted_set['pos']))
print("neg precision:", precision(actual_set['neg'], predicted_set['neg'])) 
print("neg recall:", recall(actual_set['neg'], predicted_set['neg']))
print("neg F-measure:", f_measure(actual_set['neg'], predicted_set['neg']))


pos precision: None
pos recall: 0.0
pos F-measure: None
neg precision: 1.0
neg recall: 0.001
neg F-measure: 0.001998001998001998


# Confusion Matrix

In [16]:
"""Confusion Matrix is a table that is used to describe the performance of the classifier.
Confusion Matrix is represented in the following format:
    '''
| Predicted NO | Predicted YES |
-----------+---------------------+---------------------+
Actual NO | True Negative (TN) | False Positive (FP) |
Actual YES | False Negative (FN) | True Positive (TP) |
-----------+---------------------+---------------------+
'''
The following output of the confusion matrix shows the following performance of our trained classifier:
– 761 negative tweets were correctly classified as negative (TN)
– 239 negative tweets were incorrectly classified as positive (FP )
– 231 positive tweets were incorrectly classified as negative (FN)
– 769 positive tweets were correctly classified as positive (TP)"""
    

"Confusion Matrix is a table that is used to describe the performance of the classifier.\nConfusion Matrix is represented in the following format:\n    '''\n| Predicted NO | Predicted YES |\n-----------+---------------------+---------------------+\nActual NO | True Negative (TN) | False Positive (FP) |\nActual YES | False Negative (FN) | True Positive (TP) |\n-----------+---------------------+---------------------+\n'''\nThe following output of the confusion matrix shows the following performance of our trained classifier:\n– 761 negative tweets were correctly classified as negative (TN)\n– 239 negative tweets were incorrectly classified as positive (FP )\n– 231 positive tweets were incorrectly classified as negative (FN)\n– 769 positive tweets were correctly classified as positive (TP)"

In [17]:
cm=ConfusionMatrix(actual_set_cm, predicted_set_cm)
print(cm)
print(cm.pretty_format(sort_by_count=True, show_percents=True,truncate=9))


ValueError: Lists must have the same length.