The twitter_samples corpus contains 3 files.

1) negative_tweets.json: contains 5k negative tweets
2) positive_tweets.json: contains 5k positive tweets
3) tweets.20150430-223406.json: contains 20k positive and negative tweets

In [1]:
from nltk.corpus import twitter_samples

# Show the names of the JSON files that ship with the corpus.
corpus_file_ids = twitter_samples.fileids()
print(corpus_file_ids)

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [2]:
# Load the raw tweet text from each corpus file.
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')

# Report how many tweets each file holds (5000, 5000 and 20000).
for tweet_list in (pos_tweets, neg_tweets, all_tweets):
    print(len(tweet_list))

# Peek at the first five positive tweets.
for tweet in pos_tweets[:5]:
    print(tweet)


5000
5000
20000
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


Three parameters can be passed when constructing a TweetTokenizer:

preserve_case: if False, the tweet is converted to lowercase; if True, the original case is kept.
strip_handles: if True, Twitter handles (@username) are removed from the tweet; if False, they are kept.
reduce_len: if True, runs of a repeated character are shortened (e.g. "yeaaaah" becomes "yeaaah"); if False, words such as hurrayyyy or yipppiieeee are left unchanged.

In [3]:
from nltk.tokenize import TweetTokenizer

# Lowercase the text, drop @handles and shorten over-long character runs.
tweet_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

# Tokenize the first five positive tweets and show the resulting tokens.
for sample in pos_tweets[:5]:
    tokens = tweet_tokenizer.tokenize(sample)
    print(tokens)

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


#clean the tweets 
– Remove stock market tickers like $GE
– Remove retweet text “RT”
– Remove hyperlinks
– Remove hashtags (only the hashtag # and not the word)
– Remove stop words like a, and, the, is, are, etc.
– Remove emoticons like :), :D, :(, :-), etc.
– Remove punctuation like full-stop, comma, exclamation sign, etc.
– Convert words to Stem/Base words using Porter Stemming Algorithm. E.g. words like ‘working’, ‘works’, and ‘worked’ will be converted to their base/stem word “work”.

In [4]:
import string
import re
 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
from nltk.tokenize import TweetTokenizer
 
# Emoticons that signal a positive mood.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons that signal a negative mood.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon (happy + sad), used to filter them out of tweets.
emoticons = emoticons_happy | emoticons_sad
 
def clean_tweets(tweet):
    """Normalise a raw tweet into a list of stemmed tokens.

    Steps, in order: strip stock tickers (``$GE``), a leading old-style
    ``RT`` marker, hyperlinks and ``#`` signs; tokenize with
    ``TweetTokenizer`` (``preserve_case=False``, ``strip_handles=True``,
    ``reduce_len=True``); discard stopwords, emoticons and punctuation;
    stem the remaining tokens with the module-level ``stemmer``.

    Args:
        tweet: The tweet text as a string.

    Returns:
        A list of stemmed tokens, in the order they appear in the tweet.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    # Match only the URL itself (up to the next whitespace). A greedy ``.*``
    # here would also delete every word that follows a link on the same line.
    tweet = re.sub(r'https?://\S*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep only tokens that are not stopwords, emoticons or punctuation,
    # and reduce each survivor to its stem.
    # NOTE: ``in string.punctuation`` is a substring test on a str.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english
        and word not in emoticons
        and word not in string.punctuation
    ]
 
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# Run the sample tweet through the cleaning pipeline and show the tokens.
cleaned_tokens = clean_tweets(custom_tweet)
print(cleaned_tokens)

['hello', 'great', 'day', 'good', 'morn']


In [5]:
# Show the raw text of the positive tweet at index 5.
print (pos_tweets[5])

@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM


In [6]:

# Show the same tweet after it has been passed through clean_tweets().
print (clean_tweets(pos_tweets[5]))

['one', 'irresist', 'flipkartfashionfriday']


In [7]:
# feature extractor function
def bag_of_words(tweet):
    """Turn a tweet into a presence-only feature dictionary.

    The tweet is cleaned with ``clean_tweets`` and every resulting token
    becomes a key mapped to ``True``; repeated tokens collapse into one key.
    """
    return dict.fromkeys(clean_tweets(tweet), True)
 
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# Show the feature dictionary produced for the sample tweet.
sample_features = bag_of_words(custom_tweet)
print(sample_features)

{'hello': True, 'great': True, 'day': True, 'good': True, 'morn': True}


In [8]:
# Pair each positive tweet's feature dictionary with the label 'pos'.
pos_tweets_set = [(bag_of_words(text), 'pos') for text in pos_tweets]

# Pair each negative tweet's feature dictionary with the label 'neg'.
neg_tweets_set = [(bag_of_words(text), 'neg') for text in neg_tweets]

print(len(pos_tweets_set), len(neg_tweets_set))

5000 5000


In [9]:
#create test and train set

In [10]:
# Randomize pos_tweets_set and neg_tweets_set in place.
# No seed is set, so the split (and therefore the accuracy) differs on every run.
from random import shuffle

for labelled_tweets in (pos_tweets_set, neg_tweets_set):
    shuffle(labelled_tweets)

# Hold out the first 1000 shuffled tweets of each class for testing and
# train on the remaining ones.
HOLDOUT_PER_CLASS = 1000
test_set = pos_tweets_set[:HOLDOUT_PER_CLASS] + neg_tweets_set[:HOLDOUT_PER_CLASS]
train_set = pos_tweets_set[HOLDOUT_PER_CLASS:] + neg_tweets_set[HOLDOUT_PER_CLASS:]

print(len(test_set), len(train_set))  # Output: 2000 8000

2000 8000


In [11]:
#training the naive bayes classifier

In [12]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Fit a Naive Bayes model on the labelled bag-of-words feature sets.
classifier = NaiveBayesClassifier.train(train_set)

# Fraction of held-out tweets that are labelled correctly. The exact value
# changes from run to run because the train/test split is shuffled randomly.
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
classifier.show_most_informative_features(10)

0.741
Most Informative Features
                     sad = True              neg : pos    =     39.0 : 1.0
                   arriv = True              pos : neg    =     37.0 : 1.0
                     bam = True              pos : neg    =     26.3 : 1.0
                     x15 = True              neg : pos    =     17.0 : 1.0
                    glad = True              pos : neg    =     15.8 : 1.0
                 appreci = True              pos : neg    =     15.7 : 1.0
                  commun = True              pos : neg    =     15.0 : 1.0
                      aw = True              neg : pos    =     14.2 : 1.0
                opportun = True              pos : neg    =     13.7 : 1.0
                    miss = True              neg : pos    =     12.6 : 1.0
None


In [13]:
#testing classifier with a custom tweet
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)

# A clearly negative review; the classifier is expected to answer "neg".
predicted_label = classifier.classify(custom_tweet_set)
print(predicted_label)
 

neg


In [14]:
# Probability distribution over the labels for the custom tweet.
prob_result = classifier.prob_classify(custom_tweet_set)
print(prob_result)        # the distribution object itself
print(prob_result.max())  # the most probable label

# Probability assigned to each label, negative first.
for label in ("neg", "pos"):
    print(prob_result.prob(label))
 

<ProbDist with 2 samples>
neg
0.8883737135505989
0.11162628644939976


In [15]:
 
custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)

# A clearly positive review; the classifier is expected to answer "pos".
predicted_label = classifier.classify(custom_tweet_set)
print(predicted_label)

# Probability distribution over the labels for this tweet.
prob_result = classifier.prob_classify(custom_tweet_set)
print(prob_result)        # the distribution object itself
print(prob_result.max())  # the most probable label

# Probability assigned to each label, negative first.
for label in ("neg", "pos"):
    print(prob_result.prob(label))

pos
<ProbDist with 2 samples>
pos
0.0006600524667181052
0.9993399475332831


In [16]:
#testing the model accuracy, deciding the parameters
#Accuracy = (TP + TN) / (TP + TN + FP + FN)
#Precision = (TP) / (TP + FP)
#Recall = (TP) / (TP + FN)
#F1 Score = 2 * (precision * recall) / (precision + recall)

In [19]:
from collections import defaultdict

# Flat label sequences in test-set order; these feed the confusion matrix.
actual_set_cm = [label for _, label in test_set]
predicted_set_cm = [classifier.classify(features) for features, _ in test_set]

# label -> set of test-set indices carrying that label, once according to
# the gold labels and once according to the classifier's predictions.
actual_set = defaultdict(set)
predicted_set = defaultdict(set)

for index, (actual_label, predicted_label) in enumerate(zip(actual_set_cm, predicted_set_cm)):
    actual_set[actual_label].add(index)
    predicted_set[predicted_label].add(index)
    

In [20]:
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix

# Per-class scores. Each metric compares the set of test indices whose true
# label is `label` with the set of indices the classifier assigned to it.
# The printed values vary between runs because the split is random.
for label in ('pos', 'neg'):
    reference_ids = actual_set[label]
    predicted_ids = predicted_set[label]
    print(label + ' precision:', precision(reference_ids, predicted_ids))
    print(label + ' recall:', recall(reference_ids, predicted_ids))
    print(label + ' F-measure:', f_measure(reference_ids, predicted_ids))

pos precision: 0.7372047244094488
pos recall: 0.749
pos F-measure: 0.7430555555555555
neg precision: 0.7449186991869918
neg recall: 0.733
neg F-measure: 0.7389112903225806


In [21]:
#making the confusion matrix

'''
           |   Predicted NO      |   Predicted YES     |
-----------+---------------------+---------------------+
Actual NO  | True Negative (TN)  | False Positive (FP) |
Actual YES | False Negative (FN) | True Positive (TP)  |
-----------+---------------------+---------------------+
'''

'\n           |   Predicted NO      |   Predicted YES     |\n-----------+---------------------+---------------------+\nActual NO  | True Negative (TN)  | False Positive (FP) |\nActual YES | False Negative (FN) | True Positive (TP)  |\n-----------+---------------------+---------------------+\n'

In [22]:
# Build the confusion matrix from the actual labels (rows) and the
# classifier's predicted labels (columns), then print the raw counts.
cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
print (cm)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<733>267 |
pos | 251<749>|
----+---------+
(row = reference; col = test)



In [23]:
# Print the same matrix with percentages instead of counts.
print (cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      n      p |
    |      e      o |
    |      g      s |
----+---------------+
neg | <36.6%> 13.3% |
pos |  12.6% <37.5%>|
----+---------------+
(row = reference; col = test)



The output can be interpreted as (the exact counts change on every run because the train/test split is shuffled randomly):
– 733 negative tweets were correctly classified as negative (TN)
– 267 negative tweets were incorrectly classified as positive (FP)
– 251 positive tweets were incorrectly classified as negative (FN)
– 749 positive tweets were correctly classified as positive (TP)