## Twitter Sentiment Analysis

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
import pickle

#### 1. Tokenizing
Spliting the sentence into words (word_tokenize) the paragraph into lines (sent_tokenize)

In [169]:
example = 'Natural Programming Language has a great future.'
print(word_tokenize(example))

['Natural', 'Programming', 'Language', 'has', 'a', 'great', 'future', '.']


In [170]:
example = 'Winter is coming. I need a coat'
print(sent_tokenize(example))

['Winter is coming.', 'I need a coat']


#### 2. Stop words
Stopwords are the words which have no significant effect on the meaning of a sentence.

In [171]:
stop_words = set(stopwords.words('english'))
example = 'I am a big fan of the motorsport'
words = word_tokenize(example)
for word in words:
    if word not in stop_words:
         print(word)

I
big
fan
motorsport


#### 3. Part of Speed tagging
We use part of speech tagging to tag the entity like a verb, adjective, noun, adverb, preposition etc.

In [172]:
example = "Thank you very much. Mr. Speaker, Mr. President, distinguished members of Congress, honored guests and fellow citizens. May I congratulate all of you who are members of this historic 100th           Congress of the United States of America. In this 200th anniversary year of our Constitution, you and I stand on the shoulders of giants–men whose words and deeds put wind in the sails of freedom."
tagged = []
words = []
for lines in sent_tokenize(example):
    for word in word_tokenize((lines)):
        words.append(word)
tagged.append(nltk.pos_tag(words))
print(tagged)
    

[[('Thank', 'NNP'), ('you', 'PRP'), ('very', 'RB'), ('much', 'RB'), ('.', '.'), ('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Mr.', 'NNP'), ('President', 'NNP'), (',', ','), ('distinguished', 'VBD'), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('honored', 'VBD'), ('guests', 'NNS'), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), ('.', '.'), ('May', 'NNP'), ('I', 'PRP'), ('congratulate', 'VBP'), ('all', 'DT'), ('of', 'IN'), ('you', 'PRP'), ('who', 'WP'), ('are', 'VBP'), ('members', 'NNS'), ('of', 'IN'), ('this', 'DT'), ('historic', 'JJ'), ('100th', 'JJ'), ('Congress', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('of', 'IN'), ('America', 'NNP'), ('.', '.'), ('In', 'IN'), ('this', 'DT'), ('200th', 'CD'), ('anniversary', 'JJ'), ('year', 'NN'), ('of', 'IN'), ('our', 'PRP$'), ('Constitution', 'NNP'), (',', ','), ('you', 'PRP'), ('and', 'CC'), ('I', 'PRP'), ('stand', 'VBP'), ('on', 'IN'), ('the', 'DT'), ('shoulders', 'NNS'), ('o

#### 4. Stemming
Stripping down a word to its root form

In [173]:
ps = PorterStemmer()
example_words = ['python', 'pythoning', 'pythoned', 'pythoner']
for w in example_words:
    print(ps.stem(w))

# makes the program consume less memory

python
python
python
python


#### 5. Frequency Distribution
To know the occurrence of a word in an article or most common x words in an article.

In [174]:
example = "Thank you very much. Mr. Speaker, Mr. President, distinguished members of Congress, honored guests and fellow citizens. May I congratulate all of you who are members of this historic 100th Congress of the United States of America. In this 200th anniversary year of our Constitution, you and I stand on the shoulders of giants–men whose words and deeds put wind in the sails of freedom."
words = [ ]
for lines in sent_tokenize(example):
    for word in word_tokenize((lines)):
        words.append(word)
words_dist = (nltk.FreqDist(words))
print(words_dist.most_common(5))

[('of', 8), ('.', 4), (',', 4), ('you', 3), ('and', 3)]


#### 6. Wordnet
Wordnet is a huge collection of synsets, meanings, definition, examples, synonyms, antonyms etc

In [175]:
from nltk.corpus import wordnet

In [176]:
#Synsets
words = wordnet.synsets('big')
words

[Synset('large.a.01'),
 Synset('big.s.02'),
 Synset('bad.s.02'),
 Synset('big.s.04'),
 Synset('big.s.05'),
 Synset('big.s.06'),
 Synset('boastful.s.01'),
 Synset('big.s.08'),
 Synset('adult.s.01'),
 Synset('big.s.10'),
 Synset('big.s.11'),
 Synset('big.s.12'),
 Synset('big.s.13'),
 Synset('big.r.01'),
 Synset('boastfully.r.01'),
 Synset('big.r.03'),
 Synset('big.r.04')]

In [177]:
#Definition
print(words[0].definition())

above average in size or number or quantity or magnitude or extent


In [178]:
#Examples
print(words[0].examples())

['a large city', 'set out for the big city', 'a large sum', 'a big (or large) barn', 'a large family', 'big businesses', 'a big expenditure', 'a large number of newspapers', 'a big group of scientists', 'large areas of the world']


In [179]:
#Synonyms and Antonyms
synonyms = []
antonyms = []
for words in wordnet.synsets('big'):
    for word in words.lemmas():
        synonyms.append(word.name())
        if word.antonyms():
            antonyms.append(word.antonyms()[0].name())
print('synonyms: {}'.format(set(synonyms)))
print('\n')
print('antonyms: {}'.format(set(antonyms)))
        

synonyms: {'bragging', 'great', 'bighearted', 'with_child', 'large', 'vainglorious', 'magnanimous', 'giving', 'braggart', 'handsome', 'swelled', 'bad', 'self-aggrandizing', 'freehanded', 'adult', 'heavy', 'gravid', 'vauntingly', 'openhanded', 'enceinte', 'fully_grown', 'grown', 'prominent', 'self-aggrandising', 'crowing', 'boastful', 'big', 'cock-a-hoop', 'boastfully', 'liberal', 'braggy', 'expectant', 'full-grown', 'grownup', 'bountiful', 'bounteous'}


antonyms: {'small', 'little'}


## Preprocessing the Data and Pickle
We will load positive and negative movie reviews, split them into words and then process it in such a way that efficiency will increase. After the completion of the process we will pickle it so that we don’t need to process it again and again.

In [180]:
stop_words = set(stopwords.words('english'))

In [181]:
# Here we are loading the datasets which consist of positive and negative reviews.
neg_rev = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\negative.txt', 'rb').read()
pos_rev = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\positive.txt', 'rb').read()

# First of all, we split the data by a new line using splitlines() then save in pos and neg.
pos = []
neg = []
for rev in pos_rev.splitlines():
    pos.append(rev)
for rev in neg_rev.splitlines():
    neg.append(rev)
    
# Convert the lines into words using word_tokenize() function and save these words in pos_words and neg_words.
pos_words = []
neg_words = []

for pos_line in pos:
    pos_words.append(word_tokenize(str(pos_line)))
for neg_line in neg:
    neg_words.append(word_tokenize(str(pos_line)))

# Since the words are arranged in a list of list, then first we iterate to each list or line 
# and then each word in that line and save the words in pos_word_new and neg_word_new.
pos_words_new = []
neg_words_new = []

for line in pos_words:
    for words in line:
        pos_words_new.append(words)
        
for line in neg_words:
    for words in line:
        neg_words_new.append(words)

# Now we have all the words that are in the datasets but the number is huge and most of the words are stopwords
# Remove stopwords and save this new list of refined words in pos_words_new_stopwords and neg_words_new_stopwords.
pos_words_new_stopwords = []
neg_words_new_stopwords = []

for words in pos_words_new:
    if words not in stop_words:
        pos_words_new_stopwords.append(words)
        
for words in neg_words_new:
    if words not in stop_words:
        neg_words_new_stopwords.append(words)

At this moment we have a refined list of words which belong to positive and negative reviews but still, there are some words like actors names, car names and all sort of words which are redundant for us. So, in this step, we are tagging the words.
We are only interested in Adjective(JJ), because they are the main words which show the positivity or negativity of a sentence, like awesome, worst, good, bad etc. then save them to pos_adj and neg_adj.

In [182]:
tagged_pos = []
tagged_neg = []
pos_adj = []
neg_adj = []

tagged_pos.append(nltk.pos_tag(pos_words_new_stopwords))
for i in range(len(tagged_pos[0])):
    if tagged_pos[0][i][1] == 'JJ':
        try:
            pos_adj.append((tagged_pos[0][i][1]))
        except Exception as e:
            print(str(e))
            
tagged_neg.append(nltk.pos_tag(neg_words_new_stopwords))
for i in range(len(tagged_neg[0])):
    if tagged_neg[0][i][1] == 'JJ':
        neg_adj.append((tagged_neg[0][i][1]))
    

Still, there are many words which are in both positive set and negative set, but the number of occurrence plays an important role. For example:

1. This movie is good
2. This movie is not good,

One is positive but another is negative but 'good' is in both the sets. If we take a singular instance of each word for each set then “good” become null and void, so to counter this, if any word is in the positive set and in the negative set then we remove it in both the sets.

For example, if there are 30 “good” in positive set but 5 “good” in negative set then we will end up with 25 “good” in positive set and 0 “good” in negative set. This will reduce words in our data set and increase the speed.

In [183]:
for i in pos_adj:
    for i in neg_adj:
        pos_adj.remove(i)
        neg_adj.remove(i)

We will now find the synonyms for each word in positive data set and append these synonyms words in a new list pos_syn and neg_syn.

In [184]:
pos_syn = []
neg_syn = []

for words in pos_adj:
    for syn in wordnet.synsets(words):
        for syn_word in syn.lemmas():
            pos_syn.append(syn_word.name)
            
for words in neg_adj:
    for syn in wordnet.synsets(words):
        for syn_word in syn.lemmas():
            neg_syn.append(syn_word.name)

In this step, we are using the set() function. This function reduces any number of instance of a word to one. Like if the data contains 401 words out of which 400 “good” word and 1 “bad” word then after this operation, the dataset will contain only 2 words; “good” and “bad”.  

Note that we are only applying the set() function on synonym words. We are also converting set dataset into a list using type casting

In [185]:
pos_syn = list(set(pos_syn))
neg_syn = list(set(neg_syn))

for words in pos_adj:
    pos_syn.append(words)
for words in neg_adj:
    neg_syn.append(words)

In this step, we are using the FreqDist() function from NLTK. It returns the word and number of occurrence of that word in the list.
By using dict typecasting, we will get a dictionary in which the key is the word and value is the number of occurrences.
We need the number of occurrences to increase the weight of a word in the analysis.

In [186]:
pos_adj_FreqDist = dict(nltk.FreqDist(pos_syn))
neg_adj_FreqDist = dict(nltk.FreqDist(neg_syn))

We will now search for the same words in both the dictionary and if found then we delete it and adjust the values, just like we did in cell 77

In [187]:
pos_dict = {}
neg_dict = {}
count = 0

for key1, value1 in pos_adj_FreqDist.items():
    for key2, value2 in neg_adj_FreqDist:
        if key1 == key2:
            count += 1
            if(value1 > value2):
                value1 = value1 - value2
                value2 = 0
                pos_dict.update({key1:value1})
            elif (value2 > value1):
                value2 = value2 - value1
                value1 = 0
                neg_dict.update({key2:value2})
                

Now there are many words which may be a name, or address, or anything else which don’t contribute any part in the analysis. So for this, we are again tagging each word with its part of speech, and extracting the adjective as we did earlier just to double check.

In [188]:
# we assign a part of speech tag to the keys of positive & negative dictionry and save them into a list.
tagged_neg_dict = []
tagged_pos_dict = []
tagged_neg_dict_list = []
tagged_pos_dict_list = []

tagged_pos_dict.append(nltk.pos_tag(pos_dict.keys()))
for i in range(len(tagged_pos_dict[0])):
    if tagged_neg_dict_list[0][i][1]=="JJ":
        tagged_pos_dict_list.append(tagged_pos_dict[0][i][0])
                                                       
tagged_neg_dict.append(nltk.pos_tag(neg_dict.keys()))
for i in range(len(tagged_neg_dict[0])):
    if tagged_neg_dict[0][i][1]=="JJ":
        tagged_neg_dict_list.append(tagged_neg_dict[0][i][0])

Now we have a list of adjectives, in this step, we are making a combination of these adjectives and number of occurrence.
Since we have pos_dict and neg_dict in which there are all the words and values, we will now find these adjectives in this list and if found then copy the values and making a new and final dictionary.     

In [189]:
pos_dict_updated = []
neg_dict_updated = []

for key1, value1 in pos_dict.items():
    for i in range(len(tagged_pos_dict_list)):
        if key1 == tagged_pos_dict_list[1]:
            pos_dict_updated.update({key1:value1})

for key1, value1 in neg_dict.items():
    for i in range(len(tagged_neg_dict_list)):
        if key1 == tagged_neg_dict_list[1]:
            neg_dict_updated.update({key1:value1})


Finally, we have a dictionary in which key is the adjective words and value is the number of occurrences. In this step, we are pickling these dictionaries so that we don’t have to make them again and again.

In [190]:
pickle_in = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\pos-yy_adj.pickle', 'wb')
pickle.dump(pos_dict_updated, pickle_in)
pickle_in.close()

pickle_in = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\neg-yy_adj.pickle', 'wb')
pickle.dump(neg_dict_updated, pickle_in)
pickle_in.close()

## Creating a Predicting Function and testing it.
Create a function to predict the nature of sentences. Our approach will be to split the words in the sentence and then count the number of occurrence of these words. If the sentence contains more positive words then it’s a positive type, else it will consider as negative type,

In [230]:
# Loading our pickles which we saved previously and then saving to a variable called pos_adj and neg_adj.
pickle_out = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\pos-yy_adj.pickle', 'rb')
pos_adj = pickle.load(pickle_out)
pickle_out = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\neg-yy_adj.pickle', 'rb')
neg_adj = pickle.load(pickle_out)

Here we are creating a function which we will use to predict the nature of tweets, we call this function “check” then we initialize counter variables, which we will use later.

In [238]:
def predict(example):
    pos_count = 0
    neg_count = 0
    
# spliting the lines into words. and saving them in ex_words.
    ex_words = word_tokenize(example)

# Iterating each word in the list of ex_words and then search if the word is in stop words or not.
# If not we search the occurrence of the word in our pos_dict key set, if found we add the value of occurrence with pos_count. 
# Similarly, we search for the word in the negative set and add the value with neg_count.
    global count
    for ex_word in ex_words:
        if ex_word.lower() not in stop_words:
            for key, value in pos_dict.items():
                if key == ex_word.lower():
                    pos_count += value
            for key, value in neg_dict.items():
                if key == ex_word.lower():
                    neg_count += value
    
    if pos_count>neg_count:
        conf=pos_count-neg_count
        checker="pos"
                     
    elif pos_count<neg_count:
        conf=neg_count-pos_count
        checker="neg"
        
    elif pos_count==neg_count:
        checker="None"
        conf=0
        
# In the end, we return two variables, checker & conf. 
# checker is pos, neg or none, while conf is difference between pos_count & neg_count.
# In case of None, the conf becomes zero.
    return checker, conf


ABOVE: if the value of pos_count is greater then the neg_count then we can conclude that sentence has more positive words thus the sentence is positive type similarly we can check for negative.
We also calculate the difference between both the counters and save it in confidence or "conf" variable. Higher the conf value, higher the accuracy, We can use it to add another layer of reliability.

Now if the pos_count and neg_count become equal then we conclude that the sentence is neutral or doesn’t have words which matches with our pos and neg dictionary so we toss them out and mark as ‘None’.

In [239]:
# Here we have three examples. Lets check the prediction values.
example_1 = "The movie is just a waste of time, it's complete junk, Totally waste of money."
example_2 = "The food of this restaurant is very good, I will recommend this place to everyone."
example_3 = "This is a low-quality product, even the reviews of this product is very poor."

print(predict(example_1))
print(predict(example_2))
print(predict(example_3))

('None', 0)
('None', 0)
('None', 0)


## Loading Tweets from Twitter using Tweepy

In [243]:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json

In [244]:
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''

From predict function we tested we have three labels which we can assign to a tweet, if the tweet has more positive words, we will assign “pos”, if the tweet has more negative word then we assign “neg” and if in case the tweet contains both positive and negative value in equal amount then we  assign “None” label to the particular tweet.


In [245]:
def predict(example):
    pos_count = 0
    neg_count = 0
    
    ex_words = word_tokenize(example)

    for ex_word in ex_words:
        if ex_word.lower() not in stop_words:
            for key, value in pos_dict.items():
                if key == ex_word.lower():
                    pos_count += value
            for key, value in neg_dict.items():
                if key == ex_word.lower():
                    neg_count += value
    
    if pos_count>neg_count:
        conf=pos_count-neg_count
        checker="pos"
                     
    elif pos_count<neg_count:
        conf=neg_count-pos_count
        checker="neg"
        
    elif pos_count==neg_count:
        checker="None"
        conf=0
    
    return checker, conf

After assigning the labels,  we open a file and save these labels in it. We will use this file to read the labels and plot the graph in real time.

In [246]:
try:
    os.remove(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\test-yy_twitter.txt')    
except:
    output = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\test-yy_twitter.txt',"a")

We will now create a class which we will use to load the tweets, now there are two functions is the class. First one is on_data which we will use to defining the tweets variable and doing analysis, second is on_error which we will use if we encounter any error.

In [None]:
class listener(StreamListener):
    
    def on_data(self, data):
        tweets = json.loads(data)
        tweet = tweets['text']
        created_date = tweets['created_at']
        label, con = check(tweet)
        output = open(r'C:\Users\Hp\Desktop\Data Analysis\Sentiment Analysis\test-yy_twitter.txt',"a")
        output.write(label)
        output.write('\n')
        output.close()
        return True
    
    def on_error(self, status):
        if status == '420':
            print(status)
            print('Multiple Connections, Try again after sometime')
        else:
            print(status)
        
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track = ['movie'])
output.close