## Sentiment analysis using Naive bayes

In [64]:
import pandas as pd
import numpy as np
import nltk
import pdb
from nltk.corpus import stopwords, twitter_samples
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

nltk.download("twitter_samples")
nltk.download("stopwords")

[nltk_data] Downloading package twitter_samples to C:\Users\SB
[nltk_data]     INFO\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SB
[nltk_data]     INFO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [66]:

all_pos_tweets=twitter_samples.strings("positive_tweets.json")
all_neg_tweets=twitter_samples.strings("negative_tweets.json")

train_pos=all_pos_tweets[:4000]
test_pos=all_pos_tweets[4000:]
train_neg=all_neg_tweets[:4000]
test_neg=all_neg_tweets[4000:]

train_x=train_pos+train_neg
test_x=test_pos+test_neg

train_y=np.append(np.ones(len(train_pos)),np.zeros(len(train_neg)))
test_y=np.append(np.ones(len(test_pos)),np.zeros(len(test_neg)))



### preprocessing

In [67]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

import numpy as np # Library for linear algebra and math utils


def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    #tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


In [68]:
custom="@khushi i am happu to see you! i am http://chapagain.com.np"

# process tweet
print(process_tweets(custom))

['happu', 'see']


### building freq

In [69]:
def count_tweets(tweets,ys):
    result={}
    for y,tweet in zip(ys,tweets):
        for word in process_tweets(tweet):
            pair=(word,y)
            
            if pair in result:
                result[pair]+=1
                
            else:
                result[pair]=1
                
    return result
            

In [70]:
tweet=['i happy','you are good','well done','happy to see you','happy for you']
ys = [1, 0, 0, 0, 0]
count_tweets(tweet,ys)

{('happi', 1): 1,
 ('good', 0): 1,
 ('well', 0): 1,
 ('done', 0): 1,
 ('happi', 0): 2,
 ('see', 0): 1}

In [71]:
def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

### training model 

In [72]:
freqs=count_tweets(train_x,train_y)

In [73]:
def training(freqs,train_x,train_y):
    
    loglikelihood={}
    logprior=0
    
    vocab=set([pair[0] for pair in freqs.keys()])
#     vocab=set([pair[0] for pair in freqs.keys()])
    V=len(vocab)
    
    V_pos=V_neg=N_pos=N_neg=0
    
    for pair in freqs.keys():
        if pair[1]>0:
            V_pos+=1
            N_pos=freqs[pair]
        else:
            V_neg+=1
            N_neg=freqs[pair]
    
    for word in vocab:
        freq_pos=lookup(freqs,word,1)
        freq_neg=lookup(freqs,word,0)
        
        p_pos=(freq_pos+1)/(N_pos+V)
        p_neg=(freq_neg+1)/(N_neg+V)
        
        
        loglikelihood[word]=np.log(p_pos/p_neg)
    
    D=len(train_x)
    
    D_pos=len(list(filter(lambda x: x>0,train_y)))
    D_neg=len(list(filter(lambda x: x<=0,train_y)))
    
    logprior=np.log(D_pos)-np.log(D_neg)
    
    
    return logprior, loglikelihood
    

In [74]:
logprior, loglikelihood = training(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9161


In [75]:
type(freqs)

dict

### testing

In [76]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    
    word_l=process_tweets(tweet)
    
    p=0
    
    p+=logprior
    
    for word in word_l:
        if word in loglikelihood:
            p+=loglikelihood[word]
            
    return p

In [77]:
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5686159179138455


In [78]:
def test_naive_bayes(test_y,test_x,log_prior,loglikelihood, naive_bayes_predict=naive_bayes_predict):
    accuracy=0
    y_hats=[]
    
    for tweet in test_x:
        if naive_bayes_predict(tweet,logprior,loglikelihood)>0:
            y_hat_i=1
        else:
            y_hat_i=0
        
        y_hats.append(y_hat_i)
        
    error=np.mean(np.absolute(y_hats-test_y))
    
    accuracy=1-error
    
    return accuracy
    

In [79]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes( test_y,test_x, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9955
