In [1]:
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
from nlp import preprocess_tweet, get_freq

# 1 - Data processing

In [2]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

X_train = positive_tweets[:4000] + negative_tweets[:4000] 
X_test = positive_tweets[4000:] + negative_tweets[4000:]

y_train = np.append(np.ones((4000)), np.zeros((4000)))
y_test = np.append(np.ones((1000)), np.zeros((1000)))

In [3]:
train_dict = get_freq(X_train,y_train)

# 2 - Naive Bayes

### 2.1 Log Prior
The prior probability represents the underlying probability in the target population that a tweet is positive versus negative.

$$D = document$$

$$D_{neg} = negative{\:}document$$

$$D_{pos} = positive{\:}document$$

$$P(D_{neg}) = \frac{D_{neg}}{D}$$
                                     
                                     
$$Prior Ratio = \frac{P(D_{pos})}{P(D_{neg})}$$


$$logPrior = log(Prior Ratio) = log(\frac{P(D_{pos})}{P(D_{neg})})$$

In this dataset the logPrior is 0, because we have the same number of (+) and (-) tweets

### 2.2 Positive and Negative Probability of a Word 

$$freq_{pos} = word{\:}count{\:}in{\:}the{\:}positive{\:}class{\:}$$

$$freq_{neg} = word{\:}count{\:}in{\:}the{\:}negative{\:}class{\:}$$

$$N_{pos} = number{\:}of{\:}possitive{\:}classes$$

$$N_{neg} = number{\:}of{\:}negative{\:}classes$$

$$V = number{\:}of{\:}words$$

$$P(W_{pos}) = \frac{freq_{pos}+1}{N_{pos}+V}$$

$$P(W_{neg}) = \frac{freq_{neg}+1}{N_{neg}+V}$$

$$Log{\:}likelihood = log(\frac{P(W_{pos})}{P(W_{neg})})$$

$$p_{W} = logPrior + Log{\:}likelihood$$

In [4]:
def p_words(tweets, labels, freq):
    '''
    returns the p for each word found in the tweets
    '''
    '''
    Preprocess
    '''
    tweets = [preprocess_tweet(tweet) for tweet in tweets]
    '''
    Log Prior parameters
    '''
    d = len(labels)
    d_pos = sum(labels)
    d_neg = len(labels) - d_pos
    log_prior = np.log((d_pos/d)/(d_neg/d))
    
    '''
    Log Likelihood parameters
    '''
    N_pos = 0
    N_neg = 0
    
    for pair in freq.keys():
        if pair[1] == 1:
            N_pos += 1
        else:
            N_neg += 1
        
    V = N_pos + N_neg
    
    vocab = [pair[0] for pair in freq.keys()]
    
    p = []

    for word in vocab:
        if (word,1) in freq.keys():
            p_word_pos = ((freq[(word,1)] + 1) + 1) / (N_pos + V)
        else:
            p_word_pos = ((0 + 1) + 1) / (N_pos + V)
            
        if (word,0) in freq.keys():
            p_word_neg = ((freq[(word,0)] + 1) + 1) / (N_neg + V)
        else:
            p_word_neg = ((0 + 1) + 1) / (N_pos + V)
            
        p.append(np.log(p_word_pos/p_word_neg) + log_prior)
        
           
    return p, vocab

In [5]:
p, vocab = p_words(X_train,y_train,train_dict)

In [6]:
print("The word",vocab[0],"has a log likelihood of", p[0])

The word followfriday has a log likelihood of 2.5257286443082556


In [7]:
def test_naive_bayes(tweets,labels,p,vocab):
    '''
    predict if a tweets is positive or negative with the p and vocab already trained
    '''
    '''
    Preprocess
    '''
    tweets = [preprocess_tweet(tweet) for tweet in tweets]
    
    predictions = []
    
    for tweet in tweets:
        
        p_tweet = 0
        
        for word in tweet:
            if word in vocab:
                idx = vocab.index(word)
                p_tweet += p[idx]
            
        if p_tweet > 0:
            predictions.append(1)
        else:
            predictions.append(0)
    
    accuracy = sum([predictions[i] == labels[i] for i in range(len(predictions))]) / len(predictions)
    
    return predictions, accuracy

In [8]:
predictions, accuracy = test_naive_bayes(X_test,y_test,p,vocab)

In [9]:
accuracy

0.997

In [10]:
test_tweets = ["I hated this movie, it was horrible.",
               "He seems to be very happy, I am glad about that",
               "I can't stand pop music, it makes me angry",
               "I am very happy because I think this Naive Bayes classificator is going to work very well"]
test_labels = np.array([0,1,0,1])
predictions, accuracy = test_naive_bayes(test_tweets,test_labels,p,vocab)

In [11]:
predictions, accuracy

([0, 1, 0, 1], 1.0)