### Followed the "Practical-7 Sentiment Analysis - Lab" and reused a few of its functions to complete this project

In [1]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random                              # pseudo-random number generator

import numpy as np
import pandas as pd

import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

### i. Collect 50 Twitter posts
### ii. Classify those posts according to their impact on society as positive/negative (1/0)

In [2]:
# Load the labelled tweets; later cells read the `tweet` and `label` columns.
df  = pd.read_csv("dataset.csv")
# Preview the first rows of the frame.
df.head()

Unnamed: 0,tweet,label
0,@AmericanAir thank you for quick responses. ...,1
1,@AmericanAir I wait 2+ hrs for CS to call me b...,0
2,@AmericanAir I can't even get on the phone wit...,0
3,@AmericanAir Hmm. Looks like you looked at my ...,1
4,@americanair @bershawnjackson big UPS to Newar...,1


In [3]:
# Tweet texts as a plain Python list.
tweets = df['tweet'].to_list()

# Labels as a NumPy array so they support element-wise comparisons such as `labels == 1`.
labels = df['label'].to_numpy()

# Show the first five tweets alongside their labels.
tweets[:5], labels[:5]

(["@AmericanAir thank you for quick responses.   #aa usually has fantastic customer service. That's why I was so shocked when it wasn't there",
  '@AmericanAir I wait 2+ hrs for CS to call me back re why  flt is cxld/protection &amp; they hang up the minute I answer on 1st ring?',
  "@AmericanAir I can't even get on the phone with your reservations team. The system automatically disconnects us.",
  '@AmericanAir Hmm. Looks like you looked at my tweet from last month, not this one. Was able to get U.K. agent to help me, thanks.',
  '@americanair @bershawnjackson big UPS to Newark airport staff &amp;&amp; D.Dean. I was also treated lovely while I was in town. ❤❤'],
 array([1, 0, 0, 1, 1], dtype=int64))

In [4]:
# Count the tweets labelled 1; every remaining row is reported as negative.
pos = np.count_nonzero(labels==1)
print('Number of positive tweets: ', pos)
print('Number of negative tweets: ', len(labels) - pos)

Number of positive tweets:  25
Number of negative tweets:  25


In [5]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens from the tweet, with stock
            tickers, a leading "RT", hyperlinks, '#' signs, English stop
            words and punctuation tokens removed

    """
    stemmer = PorterStemmer()
    # a set makes the per-token stop word test below a constant-time lookup
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks; \S* stops at the first whitespace, so the words that
    # follow a URL are kept (a greedy `.*` would delete the rest of the line)
    tweet = re.sub(r'https?://\S*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        # `in string.punctuation` is a substring test against the punctuation string
        if word not in stopwords_english and word not in string.punctuation:
            tweets_clean.append(stemmer.stem(word))

    return tweets_clean

def count_tweets(result, tweets, ys):
    '''
    Tally how often each (word, label) pair occurs across a collection of tweets.

    Input:
        result: a dictionary of counts keyed by (word, label); it is updated in place
        tweets: a list of tweets
        ys: the sentiment labels aligned with tweets (either 0 or 1)
    Output:
        result: the same dictionary object, with the new counts added
    '''
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, label)
            # start from 0 the first time a pair is seen
            result[key] = result.get(key, 0) + 1

    return result

In [6]:
# Number of leading rows used for training; the remaining rows form the test set.
# The split is positional: rows are taken in file order, without shuffling.
TRAIN_SIZE = 40

train_x, test_x = tweets[:TRAIN_SIZE], tweets[TRAIN_SIZE:]
train_y, test_y = labels[:TRAIN_SIZE], labels[TRAIN_SIZE:]

In [7]:
# Build the (word, label) -> count dictionary from the training tweets only
freqs = count_tweets({}, train_x, train_y)

In [8]:
# NOTE(review): this prints every (word, label) pair in the dictionary;
# consider a bounded preview if the vocabulary grows.
print(freqs)

{('thank', 1): 10, ('quick', 1): 2, ('respons', 1): 3, ('aa', 1): 1, ('usual', 1): 1, ('fantast', 1): 1, ('custom', 1): 5, ('servic', 1): 5, ("that'", 1): 1, ('shock', 1): 1, ('wait', 0): 2, ('2', 0): 2, ('hr', 0): 2, ('cs', 0): 1, ('call', 0): 5, ('back', 0): 2, ('flt', 0): 1, ('cxld', 0): 1, ('protect', 0): 1, ('hang', 0): 1, ('minut', 0): 1, ('answer', 0): 2, ('1st', 0): 1, ('ring', 0): 1, ("can't", 0): 2, ('even', 0): 3, ('get', 0): 2, ('phone', 0): 3, ('reserv', 0): 2, ('team', 0): 1, ('system', 0): 1, ('automat', 0): 1, ('disconnect', 0): 1, ('us', 0): 2, ('hmm', 1): 1, ('look', 1): 2, ('like', 1): 1, ('tweet', 1): 1, ('last', 1): 1, ('month', 1): 1, ('one', 1): 1, ('abl', 1): 3, ('get', 1): 1, ('u', 1): 1, ('k', 1): 1, ('agent', 1): 1, ('help', 1): 3, ('big', 1): 2, ('up', 1): 1, ('newark', 1): 1, ('airport', 1): 1, ('staff', 1): 2, ('d.dean', 1): 1, ('also', 1): 1, ('treat', 1): 1, ('love', 1): 3, ('town', 1): 1, ('❤', 1): 2, ('fa', 1): 2, ('shawn', 1): 1, ('spectacular', 1): 1

### iii. Propose a model to classify a new post as a socially influenced post or not

### Naive Bayes Model

In [9]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit the log prior and the per-word log likelihood ratios of a binary Naive Bayes model.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read by this function; kept so existing calls keep working)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos / D_neg are the numbers of
            documents whose label is > 0 / <= 0
        loglikelihood: dictionary mapping each vocabulary word to
            log(P(word | positive) / P(word | negative)), with add-one smoothing
    '''
    # V: the number of unique words in the vocabulary (both classes together)
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # N_pos / N_neg: total number of word occurrences in each class
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        # a label greater than zero is positive, anything else is negative
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos / D_neg: number of positive / negative documents
    D_pos = sum(1 for y in train_y if y > 0)
    D_neg = sum(1 for y in train_y if y <= 0)

    # np.log(0) is -inf, so the prior is infinite (or NaN) when a class has no documents
    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # how often the word occurs in each class
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # add-one smoothing keeps both probabilities strictly positive
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

def lookup(freqs, word, label):
    '''
    Return how many times a word was counted under a given label.

    Input:
        freqs: a dictionary with the frequency of each (word, label) pair
        word: the word to look up
        label: the label corresponding to the word
    Output:
        the stored count for (word, label), or 0 when the pair is absent
    '''
    return freqs.get((word, label), 0)

In [10]:
# Fit the model on the training split.
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

# logprior is log(D_pos) - log(D_neg); len(loglikelihood) is the vocabulary size.
print(logprior)
print(len(loglikelihood))

0.0
294


### iv. Evaluate the performance of the proposed model

In [11]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with the trained Naive Bayes parameters.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        the logprior plus the log likelihood of every processed word of the
        tweet that is present in the dictionary; unknown words contribute nothing

    '''
    # keep only the contributions of words the model has seen
    contributions = [
        loglikelihood[token]
        for token in process_tweet(tweet)
        if token in loglikelihood
    ]

    # accumulate left to right: the prior first, then each word in tweet order
    score = 0
    for term in [logprior, *contributions]:
        score += term

    return score

def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.abs(y_hats - test_y))

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    return accuracy

In [12]:
# Accuracy on the held-out tweets (tweets[40:]), which were not used to build freqs.
print("Naive Bayes accuracy = %0.4f" % (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.8000


In [13]:
# Experiment with your own tweet.
# A score above 0 is classified as positive (label 1); otherwise negative (label 0).
my_tweet = 'Flight was delayed.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
# NOTE(review): the printed value is the score computed by the model, not a reference answer.
print('The expected output is', p)

The expected output is -3.046924453362179


In [14]:
# Second example: a score above 0 means the tweet is classified as positive.
my_tweet = 'I was happy about the service. Thank you :)'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

4.386348807505076
