# **SENTIMENT ANALYSIS USING NAIVE BAYES**

In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
# Fetch the NLTK stop-word corpus; Clean() below reads it through stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paart\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **Creating Dataset**


In [3]:
# Toy training corpus as (sentence, sentiment) pairs: the three positive
# examples come first, followed by the three negative ones.
data = [
    (text, label)
    for label, texts in (
        ("positive", (
            "I love this movie",
            "This is a great product",
            "Amazing experience, very happy",
        )),
        ("negative", (
            "I hate this movie",
            "This is a terrible product",
            "Awful experience, very disappointed",
        )),
    )
    for text in texts
]

### **Cleaning Dataset**

In [4]:
def Clean(S):
    """Normalise a sentence into a space-separated string of word stems.

    The text is lowercased, every character other than a-z and whitespace is
    dropped, English stop words (except "not") are removed, and each remaining
    token is reduced to its Porter stem.

    Parameters
    ----------
    S : str
        Raw sentence.

    Returns
    -------
    str
        The surviving stems joined by single spaces; empty when nothing survives.
    """
    # Lowercase first so the a-z filter also keeps originally upper-case letters.
    letters_only = re.sub(r"[^a-z\s]", "", S.lower())

    # "not" is kept because it carries sentiment information.
    ignored = set(stopwords.words("english")) - {"not"}
    stemmer = PorterStemmer()                 # e.g. fishing -> fish

    stems = []
    for token in letters_only.split():
        if token in ignored:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

In [5]:
# Run every raw sentence through Clean(), keeping its sentiment label next to it.
cleaned_data = [(Clean(pair[0]), pair[1]) for pair in data]

# Vocabulary: every distinct token of the cleaned sentences, in sorted order.
words = sorted({token for text, _ in cleaned_data for token in text.split(' ')})

# cleaned_data -> [(cleaned sentence, sentiment), ...], words -> [token, ...]

### **Creating Bag of Words Table**

In [7]:
def find(w, s):
    """Return the index of the first element of `s` equal to `w`, or -1 if none matches.

    Parameters
    ----------
    w : object
        Value to look for.
    s : sequence
        Indexable sequence with a length; scanned from index 0 upwards.

    Returns
    -------
    int
        Position of the first match, or -1 when `w` does not occur in `s`.
    """
    return next((pos for pos in range(len(s)) if w == s[pos]), -1)

In [8]:
# Bag-of-words table: one row per sentence, one column per vocabulary word,
# plus a final column holding the sentiment label.
n_rows, n_cols = len(data), len(words) + 1
table = [[0] * n_cols for _ in range(n_rows)]
word_count = [0] * (n_cols - 1)               # occurrences of each word over all sentences

for i, row in enumerate(table):
    sentence, sentiment = cleaned_data[i]

    # Label column: 1 for 'positive', 0 for anything else.
    row[-1] = 1 if sentiment == 'positive' else 0

    for w in sentence.split(' '):
        idx = find(w, words)
        row[idx] = 1                          # presence flag, not a count
        word_count[idx] += 1
table

[[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1],
 [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
 [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

### **Naive Bayes**

In [9]:
# Split the cleaned corpus by label rather than by position, so this cell still
# works when the examples are reordered or the two classes differ in size.
# Anything not labelled 'positive' is treated as negative, matching the label
# column of the bag-of-words table.
p_cleaned = [pair for pair in cleaned_data if pair[1] == 'positive']
n_cleaned = [pair for pair in cleaned_data if pair[1] != 'positive']

# Class priors P(C), estimated from the class frequencies.
# An empty corpus falls back to the uninformative 0.5 / 0.5 prior.
if cleaned_data:
    prob_pos = len(p_cleaned) / len(cleaned_data)
    prob_neg = len(n_cleaned) / len(cleaned_data)
else:
    prob_pos, prob_neg = 0.5, 0.5

# Per-class occurrence counts, aligned with `words`.
p_word_count = [0 for _ in range(len(words))]
n_word_count = [0 for _ in range(len(words))]

# Each class is counted in its own loop; a shared index would raise IndexError
# or silently skip sentences whenever the classes are unbalanced.
for sentence, _ in p_cleaned:
    for w in sentence.split():
        idx = find(w, words)
        p_word_count[idx] += 1

for sentence, _ in n_cleaned:
    for w in sentence.split():
        idx = find(w, words)
        n_word_count[idx] += 1

**NOTE:**
Two problems arise when the plain Bayes probability calculation is used:
- There is no probability estimate for unknown words (words that never appeared in the training data).
- A mixed-sentiment sentence can contain one word with `p_word_count[i] = 0` and another with `n_word_count[j] = 0`, which drives both products to zero, i.e. $positive = 0$ and $negative = 0$.

To solve this, 

Let $V$ be the vocabulary size (the number of distinct known words) and $T_C$ the total number of words in the training sentences of class $C$.

$$
P(Unknown \_ Word \mid C) = \frac{0 + \epsilon}{T_C + \epsilon V}
$$

Furthermore, Laplace (additive) smoothing is applied to known words so that every probability is non-zero, where $\epsilon$ is any positive constant:

$$
P(w_i \mid C) = \frac{count(w_i, C) + \epsilon}{T_C + \epsilon  V}
$$

In [17]:
test_sentence = "This movie is terrible ! I love it !"
test_sentence = Clean(test_sentence)

# Additive smoothing: P(w | C) = (count(w, C) + eps) / (T_C + eps * V)
EPSILON = 0.1                       # smoothing constant; any positive value works
vocab_size = len(words)             # V: number of distinct known words
p_total = sum(p_word_count)         # T_C for the positive class
n_total = sum(n_word_count)         # T_C for the negative class

# Start from the class priors and multiply in one likelihood per token.
positive, negative = prob_pos, prob_neg

# split() without an argument yields no tokens for an empty cleaned sentence,
# whereas split(' ') would yield a spurious '' token.
for w in test_sentence.split():
    idx = find(w, words)

    # An unknown word has a count of 0 in both classes and goes through the
    # same smoothed formula as a known word.
    p_count = p_word_count[idx] if idx != -1 else 0
    n_count = n_word_count[idx] if idx != -1 else 0

    positive = positive * (p_count + EPSILON) / (p_total + EPSILON * vocab_size)
    negative = negative * (n_count + EPSILON) / (n_total + EPSILON * vocab_size)

In [18]:
# Normalise the two class scores so that they sum to 100 %.
total = positive + negative
print(f"The probability of a positive sentiment is {(positive/total)*100}%")
print(f"The probability of a negative sentiment is {(negative/total)*100}%")

The probability of a positive sentiment is 50.0%
The probability of a positive sentiment is 49.999999999999986%
