# Naïve Bayes algorithm:

1.  Create a frequencies dictionnary with:
     *  Key: (word, class)
     *  Value: the frequency with which that word is mapped to that class in the training set
2.  Count the number of positive and negative documents
3.  Get the vocabulary size
4.  Calculate the log prior
5.  Create a dictionary with the log likelihood of each word in the vocabulary
6.  Predict the class of a new document by adding the log prior and the log likelihood of each word from the document





# Data preprocessing

In [1]:
import numpy as np

import nltk
import string
import matplotlib.pyplot as plt
from nltk.corpus import sentence_polarity, stopwords
from nltk.stem import WordNetLemmatizer

# Download nltk resources
nltk.download("sentence_polarity")
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/sentence_polarity.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Import dataset
pos_ids = sentence_polarity.fileids('pos')
neg_ids = sentence_polarity.fileids('neg')
pos_sentences = sentence_polarity.sents(pos_ids)
neg_sentences = sentence_polarity.sents(neg_ids)
pos_sentences, len(pos_sentences), len(neg_sentences), 

([['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', "century's", 'new', '"', 'conan', '"', 'and', 'that', "he's", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'], ['the', 'gorgeously', 'elaborate', 'continuation', 'of', '"', 'the', 'lord', 'of', 'the', 'rings', '"', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co-writer/director', 'peter', "jackson's", 'expanded', 'vision', 'of', 'j', '.', 'r', '.', 'r', '.', "tolkien's", 'middle-earth', '.'], ...],
 5331,
 5331)

In [3]:
# Split train/test
train_pos, train_neg = pos_sentences[:4500], neg_sentences[:4500]
test_pos, test_neg = pos_sentences[4500:], neg_sentences[4500:]
train_sentences  = train_pos + train_neg
test_sentences = test_pos + test_neg
len(train_sentences), len(test_sentences)

(9000, 1662)

In [4]:
# Create sentiment labels
Y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
Y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)
Y_train.shape, Y_test.shape

((9000, 1), (1662, 1))

In [5]:
def process_sentence(sentence):  
    """
    Remove stopwords, punctuations and lemmatize

    :param sentence: Input sentence (String)
    :return: tokenized sentence (list)
    """ 
    stopwords_eng = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    clean_sentence = [lemmatizer.lemmatize(token) for token in sentence if token not in stopwords_eng and token not in string.punctuation]
    return clean_sentence

In [6]:
def create_word_freqs_dict(sentences, labels):
    """
    Create frequencies dictionary

    :param sentences: list of sentences
    :param labels: list of sentences' labels (0 or 1)
    :return: vocabulary frequencies dictionary
    """
    tok_sentences = [process_sentence(sentence) for sentence in sentences]
    word_freqs = {}
    for sentence, label in zip(tok_sentences, labels):
        for word in sentence:
            if not (word, label[0]) in word_freqs:
                word_freqs[(word, label[0])] = 0
            word_freqs[(word, label[0])] += 1

    sorted_word_freq = sorted(word_freqs.items(), key=lambda x:x[1], reverse=True)
    # return dictionary sorted by values 
    return dict(sorted_word_freq)

# Likelihood estimation with Laplace smoothing
---
$$ Likelihood = \frac{P(Pos)}{P(Neg)}\prod^m _{i=1} \frac{P(w_i|Pos)}{P(w_i|Neg)} $$

$$ P(w_i | class) = \frac{freq_{(w_i, class)} + 1}{N_{class} + V}  $$

---

*  $m$: number of words in the sequence
*  $N_{class} $: frequency of all words in a class
*  $V$: vocabulary size (number of unique words in the vocabulary)
*  Laplace smoothing: we add 1 to the numerator and V to the denominator to avoid multiplying by zero when we find a word that is not in our training vocabulary

We get the likelihood score for the sequence:
* if score > 1   =>   class 1 (positive)
* if score < 1   =>   class 0 (negative)
* if score = 1   =>   neutral

In [8]:
def get_conditional_probability(word, label, freqs_dict, n_class, vocab_size):
    """
    Compute the probability of a word given a class: P(word|class)

    :param word: string
    :param label: class label (0 or 1)
    :param freqs_dict: frequencies dictionary
    :param n_class: int class frequency
    :param vocab_size: int number of words in the vocabulary
    :return: conditional probability value (float)
    """
    return (freqs_dict.get((word, label), 0) + 1) / (n_class + vocab_size)

# Log likelihood

To avoid numerical flow issues with the likelihood product, we introduce the log:

---
$$ Loglikelihood = log\frac{P(Pos)}{P(Neg)} + \sum^m _{i=1} \log\frac{P(w_i|Pos)}{P(w_i|Neg)} $$

---
The first component of the equation is the log prior and represents the classes distribution accross the whole training set, that is, the ratio of positive/negative documents in the training set. For perfectly balanced datasets, this ratio will be 1 so its log will be 0 and we won't add anything to the log likelihood.

We get the log likelihood score for the sequence:
 * if score > 0   =>   class 1 (positive)
 * if score < 0   =>   class 0 (negative)
 * if score = 0   =>   neutral
    
Reminder: log properties:
   *  $log(xy) = log(x) + log(y)  $
   *  $log\frac{x}{y} = log(x) - log(y)$

In [9]:
def get_log_prior(labels):
    """
    Calculate te log prior

    :param labels: list of training labels
    :return: log prior value (float)
    """
    p_pos = sum(labels)
    p_neg = len(labels) - p_pos
    return np.log(p_pos) - np.log(p_neg)

In [27]:
def train(x_train, y_train):
    """
    Train the classifier

    :param x_train: list of training tokenized sentences (list of lists of strings)
    :param y_train: training labels
    :return: log prior value, log likelihood dictionary, frequencies dictionary
    """

    freqs_dict = create_word_freqs_dict(x_train, y_train)
    log_likelihood = {}

    # Get classes frequency
    num_pos = np.sum(np.array([freqs_dict[(word, label)] for (word, label) in freqs_dict.keys() if label == 1]))
    num_neg = np.sum(np.array([freqs_dict[(word, label)] for (word, label) in freqs_dict.keys() if label == 0]))

    # Get vocab size
    vocab = set([word for (word, _) in freqs_dict.keys()])
    vocab_size = len(vocab)

    # Get dataset log prior
    log_prior = get_log_prior(y_train)

    # Get log likelihood of each word
    for word in vocab:
        prob_pos = get_conditional_probability(word, 1, freqs_dict, num_pos, vocab_size)
        prob_neg = get_conditional_probability(word, 0, freqs_dict, num_neg, vocab_size)
        log_likelihood[word] = np.log(prob_pos) - np.log(prob_neg)

    return log_prior, log_likelihood, freqs_dict

In [35]:
log_prior, log_likelihood, freqs_dict = train(train_sentences, Y_train)
log_prior, len(log_likelihood), len(freqs_dict)

(array([0.]), 17842, 23730)

In [41]:
for pair, freq in list(freqs_dict.items())[:10]:
    print(pair, freq)

('film', 1.0) 771
('movie', 0.0) 728
('film', 0.0) 601
('movie', 1.0) 528
('like', 0.0) 377
('one', 0.0) 313
('one', 1.0) 307
('--', 0.0) 269
('--', 1.0) 267
('make', 1.0) 252


In [42]:
for word, log_lik in list(log_likelihood.items())[:10]:
    print(word, log_lik)

picture 0.03238838271919864
schweig 1.0871980487289168
scratching 0.3940508681689714
socio-political 0.6817329406207513
concludes 0.6817329406207513
sneak 0.3940508681689714
extensively -0.7045614204991395
fluidly 0.6817329406207513
elie -0.7045614204991395
first-timer -0.011414239939194104


In [48]:
log_likelihood["love"], log_likelihood["hate"], log_likelihood["terrible"], log_likelihood["amazing"]

(0.6288904592463718,
 -0.4969220557208942,
 -0.630453448345417,
 2.1286519235570776)

In [59]:
# 20 most positive words
log_lik_list = sorted(log_likelihood.items(), key=lambda x:x[1], reverse=True)
log_lik_list[:20]

[('riveting', 2.821799104117023),
 ('engrossing', 2.7294257839860077),
 ('lively', 2.6276430896760647),
 ('polished', 2.6276430896760647),
 ('vividly', 2.553535117522344),
 ('heartwarming', 2.553535117522344),
 ('wonderfully', 2.4734924098488076),
 ('challenging', 2.4734924098488076),
 ('resonant', 2.4734924098488076),
 ('frailty', 2.4734924098488076),
 ('nuance', 2.4734924098488076),
 ('startling', 2.3864810328591766),
 ('culture', 2.3864810328591766),
 ('richly', 2.3864810328591766),
 ('russian', 2.3864810328591766),
 ('tour', 2.3864810328591766),
 ('spare', 2.3864810328591766),
 ('detailed', 2.3864810328591766),
 ('jealousy', 2.3864810328591766),
 ('refreshing', 2.339961017224285)]

In [58]:
# 20 most negative words
log_lik_list[-20:]

[('pinocchio', -2.4093095127375648),
 ('tuxedo', -2.4093095127375648),
 ('unintentional', -2.4093095127375648),
 ('lousy', -2.4093095127375648),
 ('generic', -2.4093095127375648),
 ('offensive', -2.496320889727194),
 ('incoherent', -2.496320889727194),
 ('seagal', -2.496320889727194),
 ('boring', -2.496320889727195),
 ('pointless', -2.5763635974007304),
 ('tiresome', -2.5763635974007304),
 ('plodding', -2.5763635974007304),
 ('product', -2.5763635974007304),
 ('disguise', -2.5763635974007304),
 ('uninspired', -2.5763635974007304),
 ('bore', -2.7194644410414046),
 ('poorly', -2.844627583995411),
 ('badly', -2.9558532191056344),
 ('unfunny', -3.2695107779606767),
 ('flat', -3.5079218014056748)]

# Predict and evaluate

In [14]:
def predict(sentences, log_prior, log_likelihood):
    """
    Predict polarity labels for the input sequences (0=Negative, 1=Positive)

    :param sentences: list of tokenized sentences
    :param log_prior: log prior value
    :param log_likelihood: log likelihood dictionary
    :return: list of predicted polarity labels for the input sentences
    """
    sentences = [process_sentence(sentence) for sentence in sentences]
    predictions = []

    for sentence in sentences:
        pred = log_prior + np.sum(np.array([log_likelihood.get(word, 0) for word in sentence]))
        if pred > 0:
            predictions.append(1)
        else:
            predictions.append(0)

    return predictions

In [36]:
y_pred = predict(test_sentences, log_prior, log_likelihood)
y_pred[:5]

[1, 0, 1, 0, 1]

In [20]:
for sentence, label, pred_label in zip(test_sentences[:5], Y_test[:5], y_pred[:5]):
    print(sentence)
    print(f"Predicted label: {pred_label} ------- True label: {label[0]}")
    print()

['a', 'good', 'music', 'documentary', ',', 'probably', 'one', 'of', 'the', 'best', 'since', 'the', 'last', 'waltz', '.']
Predicted label: 1 ------- True label: 1.0

['if', 'the', 'plot', 'seems', 'a', 'bit', 'on', 'the', 'skinny', 'side', ',', "that's", 'because', 'panic', 'room', 'is', 'interested', 'in', 'nothing', 'more', 'than', 'sucking', 'you', 'in', 'and', 'making', 'you', 'sweat', '.']
Predicted label: 0 ------- True label: 1.0

['.', '.', '.', '[the', 'film]', 'works', ',', 'due', 'mostly', 'to', 'the', 'tongue-in-cheek', 'attitude', 'of', 'the', 'screenplay', '.']
Predicted label: 1 ------- True label: 1.0

['the', 'film', 'becomes', 'an', 'overwhelming', 'pleasure', ',', 'and', 'you', 'find', 'yourself', 'rooting', 'for', "gai's", 'character', 'to', 'avoid', 'the', 'fate', 'that', 'has', 'befallen', 'every', 'other', 'carmen', 'before', 'her', '.']
Predicted label: 0 ------- True label: 1.0

['broomfield', 'has', 'a', 'rather', 'unique', 'approach', 'to', 'documentary', '.',

In [21]:
# Accuracy = number of correct predicions / total number of predictions
def evaluate_accuracy(Y_gold, Y_pred):
    """
    Evaluate accuracy of the predictions

    :param Y_gold: actual labels. Numpy array of size (m, 1) (m=number of labels)
    :param Y_pred: predicted labels. Numpy array of size m
    :return: accuracy value (int)
    """
    return sum([1 for y_gold, y_pred in zip(Y_gold, Y_pred) if y_gold==y_pred]) / len(Y_pred)

In [22]:
evaluate_accuracy(Y_test, y_pred)

0.7617328519855595

In [50]:
# Visualize some test sentences and their classification
for sentence, label, pred_label in zip(test_sentences[826:836], Y_test[826:836], y_pred[826:836]):
    print(" ".join(sentence))
    print(f"Predicted label: {pred_label} ------- True label: {int(label[0])}")
    print()

both exuberantly romantic and serenely melancholy , what time is it there ? may prove to be [tsai's] masterpiece .
Predicted label: 1 ------- True label: 1

mazel tov to a film about a family's joyous life acting on the yiddish stage .
Predicted label: 1 ------- True label: 1

standing in the shadows of motown is the best kind of documentary , one that makes a depleted yesterday feel very much like a brand-new tomorrow .
Predicted label: 1 ------- True label: 1

it's nice to see piscopo again after all these years , and chaykin and headly are priceless .
Predicted label: 1 ------- True label: 1

provides a porthole into that noble , trembling incoherence that defines us all .
Predicted label: 1 ------- True label: 1

the whole mess boils down to a transparently hypocritical work that feels as though it's trying to set the women's liberation movement back 20 years .
Predicted label: 0 ------- True label: 0

' . . . the cast portrays their cartoon counterparts well . . . but quite frankl

# Useful functions

In [24]:
def get_ratio(word, freqs_dict):
    """
    Calculate ratio of positive/negative frequency of a word in the training set

    :param word: string
    :return: ratio (float) (>1: positive, <1: negetive)
    """
    return (freqs_dict.get((word, 1), 0) + 1) / (freqs_dict.get((word, 0), 0) + 1)

In [30]:
get_ratio("movie", freqs_dict), get_ratio("love", freqs_dict), get_ratio("terrible", freqs_dict)

(0.7256515775034293, 1.8970588235294117, 0.5384615384615384)

In [32]:
def get_words_by_threshold(label, threshold, freqs_dict):
    """
    Get vocabulary words that have a minimum level of positiveness/negativeness

    :param label: 1 for positive, 0 for negative
    :param threshold: that will be used as the cutoff for including a word in the returned dictionary
    :return: dictionary of filtered words (key) and their ratio (value)
    """
    filtered_words = {}
    for (word, _) in freqs_dict.keys():
        ratio = get_ratio(word, freqs_dict)
        if label == 1 and ratio >= threshold:
            filtered_words[word] = ratio
        elif label == 0 and ratio <= threshold:
            filtered_words[word] = ratio
    return filtered_words

In [33]:
# Get words with high positive ratio
get_words_by_threshold(1, 10, freqs_dict)

{'culture': 11.0,
 'engrossing': 15.5,
 'refreshing': 10.5,
 'absorbing': 10.0,
 'inventive': 10.0,
 'riveting': 17.0,
 'lively': 14.0,
 'polished': 14.0,
 'vividly': 13.0,
 'heartwarming': 13.0,
 'resonant': 12.0,
 'frailty': 12.0,
 'nuance': 12.0,
 'challenging': 12.0,
 'wonderfully': 12.0,
 'startling': 11.0,
 'detailed': 11.0,
 'russian': 11.0,
 'spare': 11.0,
 'jealousy': 11.0,
 'tour': 11.0,
 'richly': 11.0,
 'masterful': 10.0,
 'bourne': 10.0,
 'uncompromising': 10.0,
 'reminder': 10.0,
 'deft': 10.0,
 'superbly': 10.0}

In [34]:
# Get words with high negative ratio
get_words_by_threshold(0, 0.1, freqs_dict)

{'dull': 0.09836065573770492,
 'boring': 0.08333333333333333,
 'flat': 0.030303030303030304,
 'unfunny': 0.038461538461538464,
 'generic': 0.09090909090909091,
 'mediocre': 0.09523809523809523,
 'loud': 0.09523809523809523,
 'badly': 0.05263157894736842,
 'poorly': 0.058823529411764705,
 'bore': 0.06666666666666667,
 'product': 0.07692307692307693,
 'tiresome': 0.07692307692307693,
 'pointless': 0.07692307692307693,
 'plodding': 0.07692307692307693,
 'uninspired': 0.07692307692307693,
 'disguise': 0.07692307692307693,
 'incoherent': 0.08333333333333333,
 'seagal': 0.08333333333333333,
 'offensive': 0.08333333333333333,
 'lousy': 0.09090909090909091,
 'tuxedo': 0.09090909090909091,
 'unintentional': 0.09090909090909091,
 'pinocchio': 0.09090909090909091,
 'comparison': 0.1,
 'pile': 0.1,
 'ballistic': 0.1,
 'stiff': 0.1,
 'missed': 0.1,
 'leaden': 0.1,
 'plotting': 0.1,
 'lifeless': 0.1,
 'inane': 0.1,
 'soggy': 0.1}