In [72]:
import random
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score

import psalm_preprocessor as prp
import psalm_scraper as ps

In [2]:
# First and last psalm numbers handed to the scraper.
first = 1
last = 150

# Get the corpus. The scraper module is imported as `ps`, so it has to be called
# through that alias; the bare name `psalm_scraper` is not bound in this notebook
# and raised a NameError on a fresh kernel.
ps_dict = ps.create_ps_dict(first_psalm = first, last_psalm = last, verbose = False)

In [27]:
# Psalm texts as an array, in the iteration order of ps_dict.
psalms = np.array([*ps_dict.values()])

# Sentiment labels from the 'positive' column of the annotation file.
# NOTE(review): assumes the CSV rows are in the same order as ps_dict — confirm.
outcomes_df = pd.read_csv('psalms_sentiment.csv')
outcomes = outcomes_df['positive'].to_numpy(copy=True)

In [42]:
## define training set
# Seed Python's RNG so the same split is drawn on every run.
random.seed(42)

# Index every psalm that was actually loaded rather than hard-coding 150, so the
# split stays valid if the corpus bounds are changed.
my_list = list(range(len(psalms)))
sample_size = int(len(my_list) * 0.8)

# Get the random sample (drawn without replacement): 80% of the indices for training
training_idx = random.sample(my_list, k=sample_size)

# The remaining indices form the test set, kept in ascending order. A set gives
# constant-time membership checks instead of scanning the training list each time.
training_idx_set = set(training_idx)
test_idx = [x for x in my_list if x not in training_idx_set]

In [36]:
# Training partition: texts and labels picked with the same index list
ps_train, y_train = psalms[training_idx], outcomes[training_idx]

# Held-out partition used only for evaluation
ps_test, y_test = psalms[test_idx], outcomes[test_idx]

In [30]:
# Build the frequency table from the training split only. Later cells index it with
# (word, label) tuples and sum its values as counts.
freqs = prp.build_freqs(ps_train, y_train)

In [53]:
# Label tallies over the training split: sum of the labels, and sum of (1 - label).
# NOTE(review): these are document counts only if the labels are strictly 0/1 — confirm.
d_pos = y_train.sum()
d_neg = (1 - y_train).sum()

In [54]:
# Log prior: natural log of the ratio d_pos / d_neg computed from the training labels.
log_prior = np.log(d_pos/d_neg)
# Bare expression so the notebook displays the value.
log_prior

0.4054651081081644

In [60]:
# Distinct words in the frequency table. The keys of freqs are (word, label) pairs, so
# a word seen under both labels occurs in two keys; dict.fromkeys drops the repeat
# while keeping a deterministic first-seen order.
vocab = list(dict.fromkeys(pair[0] for pair in freqs.keys()))

# Vocabulary size, used in the smoothing denominators of the word likelihoods.
V = len(vocab)

In [61]:
# Total word counts per class. Each key of freqs is a (word, label) pair; entries whose
# label is greater than 0 add to the positive total, all others to the negative total.
N_pos = N_neg = 0
for key, count in freqs.items():
    if not key[1] > 0:
        N_neg += count
        continue
    N_pos += count


In [62]:
# The smoothing denominators are the same for every word, so compute them once.
denom_pos = N_pos + V
denom_neg = N_neg + V

loglikelihood = {}
for word in vocab:
    # count of the word under each label; 0 when that (word, label) pair was never seen
    freq_pos = freqs.get((word, 1.0), 0)
    freq_neg = freqs.get((word, 0.0), 0)

    # add-one smoothed probability of the word under each label
    p_w_pos = (freq_pos + 1) / denom_pos
    p_w_neg = (freq_neg + 1) / denom_neg

    # log of the likelihood ratio; positive values favour the positive label
    loglikelihood[word] = np.log(p_w_pos/p_w_neg)

In [63]:
# Sanity check: number of words that received a log-likelihood entry.
len(loglikelihood)

2003

In [73]:
def naive_bayes_predict(psalm, logprior, loglikelihood):
    '''
    Score one psalm with the Naive Bayes model.

    Input:
        psalm: the psalm text, passed to prp.process_psalm to obtain its words
        logprior: a number, the log prior added once to the score
        loglikelihood: a dictionary mapping words to their log-likelihood values
    Output:
        the logprior plus the log-likelihood of every processed word that is present
        in the dictionary; words missing from the dictionary contribute nothing

    '''
    # start from the prior (0 + logprior keeps the caller's value untouched)
    score = 0 + logprior

    # each occurrence of a known word adds its log-likelihood; unknown words are skipped
    for token in prp.process_psalm(psalm):
        if token not in loglikelihood:
            continue
        score += loglikelihood[token]

    return score

In [78]:
# Classify every held-out psalm: a score above 0 is labelled 1, anything else 0.
# The loop variable is called `psalm` rather than `ps` so that it does not overwrite
# the `ps` alias of the scraper module bound in the import cell.
nb_pred = []
for psalm in ps_test:
    score = naive_bayes_predict(psalm, log_prior, loglikelihood)
    nb_pred.append(1 if score > 0 else 0)

In [79]:
# Display the predicted 0/1 labels for the test psalms.
nb_pred

[1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [80]:
# Evaluate the predictions on the held-out psalms. The sklearn.metrics functions are
# imported in the notebook's import cell together with every other dependency.
# Note that nb_pred holds hard 0/1 labels, so the ROC AUC below is computed from those
# labels rather than from the underlying scores.
print(f'Overall accuracy: {accuracy_score(y_test, nb_pred)}')
print(f'F1 score: {f1_score(y_test, nb_pred)}')
print(f'Recall on positive psalms score: {recall_score(y_test, nb_pred)}')
print(f'ROC AUC score: {roc_auc_score(y_test, nb_pred)}')

Overall accuracy: 0.7333333333333333
F1 score: 0.8
Recall on positive psalms score: 0.8421052631578947
ROC AUC score: 0.69377990430622
