In [1]:
import pandas as pd
import numpy as np
import string
import re

import psalm_scraper
import psalm_preprocessor as prp

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer        # module for stemming

In [2]:
first = 1
last = 150

#get corpus
ps_dict = psalm_scraper.create_ps_dict(first_psalm = first, last_psalm = last, verbose = False)

In [3]:
#build frequencies
psalms = list(ps_dict.values())
outcomes_df = pd.read_csv('psalms_sentiment.csv')
outcomes = np.array(outcomes_df['positive'])

In [4]:
freqs = prp.build_freqs(psalms, outcomes)

In [5]:
def extract_features(psalm, freqs, process_psalm=prp.process_psalm):
    '''
    Input: 
        tweet: a string containing one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_psalm(psalm)
    
    # 3 elements for [bias, positive, negative] counts
    x = np.zeros(2) 
        
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0] += freqs[(word,1)] if (word,1) in freqs else 0
        
        # increment the word count for the negative label 0
        x[1] += freqs[(word,0)] if (word,0) in freqs else 0
            
    x = x[None, :]  # adding batch dimension for further processing
    assert(x.shape == (1, 2))
    return x

In [6]:
X = np.zeros((len(psalms), 2))
for p in range(len(psalms)):
    X[p, :] = extract_features(psalms[p], freqs)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, outcomes, test_size=0.2)

In [14]:
psalm_classifier = LogisticRegression().fit(X_train, y_train)

In [15]:
predictions = psalm_classifier.predict(X_test)

In [16]:
predictions

array([1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score

In [18]:
print(f'Overall accuracy: {accuracy_score(y_test, predictions)}')
print(f'F1 score: {f1_score(y_test, predictions)}')
print(f'Recall on positive psalms score: {recall_score(y_test, predictions)}')
print(f'ROC AUC score: {roc_auc_score(y_test, predictions)}')

Overall accuracy: 0.8333333333333334
F1 score: 0.8571428571428571
Recall on positive psalms score: 0.8823529411764706
ROC AUC score: 0.8257918552036198
