In [1]:
import os
import re
import math
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score



In [2]:


def negate(text):
    negation_words = ["n't", 'not', 'no', 'never']

    pattern = r'(\b(?:' + '|'.join(negation_words) + r')\b)(.*?)((?:[.,!?])|$)'

    def replace(match):
        negation_word, words, punctuation = match.groups()

        negated_words = ' NOT_'.join(words.split())
        return negation_word + ' NOT_' + negated_words + punctuation

    result = re.sub(pattern, replace, text)
    return result



In [3]:


def preprocess_text(text, wo_stop, binarization = False, negation = False):
    text = text.lower()
    if negation:
        text = negate(text)
    text = re.sub(r'[^a-zA-Z_\s]', '', text)
    words = text.split()
    stop_words = set(stopwords.words('english'))
    if wo_stop:
        words = [word for word in words if word not in stop_words]
    if binarization:
        words = list(set(words))
    return words


In [4]:


# training
def train_naive_bayes(training_folder, wo_stop, binarization = False, negation = False):
    logprior = defaultdict(float)
    count = defaultdict(lambda: defaultdict(float))
    loglikelihood = defaultdict(lambda: defaultdict(float))
    vocab = set()
    class_counts = {'pos': 0, 'neg': 0}
    total_docs = 0

    for class_label in ['pos', 'neg']:
        class_folder = training_folder
        for filename in os.listdir(class_folder):
            if not filename.startswith(class_label):
                continue
            with open(os.path.join(class_folder, filename), 'r') as file:
                text = file.read()
            words = preprocess_text(text, wo_stop, binarization, negation)
            total_docs += 1
            class_counts[class_label] += 1

            for word in words:
                vocab.add(word)
                count[word][class_label] += 1

    # print(class_counts)
    for class_label in class_counts:
        logprior[class_label] = math.log( class_counts[class_label] / total_docs)
        class_total_words = sum(count[word][class_label] for word in vocab)
        for word in vocab:
            loglikelihood[word][class_label] = math.log(
                (count[word][class_label] + 1) / (class_total_words + len(vocab)))

        # print(logprior[class_label])
    return logprior, loglikelihood, vocab


In [5]:

#testing 
def test_naive_bayes(test_folder, logprior, loglikelihood, vocab, wo_stop, binarization = False, negation = False):
    predictions = []

    c = 0
    w = 0
    for filename in os.listdir(test_folder):
        with open(os.path.join(test_folder, filename), 'r') as file:
            text = file.read()
        words = preprocess_text(text, wo_stop, binarization, negation)

        sumc = {'pos': 0, 'neg': 0}

        for class_label in ['pos', 'neg']:
            sumc[class_label] = logprior[class_label]
            for word in words:
                if word in vocab:
                    sumc[class_label] += loglikelihood[word][class_label]

        # 
        # 
        # (sumc)
        predicted_class = max(sumc, key=sumc.get)

        if (filename[0:3] == predicted_class):
            c += 1
        else:
            w += 1
            # if(len(text) < 2500):
                #  print(text, predicted_class)
        predictions.append(predicted_class)
    # print(c, w, c/(c+w))
    return predictions



In [6]:

training_data = 'train'
testing_data = 'test'


true_labels = []

for filename in os.listdir(testing_data):
    true_labels.append(filename[:3])

In [7]:

configurations = [
    {'wo_stop': False, 'binarization': False, 'negation': False}, # Bag of words method using word frequency as 1 
    {'wo_stop': False, 'binarization': True, 'negation': False}, # Bag of words method using word frequency as 1 (binarization)
    {'wo_stop': True, 'binarization': False, 'negation': False}, # Content word frequencies (ignoring function words)
    {'wo_stop': True, 'binarization': True, 'negation': False}, # Content word frequencies of 1 per word (ignoring function words after binarization)
    {'wo_stop': False, 'binarization': False, 'negation': True}, # Bag of words method using word frequencies after applying the negation feature
    {'wo_stop': False, 'binarization': True, 'negation': True} # Bag of words method using word frequency as 1 after applying the negation feature
]

results_table = []

for config in configurations:
    logprior, loglikelihood, vocab = train_naive_bayes(training_data, **config)
    predictions = test_naive_bayes(testing_data, logprior, loglikelihood, vocab, **config)
    
    cm = confusion_matrix(true_labels, predictions, labels=['pos', 'neg'])
    
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='macro')
    recall = recall_score(true_labels, predictions, average='macro')
    f1 = f1_score(true_labels, predictions, average='macro')
    
    results_table.append({
        'Configuration': config,
        'Confusion Matrix': cm,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

for result in results_table:
    print("Configuration:", result['Configuration'])
    print("Confusion Matrix:")
    print(result['Confusion Matrix'])
    print(" Accuracy:", result['Accuracy'])
    print(" Precision:", result['Precision'])
    print(" Recall:", result['Recall'])
    print(" F1 Score:", result['F1 Score'])
    print()


Configuration: {'wo_stop': False, 'binarization': False, 'negation': False}
Confusion Matrix:
[[127  33]
 [ 35 125]]
 Accuracy: 0.7875
 Precision: 0.7875449288951399
 Recall: 0.7875
 F1 Score: 0.7874916988944881

Configuration: {'wo_stop': False, 'binarization': True, 'negation': False}
Confusion Matrix:
[[124  36]
 [ 29 131]]
 Accuracy: 0.796875
 Precision: 0.79744432703221
 Recall: 0.796875
 F1 Score: 0.7967777549804105

Configuration: {'wo_stop': True, 'binarization': False, 'negation': False}
Confusion Matrix:
[[125  35]
 [ 35 125]]
 Accuracy: 0.78125
 Precision: 0.78125
 Recall: 0.78125
 F1 Score: 0.78125

Configuration: {'wo_stop': True, 'binarization': True, 'negation': False}
Confusion Matrix:
[[127  33]
 [ 29 131]]
 Accuracy: 0.80625
 Precision: 0.8064415259537211
 Recall: 0.8062499999999999
 F1 Score: 0.8062197218315362

Configuration: {'wo_stop': False, 'binarization': False, 'negation': True}
Confusion Matrix:
[[127  33]
 [ 38 122]]
 Accuracy: 0.778125
 Precision: 0.7783968