# Exercise 4.3: Naïve Bayes Classification
Kevin King (Kevin.M.King.24@dartmouth.edu)<br>
Dartmouth College, LING48, Spring 2023

My Implementation (`runNBTest` function below):
* Read positive and negative reviews from specified files.
* Tokenize each review and extract bag-of-words features (every unigram plus the top-scoring bigrams).
* Split the data into training and testing sets based on a given cutoff.
* Train a Naïve Bayes classifier on the training set.
* Evaluate the classifier's performance on the testing set:
    * Calculate accuracy, precision, recall, and F-measure for positive and negative classes.
    * Print the evaluation metrics.
* Show the most informative features of the trained classifier.

In [1]:
# Import libraries
import itertools
import collections
from nltk import word_tokenize
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.metrics.scores import precision, recall, f_measure
from nltk.collocations import BigramCollocationFinder
import gdown

In [2]:
# Download the 'punkt' tokenizer models for NLTK; `word_tokenize`
# (used when reading the reviews) relies on them.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/kevin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download files
url = "https://drive.google.com/uc?id=1fDzyiMACkdQl9gRwRQrL0jxI35o9K8-H"
output = 'hw5-nb-files.zip'
gdown.download(url, output, quiet=False)
!unzip -j $output

Downloading...
From: https://drive.google.com/uc?id=1fDzyiMACkdQl9gRwRQrL0jxI35o9K8-H
To: /Users/kevin/Desktop/Dartmouth/2022-23/23S/CS72/HW4/templates-hw4/hw5-nb-files.zip
100%|████████████████████████████████████████| 424k/424k [00:00<00:00, 8.88MB/s]

Archive:  hw5-nb-files.zip
replace google-pos.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 




^C


#### Function: `bigram_word_feats` 

In [11]:
# Function to construct a bag of words with both unigrams and bigrams
# https://streamhacker.com/2010/05/24/
# text-classification-sentiment-analysis-stopwords-collocations/
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Build presence features from every unigram plus the best bigrams.

    Args:
        words: Iterable of tokens belonging to one document.
        score_fn: Bigram association measure used to rank candidate
            bigrams (chi-squared by default).
        n: Maximum number of top-ranked bigrams to keep.

    Returns:
        A dict mapping each feature to ``True``. Unigram features are
        1-tuples ``(word,)`` and bigram features are the tuples returned by
        the collocation finder, so both kinds share one key space.
    """
    # The tokens are traversed twice (collocation finder + unigram features);
    # materialise them so a one-shot iterator does not lose its unigrams.
    words = list(words)

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)

    # Wrap each token in a 1-tuple so unigram keys look like n-gram keys.
    unigrams = ((w,) for w in words)

    return {ngram: True for ngram in itertools.chain(unigrams, bigrams)}

#### Function: `runNBTest` 

In [20]:
def _read_review_lines(filename):
    """Return the lines of *filename* as a list, closing the file afterwards."""
    with open(filename, "r") as review_file:
        return review_file.readlines()


def _label_reviews(reviews, label):
    """Tokenize each review and pair its feature dict with *label*."""
    return [(bigram_word_feats(word_tokenize(text)), label) for text in reviews]


def runNBTest(filenamePos, filenameNeg, cutoff, numFeats):
    """Train and evaluate a Naive Bayes sentiment classifier on two review files.

    Each line of the input files is treated as one review. The first
    ``cutoff`` fraction of each class is used for training and the rest for
    testing. Accuracy and per-class precision, recall and F-measure are
    printed, followed by the classifier's most informative features.

    Args:
        filenamePos: Path to the file of positive reviews, one per line.
        filenameNeg: Path to the file of negative reviews, one per line.
        cutoff: Fraction of each class placed in the training set.
        numFeats: Number of most informative features to display.

    Returns:
        None. All results are written to standard output.
    """
    # Read both files up front (and close them) before any feature extraction.
    posReviewsText = _read_review_lines(filenamePos)
    negReviewsText = _read_review_lines(filenameNeg)

    # Bag-of-words (unigram + bigram) features, labelled by class.
    posfeats = _label_reviews(posReviewsText, 'pos')
    negfeats = _label_reviews(negReviewsText, 'neg')

    # Number of elements of each class that go into the training set.
    negcutoff = int(len(negfeats) * cutoff)
    poscutoff = int(len(posfeats) * cutoff)

    # Split per class so both sets keep the original class proportions.
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on ' + str(len(trainfeats)) + ' instances, test on ' + str(len(testfeats)) + ' instances')

    # Fit the classifier on the training features.
    classifier = NaiveBayesClassifier.train(trainfeats)

    # label -> set of test-item indices, once for the gold labels and once
    # for the model's predictions; the set-based NLTK metrics consume these.
    goldLabels = collections.defaultdict(set)
    predictedLabels = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        goldLabels[label].add(i)
        predictedLabels[classifier.classify(feats)].add(i)

    # Per-class metrics: the first argument is the reference (gold) set and
    # the second is the predicted set.
    posPrecision = precision(goldLabels['pos'], predictedLabels['pos'])
    posRecall = recall(goldLabels['pos'], predictedLabels['pos'])
    negPrecision = precision(goldLabels['neg'], predictedLabels['neg'])
    negRecall = recall(goldLabels['neg'], predictedLabels['neg'])
    negF = f_measure(goldLabels['neg'], predictedLabels['neg'])
    posF = f_measure(goldLabels['pos'], predictedLabels['pos'])

    # Print the accuracy, precisions, recalls, and F-values.
    print('accuracy:      ' + str(nltk.classify.util.accuracy(classifier, testfeats)))
    print('pos precision: ' + str(posPrecision))
    print('pos recall:    ' + str(posRecall))
    print('neg precision: ' + str(negPrecision))
    print('neg recall:    ' + str(negRecall))
    print('neg F-measure: ' + str(negF))
    print('pos F-measure: ' + str(posF))

    # Print the most informative features.
    classifier.show_most_informative_features(n=numFeats)

In [23]:
# Run the same experiment on each review corpus: 80% of every class is used
# for training and the 25 most informative features are shown.
experiments = (
    ("=== AMAZON ===", "amazon-pos.txt", "amazon-neg.txt"),
    ("=== GOOGLE ===", "google-pos.txt", "google-neg.txt"),
)

for position, (banner, pos_path, neg_path) in enumerate(experiments):
    if position > 0:
        # Visual gap between consecutive reports.
        print("\n")
    print(banner)
    runNBTest(pos_path, neg_path, 0.8, 25)

=== AMAZON ===
train on 800 instances, test on 200 instances
accuracy:      0.89
pos precision: 0.90625
pos recall:    0.87
neg precision: 0.875
neg recall:    0.91
neg F-measure: 0.892156862745098
pos F-measure: 0.8877551020408163
Most Informative Features
              ('Great',) = True              pos : neg    =     40.3 : 1.0
               ('nice',) = True              pos : neg    =     13.0 : 1.0
              ('smart',) = True              pos : neg    =     12.3 : 1.0
         ('people', ',') = True              pos : neg    =     11.7 : 1.0
              ('learn',) = True              pos : neg    =     11.0 : 1.0
      ('opportunities',) = True              pos : neg    =      9.8 : 1.0
           ('benefits',) = True              pos : neg    =      9.7 : 1.0
         ('to', 'learn') = True              pos : neg    =      9.0 : 1.0
            ('balance',) = True              neg : pos    =      8.8 : 1.0
                ('Not',) = True              neg : pos    =      7.