# EECS 487 Project: Naive Bayes Classifier of Sentiment Analysis of Contraceptives

This file contains the ipynb of our project. In the folder, you can find our dataset, our README, and the .py file for our Naive Bayes code.

In [51]:
!pip install nltk --upgrade
import nltk
print(nltk.__version__)

3.8.1


In [52]:
%load_ext autoreload
%autoreload 2

import pickle
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

from naive_bayes import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading the dataset

In [70]:
from naive_bayes import *

all_data = load_reviews('reviews.csv')

(train, test) = train_test_split(all_data, train_size=0.7)

display(train)
display(test)

Unnamed: 0,ratings,reviews
12628,5,I have been on this pill for only a little ove...
2810,5,"Have taken other oral contraceptives before, b..."
13386,1,This medication was the cause of a bilateral p...
1928,5,I have been on Sprintec for the past 5 years a...
2496,5,I've been on birth control for seven years now...
...,...,...
3873,5,I had Paragard inserted after a bad experience...
6878,3,I took this drug for a year and now after read...
3633,5,I only tried this out at the recommendation of...
7626,3,I have been on this pill for 3 months and just...


Unnamed: 0,ratings,reviews
6578,2,Horrible! My OB switched me to this after I fi...
930,4,I have never had any children or been pregnant...
11839,1,I have been on this medication for 2 weeks and...
6545,4,I've been on birth control pills for more than...
7697,2,While on this medication I had frequent breakt...
...,...,...
8678,5,I've missed 4 pills in a row a few times and d...
501,5,I had my IUD inserted six weeks after my secon...
6248,5,I have been using birth control for 5 years an...
12294,3,Aubra was a bad first experience with birth-co...


## Dataset statistics
- Average number of tokens per review
- Standard deviation of the number of tokens per review
- Total number of negative (0) and positive (1) reviews

In [71]:
get_basic_stats(all_data)

Average number of tokens per review: 110.13079584775086
Standard deviation: 77.16307487574683
Number of negative/positive reviews: {0: 3537, 1: 10913}


{0: 3537, 1: 10913}

## Data processing and n-gram calculations

In [72]:
naive_bayes = NaiveBayes()
naive_bayes.fit(train)
print(f"Probability for each category: {naive_bayes.category_prob}")
print(f"Length of self.ngram_count: {len(naive_bayes.ngram_count)}")
print(f"Shape of the counts for 1st category: {naive_bayes.ngram_count[0].shape}")
print(f"Number of non-zero terms for 1st category: {(naive_bayes.ngram_count[0] > 0).sum()}")
print(f"Maximum count of the 1st category: {naive_bayes.ngram_count[0].max()}")
print(f"Minimum count of the 1st category: {naive_bayes.ngram_count[0].min()}")
print(f"Sum of ngram count for 1st category: {naive_bayes.ngram_count[0].sum()}")
print(f"Total count for each category: {naive_bayes.total_count}")

Probability for each category: [0.24597133 0.75402867]
Length of self.ngram_count: 2
Shape of the counts for 1st category: (46409,)
Number of non-zero terms for 1st category: 35453
Maximum count of the 1st category: 7019.0
Minimum count of the 1st category: 0.0
Sum of ngram count for 1st category: 367317.0
Total count for each category: [ 367317. 1192434.]


## Calculating posterior probability for a category

In [73]:
test_docs = ["I was in awful pain and had uncontrollable vomiting for hours, with severe cramping and headaches",
 "The insertion was not painful at all and I loved the convenience and ease of use"]
prob1 = naive_bayes.calculate_prob(test_docs, 0)
prob2 = naive_bayes.calculate_prob(test_docs, 1)
print(f"Probability for category 0: {prob1}")
print(f"Probability for category 1: {prob2}")

Probability for category 0: [-192.50996749 -226.49034744]
Probability for category 1: [-196.26388209 -214.61763601]


## Predicting labels for new reviews

In [74]:
preds = naive_bayes.predict(test_docs)
print(f"Prediction: {preds}")

Prediction: [0, 1]


## Calculating evaluation metrics

In [68]:
predictions = [1,1,0,1,0,0,1]
labels = [1,0,0,1,0,1,1]
accuracy, mac_f1, mic_f1 = evaluate(predictions, labels)
print(f"Accuracy: {accuracy}")
print(f"Macro f1: {mac_f1}")
print(f"Micro f1: {mic_f1}")

Accuracy: 0.7142857142857143
Macro f1: 0.7083333333333333
Micro f1: 0.7142857142857143


## Evaluating on test data

In [69]:
predictions = naive_bayes.predict(test['reviews'])
labels = test['ratings']
print(predictions)
def ratingConvert(number: int):
    return math.floor(number / 4)
labels = (labels.apply(ratingConvert)).tolist()
print(labels)
accuracy, mac_f1, mic_f1 = evaluate(predictions, labels)
print(f"Accuracy: {accuracy}")
print(f"Macro f1: {mac_f1}")
print(f"Micro f1: {mic_f1}")

[1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 