In [1]:
# Run this every time you open the notebook
%load_ext autoreload
%autoreload 2
from collections import Counter
import lib
import random

# Load and Inspect the Data

In [2]:
# Load the data.
# lib.read_data() returns two lists of reviews: `reviews`, which the cells below
# use to train the classifier, and `test_reviews`, which they use to evaluate it.
reviews, test_reviews = lib.read_data()

# Train a Naive Bayes Classifier

In [3]:
# For each sentiment (positive first, then negative), compute from the training
# reviews the sentiment's prior probability and its token probabilities.
(prob_positive, token_probs_positive), (prob_negative, token_probs_negative) = (
    lib.calc_probs(reviews, sentiment) for sentiment in ("positive", "negative")
)

### Most discriminative words

In [4]:
# Print, for every sentiment c, the tokens with the highest P(c|token).
# Both lookup tables are keyed by sentiment label and are reused by later cells.
prior_probs = dict(positive=prob_positive, negative=prob_negative)
token_probs = dict(positive=token_probs_positive, negative=token_probs_negative)

lib.most_discriminative(reviews, token_probs, prior_probs)

MOST DISCRIMINATIVE TOKEN: 

TOKEN                P(positive|token)
hears                1.0000
sc                   1.0000
prefect              1.0000
antioxidents         1.0000
yummm                1.0000
mmmmmmm              1.0000
winters              1.0000
bil                  1.0000
dances               1.0000
jac                  1.0000

TOKEN                P(negative|token)
uneatable            0.9999
implicated           0.9999
refundable           0.9999
martek               0.9999
unhelpful            0.9998
notation             0.9998
poorest              0.9998
nonreturnable        0.9998
tampering            0.9998
readable             0.9998



# Test the Naive Bayes Classifier

### Prediction results

In [5]:
# Pair every test review with the sentiment the classifier predicts for it
predictions = [
    (test_review, lib.classify_nb(test_review, prior_probs, token_probs))
    for test_review in test_reviews
]

In [6]:
# Compare true labels and predicted labels in a table.
# The saved output lists each review's content next to its true and predicted sentiment.
lib.show_predictions(predictions)

Unnamed: 0,Content,True sentiment,Predicted sentiment
0,great book perfect condition arrived short amount time long expected delivery date,positive,positive
1,amazing little popularity diamond pet foods seems considering quality ingredients foods contain worked pet store four years number one recommendations foods natural balance wellness two exceptional foods pretty expensive owners multiple dogs even one large breed dog brands cripple budget would get customers absolutely loved pets wanted able feed healthy food without breaking banks recommended diamond diamond dog food costs commercial brands like purina iams contains natural healthy ingredients remember less garbage feed dog products fillers less dog uses bathroom stay fuller longer,positive,positive
2,author wrote wild things carol king wrote great song matches lyrics illustrations fabulous wish could buy hardbound larger tiny book easily misplaced year old carries everywhere,positive,positive
3,purchased medication wellbutrin completely stopped everything pretty drastically despite situation still experienced alot cramping greater part next day drinking tea bed something not think would happen chamomile etc tea suppose worked uncomfortable probably use situation free day home course would not recommend someone mild issues,negative,negative
4,vine offered conditioner looking replace brand fine fragile medium length hair dye permanent dye every two months never use blow dryer would dry hair much still problems breakage trying find conditioner need leaves hair soft smooth tangle free almost not comb washing neither thick thin greasy nor watery easy apply rub little bit rinse two minutes fruity scent rather strong pleasant fade little soon application not know reviewers say conditioner salon price city live conditioner drugstore price,positive,positive


### F1 score

In [7]:
# Get average F1 score for the test set.
# The saved output reports precision, recall and F1 for each sentiment, followed by
# an average F1 that equals the mean of the two per-sentiment F1 scores.
lib.evaluate(predictions)

positive
Precision:  90.01181884477079
Recall:  91.95180846691328
F1:  90.97147213927187

negative
Precision:  76.85260435906908
Recall:  72.36677334075414
F1:  74.5422623526461

Average F1:  82.75686724595899


In [8]:
# Get average F1 score for the TRAINING set.
# Compare with average F1 for test set above. What's the reason for the difference?

# Pair every training review with the sentiment the classifier predicts for it
trainset_predictions = [
    (train_review, lib.classify_nb(train_review, prior_probs, token_probs))
    for train_review in reviews
]
lib.evaluate(trainset_predictions)

positive
Precision:  99.84717269485482
Recall:  99.62745867622363
F1:  99.7371946820571

negative
Precision:  99.00392237819983
Recall:  99.58986605752258
F1:  99.29602981520783

Average F1:  99.51661224863247


### Confusion matrix

In [9]:
# Show the confusion matrix for the test-set predictions.
# NOTE(review): in the saved output, rows appear to be true sentiments and columns
# predicted sentiments (each row's diagonal share matches the recall reported by
# lib.evaluate) -- confirm against lib.show_confusion_matrix.
lib.show_confusion_matrix(predictions)

Unnamed: 0,positive,negative
positive,35795,3133
negative,3972,10402


### Visualizing individual reviews

In [10]:
# Pick one test review at random and pass it, with the trained probabilities,
# to lib.visualize_review.
# No random seed is set in this notebook, so each run may select a different review.
random_review = random.choice(list(test_reviews))
lib.visualize_review(random_review, prior_probs, token_probs)