# Project Overview

For this project we'll be evaluating a bunch of product reviews on Amazon


We will try to predict whether a review is considered negative or positive

# Importing Basic Libraries

In [1]:
# Core analysis stack, imported together up front.
# Further libraries (nltk, scikit-learn) are imported later, in the cells that first need them.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply matplotlib's 'fivethirtyeight' style sheet to any figures drawn in this notebook.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

# Importing the Dataset

In [2]:
# The reviews live in amazonreviews.tsv; "tsv" stands for "tab-separated values",
# so the reader has to be told to split fields on tabs rather than commas.
# The resulting DataFrame is named "amazon".

amazon = pd.read_csv('amazonreviews.tsv', sep='\t')

In [3]:
# Preview the first five rows: each row holds a sentiment label and the review text

amazon.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [4]:
# .shape is (rows, columns): 10,000 Amazon reviews, each described by two columns

amazon.shape

(10000, 2)

In [5]:
# Count the missing entries in each column -- every count is zero, so nothing needs imputing

amazon.isna().sum()

label     0
review    0
dtype: int64

In [6]:
# The two classes are nearly balanced: 5,097 negative vs. 4,903 positive reviews

amazon['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

# Run First Review Through VADER

In [7]:
import nltk

# SentimentIntensityAnalyzer needs the VADER lexicon on disk. Fetch it only when it is
# missing, so the notebook survives Restart & Run All on a fresh environment without
# re-downloading the lexicon on every run.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One analyzer instance, reused for every review scored in the cells below
sid = SentimentIntensityAnalyzer()

In [9]:
# VADER's breakdown for the first review: mostly neutral (0.669), some positive (0.243)
# and very little negative (0.088). The compound score of 0.9454 sits near the top of
# its -1 to +1 range, i.e. the review reads as strongly positive overall.

sid.polarity_scores(amazon.loc[0, 'review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [10]:
# The ground-truth label of the first review is positive, which is what a high
# compound polarity score should correspond to

amazon.loc[0, 'label']

'pos'

# Adding Scores and Labels to the DataFrame

In this next section we'll add columns to the original DataFrame to store polarity_score dictionaries, extracted compound scores, and new "pos/neg" labels derived from the compound score. We'll use this last column to perform an accuracy test.

In [11]:
# Score every review with VADER; each entry of the new "scores" column is the full
# polarity dictionary (neg / neu / pos / compound) for that review
amazon['scores'] = amazon['review'].apply(sid.polarity_scores)

amazon.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [12]:
# Pull just the compound value out of each polarity dictionary into its own
# "compound" column
amazon['compound'] = amazon['scores'].map(lambda polarity: polarity['compound'])

amazon.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [13]:
# Turn the compound score into a predicted label: 'pos' when compound >= 0, 'neg' otherwise.
# Note that a compound score of exactly 0 is therefore counted as 'pos', not 'neg'.
# np.where evaluates the whole column at once instead of calling a lambda per row.
amazon['comp_score'] = np.where(amazon['compound'] >= 0, 'pos', 'neg')

# comp_score can now be compared against the actual label of each review
amazon.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


# Report on Accuracy

In [14]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [15]:
# Share of reviews whose VADER-derived label matches the true label: roughly 71%

accuracy_score(y_true=amazon['label'], y_pred=amazon['comp_score'])

0.7097

In [16]:
# Per-class precision / recall / F1 of the VADER labels against the true labels.
# Negative reviews are the weak spot: recall is 0.52 for 'neg' versus 0.91 for 'pos'.
print(classification_report(y_true=amazon['label'], y_pred=amazon['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [17]:
# Rows are the true labels and columns the predicted labels, both in sorted order
# ('neg', 'pos'); the top-right cell counts negative reviews that VADER labelled positive.
print(confusion_matrix(y_true=amazon['label'], y_pred=amazon['comp_score']))

[[2629 2468]
 [ 435 4468]]
