### CAPSTONE PROJECT: NLP APPLICATIONS
#### TITLE: ANALYSIS OF CONSUMER REVIEWS OF AMAZON PRODUCTS USING SENTIMENT ANALYSIS

#### Import the libraries

In [1]:
import pandas as pd
import spacy


#### Load the dataset

In [2]:
# Read the raw Amazon reviews CSV; the relative path is resolved against the current working directory
product_reviews = pd.read_csv('amazon_product_reviews.csv')

# Preview the first rows to inspect the column names and some sample values
product_reviews.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.didPurchase,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht..."
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht..."
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht..."
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht..."
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht..."


In [3]:
# Checking the size of our dataset: (number of rows, number of columns)
product_reviews.shape

(28332, 24)

In [4]:
# Select the review text column (the only column this analysis uses) and drop missing values.
# dropna() keeps the original row labels, so positional access afterwards should go through .iloc.
reviews_data = product_reviews['reviews.text'].dropna()


In [5]:
# Number of review texts left after dropping missing values
reviews_data.shape

(28332,)

In [6]:
# Preview the first few review texts
reviews_data.head()

0    I order 3 of them and one of the item is bad q...
1    Bulk is always the less expensive way to go fo...
2    Well they are not Duracell but for the price i...
3    Seem to work as well as name brand batteries a...
4    These batteries are very long lasting the pric...
Name: reviews.text, dtype: object

### Defining the NLP Preprocessing function

In [7]:
# Load the spaCy English pipeline once; `nlp` is shared by preprocess() and find_similarity().
# NOTE(review): the medium model is presumably chosen for its word vectors, which
# Doc.similarity relies on - confirm before swapping in a different model.
nlp = spacy.load('en_core_web_md')

In [8]:
# We'll remove stop words and punctuation, and reduce the remaining words to their base forms (lemmas) using spaCy's tokenization and lemmatization
def preprocess(reviews):
    """Return a cleaned, lower-cased, lemmatised version of one review.

    The text is run through the shared spaCy pipeline ``nlp``. Stop-word and
    punctuation tokens are discarded, and the lower-cased lemmas of the
    remaining tokens are joined with single spaces.

    Parameters
    ----------
    reviews : str
        Text of a single review.

    Returns
    -------
    str
        Space-separated lemmas of the tokens that were kept.
    """
    kept_lemmas = []
    for tok in nlp(reviews):
        # Skip tokens that carry no content for the sentiment/similarity steps
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)


### Creating a function for sentiment analysis

In [9]:
# We'll use textblob for the analysis
from textblob import TextBlob


In [11]:

# Defining a sentiment analysis function
def sentiment_review(text):
    """Classify the sentiment of a single review.

    The review is cleaned with ``preprocess`` and then scored with TextBlob's
    polarity, which this function treats as ranging from -1 (most negative)
    to 1 (most positive).

    Parameters
    ----------
    text : str
        Raw (not yet preprocessed) review text.

    Returns
    -------
    tuple
        ``(sentiment, polarity_score)`` where ``sentiment`` is one of
        'Very Positive', 'Positive', 'Neutral', 'Negative' or 'Very Negative'
        and ``polarity_score`` is the TextBlob polarity of the cleaned text.

    Notes
    -----
    The polarity score is also printed as a side effect.
    """
    doc_x = preprocess(text)

    # Analysing sentiment using TextBlob
    blob = TextBlob(doc_x)

    polarity_score = blob.sentiment.polarity  # Getting the sentiment score using TextBlob
    print(polarity_score)

    # Determining the sentiment label.
    # The two extremes are tested first: checking `< 0` before the -1 case
    # would make 'Very Negative' unreachable.
    if polarity_score >= 1:
        sentiment = 'Very Positive'
    elif polarity_score <= -1:
        sentiment = 'Very Negative'
    elif polarity_score > 0:
        sentiment = "Positive"
    elif polarity_score < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    return sentiment, polarity_score

    

### Evaluation and Examples

In [12]:
# Sanity check on a hand-written review that is clearly negative
text =  "I bought this phone a week ago and I'm extremely disappointed. The battery life is terrible, and the camera quality is below average. I would not recommend it to anyone."
sentiment, polarity_score = sentiment_review(text)
print(f'Sentiment: {sentiment}')
print(f'Polarity Score: {polarity_score}')

-0.6333333333333333
Sentiment: Negative
Polarity Score: -0.6333333333333333


In [13]:
# Testing our function on some reviews from the reviews.text column (selected by position)
sample_1 = reviews_data.iloc[0]
sample_2 = reviews_data.iloc[850]

In [14]:
# The cell output is the (label, score) tuple; the score is also printed by the function itself
sentiment_review(sample_1)

-0.6999999999999998


('Negative', -0.6999999999999998)

In [15]:
# The cell output is the (label, score) tuple; the score is also printed by the function itself
sentiment_review(sample_2)

0.053750000000000006


('Positive', 0.053750000000000006)

#### More evaluation by printing the reviews to check the contents

In [16]:
# Show the raw review followed by its preprocessed form, i.e. the text that is actually scored
print(sample_1)
print(preprocess(sample_1))

I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.
order 3 item bad quality miss backup spring pc aluminum battery work


In [17]:
# Show the raw review followed by its preprocessed form, i.e. the text that is actually scored
print(sample_2)
print(preprocess(sample_2))

While most, modern day batteries no longer contain mercury, the Amazon basics battery line unfortunately does. Which means they're bad for the environment, and not able to be disposed of other than in appropriate hazardous waste. I'll spend the extra few dollars and buy a name brand
modern day battery long contain mercury amazon basic battery line unfortunately mean bad environment able dispose appropriate hazardous waste spend extra dollar buy brand


#### Apply the model to user-selected reviews


In [18]:
# Ask the user for a review index and keep asking until a valid one is entered.
# The valid range is derived from `reviews_data` (missing values already dropped),
# so it always matches the data that is actually analysed.

max_index = len(reviews_data) - 1

while True:
    # Only the input handling is guarded, so an error raised later by spaCy or
    # TextBlob is not mistaken for a bad index.
    try:
        idx = int(input(f"Please enter the index of the review you'll like to test (0 to {max_index}): "))
        if idx < 0 or idx > max_index:
            raise IndexError(f"Index out of range, maximum range is {max_index}")
    except ValueError:
        print(f"Please enter a valid integer index from 0 to {max_index}")
        continue
    except IndexError as e:
        print(e)
        continue
    break

# Pass the raw text: sentiment_review() calls preprocess() itself, so
# preprocessing here as well would clean the review twice.
review = reviews_data.iloc[idx]
sentiment, polarity_score = sentiment_review(review)

print(f'You\'ve entered index {idx}')
print(f'Sentiment: {sentiment}')
print(f'Polarity Score: {polarity_score}')

0.38775510204081637
You've entered index 28331
Sentiment: Positive
Polarity Score: 0.38775510204081637


### Comparing Reviews for Similarities

In [19]:
# First we build a function to do the similarity comparison

def find_similarity(text1, text2):
    """Score how similar two texts are and describe the score in words.

    Both texts are parsed with the shared spaCy pipeline ``nlp`` and compared
    with ``Doc.similarity``.

    Parameters
    ----------
    text1, text2 : str
        The two texts to compare.

    Returns
    -------
    tuple
        ``(similarity_label, similarity_score)``. The label is chosen from the
        score as follows:

        * score >= 1        -> high similarity (identical or nearly the same)
        * 0.7 < score < 1   -> strong similarity
        * 0.5 < score <= 0.7 -> medium similarity
        * 0 < score <= 0.5  -> low similarity
        * anything else     -> not similar
    """
    doc_1 = nlp(text1)
    doc_2 = nlp(text2)

    similarity_score = doc_1.similarity(doc_2)

    # Thresholds are checked from highest to lowest so every branch needs only
    # one comparison. `>= 1` (rather than `== 1`) tolerates floating-point
    # rounding, and the final `else` guarantees a label is always assigned.
    if similarity_score >= 1:
        similarity_label = "High similarity: The two reviews are identical or nearly the same"
    elif similarity_score > 0.7:
        similarity_label = "Strong similarity: The two reviews are very similar"
    elif similarity_score > 0.5:
        similarity_label = "Medium similarity: some parts of the texts are similar"
    elif similarity_score > 0:
        similarity_label = "Low similarity"
    else:
        similarity_label = "The two reviews are not similar"

    return similarity_label, similarity_score




In [21]:
# Application of the similarity function.
# Prompt the user for the indexes of the two reviews to compare and ask again
# (starting from review 1) if an index is out of range or is not an integer.


def _read_review_index(prompt, max_index):
    """Read one integer from the user and check that it lies in 0..max_index.

    Raises ValueError if the input is not an integer and IndexError if it is
    outside the valid range.
    """
    idx = int(input(prompt))
    if idx < 0 or idx > max_index:
        raise IndexError("Index out of range")
    return idx


# Derive the valid range from the cleaned review series instead of hard-coding it
max_index = len(reviews_data) - 1

while True:
    # Only the input handling is guarded, so an error raised later by spaCy is
    # not mistaken for a bad index.
    try:
        idx1 = _read_review_index("Please enter index of review 1: ", max_index)
        idx2 = _read_review_index("Please enter index of review 2: ", max_index)
    except ValueError:
        print(f"Please enter a valid integer index from 0 to {max_index}")
        continue
    except IndexError as e:
        print(e)
        continue
    break

# find_similarity() does not preprocess its inputs, so the reviews are cleaned here
review_1 = preprocess(reviews_data.iloc[idx1])
review_2 = preprocess(reviews_data.iloc[idx2])
similarity_label, similarity_score = find_similarity(review_1, review_2)

print(f'You\'ve entered index {idx1} and index {idx2}')
print(f'Similarity: {similarity_label}')
print(f'Similarity Score: {similarity_score}')
    

You've entered index 0 and index 850
Similarity: Strong similarity: The two reviews are very similar
Similarity Score: 0.7640639853395023
