To save time, we use a pre-trained model (NLTK's VADER sentiment analyzer) to label the sentiment of each review automatically.

Import Libraries

In [51]:
import nltk

# Two NLTK data packages are needed further down in this notebook:
#   * 'punkt'         - sentence tokenizer models used by sent_tokenize
#   * 'vader_lexicon' - lexicon used by SentimentIntensityAnalyzer
# Without 'punkt' a fresh environment fails with LookupError on the first
# sent_tokenize call.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt';
# download that resource as well if the tokenizer still reports it missing.
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [52]:
import pandas as pd
import re
from statistics import mean

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

Read Data

In [53]:
# Load the gzip-compressed, records-oriented JSON file of reviews.
df_reviews = pd.read_json(
    '../../data/final/reviews.json.gz',
    compression="gzip",
    orient="records",
)

In [54]:
# Preview the first rows of the loaded reviews.
df_reviews.head()

Unnamed: 0,user_id,product_id,ratings,review_text,summary,created_at
0,A0203183BAH3TR08FZGB,B0043T7FHK,5,I got this to run as a dual monitor. This is ...,This is my second time purchasing this monitor...,2015-06-30
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,4,"Not as good as I had hoped, music is very low,...",Bluetooth headset,2014-08-03
2,A034116598G557EYZ9BC,B0013FRNKG,5,Appreciate if product\nNeed to buy one more if...,great value,2012-11-28
3,A0404374X0HL5T332XSN,B00MNOPS1C,3,You get what you pay for,Three Stars,2016-02-02
4,A0431622H67YR5IPJRN,B0058UUR6E,5,Arrived in 2 days. working great. Recommend to...,working great. Recommend to others,2015-03-11


Data Preprocessing

In [55]:
def remove_html_tags(text: str) -> str:
    """Strip HTML tags and HTML character entities from ``text``.

    Removes anything shaped like ``<...>`` plus named (``&amp;``), decimal
    (``&#233;``) and hexadecimal (``&#xE9;``) entities. Matches are deleted,
    not replaced by a space and not decoded.

    Args:
        text: Raw review text that may contain HTML markup.

    Returns:
        The text with every matched tag and entity removed.
    """
    # IGNORECASE is required for entities: named entities may contain capitals
    # (``&Eacute;``) and hex references may use upper-case digits or prefix
    # (``&#xE9;``, ``&#XE9;``). A lower-case-only pattern leaves them in place.
    return re.sub(
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        '',
        text,
        flags=re.IGNORECASE,
    )

def remove_url(text: str):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` URL deleted.

    A URL runs from its prefix up to the next whitespace character; it is
    removed outright, so the whitespace on either side is left untouched.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def clean_data(text: str):
    """Clean one review: strip HTML tags/entities first, then strip URLs."""
    return remove_url(remove_html_tags(text))

In [56]:
# Merge the review body and its summary into a single text per review.
# Missing values are skipped with pd.notna, which recognises None and pd.NA as
# well as NaN. Comparing str(value) against 'nan' would let None through as
# the literal text "None" and would also drop a review that genuinely says "nan".
df_reviews['concatenated_review'] = df_reviews[['review_text', 'summary']].apply(
    lambda row: " ".join(str(value) for value in row if pd.notna(value)),
    axis=1
)

# Work on a separate single-column frame; the merged text is exposed as 'review_text'.
df_preprocessing_data = df_reviews[['concatenated_review']].rename(columns = {"concatenated_review" : "review_text"})

In [57]:
# Element-wise cleaning of every merged review (HTML and URLs removed by clean_data).
df_preprocessing_data['cleaned_review'] = df_preprocessing_data['review_text'].map(clean_data)

In [58]:
# Compare the raw merged text with its cleaned version for the first rows.
df_preprocessing_data.head()

Unnamed: 0,review_text,cleaned_review
0,I got this to run as a dual monitor. This is ...,I got this to run as a dual monitor. This is ...
1,"Not as good as I had hoped, music is very low,...","Not as good as I had hoped, music is very low,..."
2,Appreciate if product\nNeed to buy one more if...,Appreciate if product\nNeed to buy one more if...
3,You get what you pay for Three Stars,You get what you pay for Three Stars
4,Arrived in 2 days. working great. Recommend to...,Arrived in 2 days. working great. Recommend to...


Label Data

In [59]:
# Single shared analyzer instance, reused for every review.
sia = SentimentIntensityAnalyzer()

def get_sentiment_matrix(text: str, threshold: float = 0.05) -> pd.Series:
    """Label one review with a VADER-based sentiment.

    The text is split into sentences, each sentence gets a VADER ``compound``
    score, and the review's score is the mean of those sentence scores.

    Args:
        text: Cleaned review text.
        threshold: Absolute cut-off for a non-neutral label. A mean score
            ``>= threshold`` is Positive, ``<= -threshold`` is Negative, and
            anything in between is Neutral.

    Returns:
        A three-element ``pd.Series``:
        ``[compound_score, sentiment, sentiment_score]`` where ``sentiment`` is
        ``'Positive'``, ``'Neutral'`` or ``'Negative'`` and ``sentiment_score``
        is ``1``, ``0`` or ``-1`` respectively.
    """
    scores = [
        sia.polarity_scores(sentence)["compound"] for sentence in sent_tokenize(text)
    ]

    # An empty or whitespace-only text (e.g. a review that was only a URL
    # before cleaning) yields no sentences. statistics.mean raises
    # StatisticsError on an empty sequence, so such reviews are scored 0.0,
    # which falls through to the Neutral label below.
    compound_score = mean(scores) if scores else 0.0

    if compound_score >= threshold:
        sentiment, sentiment_score = 'Positive', 1
    elif compound_score <= -threshold:
        sentiment, sentiment_score = 'Negative', -1
    else:
        sentiment, sentiment_score = 'Neutral', 0

    return pd.Series([compound_score, sentiment, sentiment_score])

In [60]:
# get_sentiment_matrix returns a 3-element Series per review, so apply() yields
# a three-column frame that is assigned to the new label columns in order.
df_preprocessing_data[
    ['compound_score', 'sentiment', 'sentiment_score']
] = df_preprocessing_data["cleaned_review"].apply(get_sentiment_matrix)

In [61]:
# Spot-check the assigned labels against the review text for the first 50 rows.
df_preprocessing_data.head(50)

Unnamed: 0,review_text,cleaned_review,compound_score,sentiment,sentiment_score
0,I got this to run as a dual monitor. This is ...,I got this to run as a dual monitor. This is ...,0.301557,Positive,1
1,"Not as good as I had hoped, music is very low,...","Not as good as I had hoped, music is very low,...",-0.052,Negative,-1
2,Appreciate if product\nNeed to buy one more if...,Appreciate if product\nNeed to buy one more if...,0.8874,Positive,1
3,You get what you pay for Three Stars,You get what you pay for Three Stars,-0.1027,Negative,-1
4,Arrived in 2 days. working great. Recommend to...,Arrived in 2 days. working great. Recommend to...,0.39444,Positive,1
5,VERY GOOD!!!! Five Stars,VERY GOOD!!!! Five Stars,0.0,Neutral,0
6,we love it Five Stars,we love it Five Stars,0.6369,Positive,1
7,"I was disappointed in the video quality, and s...","I was disappointed in the video quality, and s...",0.00922,Neutral,0
8,"Love this camera, hard to take a bad picture. ...","Love this camera, hard to take a bad picture. ...",0.30298,Positive,1
9,EXCELLENT Five Stars,EXCELLENT Five Stars,0.6633,Positive,1


In [62]:
# Count the reviews per sentiment label to see how balanced the classes are.
print(df_preprocessing_data["sentiment"].value_counts())

Positive    48619
Neutral      5620
Negative     5322
Name: sentiment, dtype: int64


In [63]:
# Persist the labelled reviews as gzip-compressed, records-oriented JSON.
# NOTE(review): the input was read from '../../data/...' but this writes under
# '../data/...' - confirm the output directory is the intended one.
df_preprocessing_data.to_json(
    '../data/labelled_reviews.json.gz',
    orient="records",
    compression="gzip",
    indent=2,
)