##### Library

In [1]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

import pandas as pd
import re
from statistics import mean

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


##### Load Data

In [2]:
# Read the gzip-compressed, records-oriented JSON file of reviews.
df_reviews = pd.read_json(
    '../../data/interim/final/reviews.json.gz',
    compression="gzip",
    orient="records",
)

In [3]:
# Show the first rows of the loaded reviews as a quick sanity check.
df_reviews.head()

Unnamed: 0,user_id,product_id,ratings,review_text,summary,created_at
0,A0203183BAH3TR08FZGB,B0043T7FHK,5,I got this to run as a dual monitor. This is ...,This is my second time purchasing this monitor...,2015-06-30
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,4,"Not as good as I had hoped, music is very low,...",Bluetooth headset,2014-08-03
2,A034116598G557EYZ9BC,B0013FRNKG,5,Appreciate if product\nNeed to buy one more if...,great value,2012-11-28
3,A0404374X0HL5T332XSN,B00MNOPS1C,3,You get what you pay for,Three Stars,2016-02-02
4,A0431622H67YR5IPJRN,B0058UUR6E,5,Arrived in 2 days. working great. Recommend to...,working great. Recommend to others,2015-03-11


##### Data Preprocessing

In [4]:
# One combined pattern so the text is scanned in a single pass:
#   * HTML comments                         <!-- ... -->
#   * real tags: "<", optional "/", letter  <br />, </a>, <div class="x">
#   * named / decimal / hex character refs  &amp;  &#34;  &#x1F600;
# Requiring a letter after "<" keeps free-standing angle brackets such as the
# "<3" emoticon or "5 > 4" in the text instead of deleting what lies between.
_HTML_MARKUP_RE = re.compile(
    r'<!--.*?-->'
    r'|</?[a-z][^<>]*>'
    r'|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE | re.DOTALL,
)


def remove_html_tags(text: str) -> str:
    """Delete HTML tags, comments and character references from ``text``.

    Matching is case-insensitive (``&#X41;`` and ``&Eacute;`` are removed just
    like their lowercase forms) and tags may span line breaks. Matched markup
    is dropped, not decoded: ``&amp;`` becomes an empty string, not ``&``.

    Args:
        text: Raw text that may contain HTML markup.

    Returns:
        The text with every match removed; surrounding characters are untouched.
    """
    return _HTML_MARKUP_RE.sub('', text)

# Compiled once; IGNORECASE so "Www.example.com" or "HTTP://..." is caught too.
_URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)


def remove_url(text: str) -> str:
    """Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` (in
    any letter case) up to the next whitespace character.

    Args:
        text: Text that may contain URLs.

    Returns:
        The text with each URL removed; surrounding whitespace is left as-is.
    """
    return _URL_RE.sub('', text)

def clean_data(text: str):
    """Pass ``text`` through ``remove_html_tags`` and then ``remove_url``."""
    return remove_url(remove_html_tags(text))

In [5]:
# Keep only the key and text columns, copied so df_reviews is not modified later.
df_preprocessing_data = df_reviews.loc[
    :, ['user_id', 'product_id', 'review_text', 'summary']
].copy()

In [6]:
# Join the review body and its summary into one text per row.
# A missing body or summary is treated as empty text: otherwise the sum is NaN
# for that row, and the regex-based cleaning step raises TypeError on it.
df_preprocessing_data['concatenated_review'] = (
    df_preprocessing_data['review_text'].fillna('')
    + ' '
    + df_preprocessing_data['summary'].fillna('')
)

# NOTE: this rebinds the frame without 'review_text' / 'summary', so re-running
# this cell alone raises KeyError; re-run the cell that builds the frame first.
df_preprocessing_data = df_preprocessing_data[['user_id', 'product_id', 'concatenated_review']]

In [7]:
# Clean every concatenated review, then keep only the keys and the cleaned text.
df_preprocessing_data = (
    df_preprocessing_data
    .assign(cleaned_review=lambda frame: frame['concatenated_review'].apply(clean_data))
    .loc[:, ['user_id', 'product_id', 'cleaned_review']]
)

In [8]:
# Inspect the first cleaned reviews.
df_preprocessing_data.head()

Unnamed: 0,user_id,product_id,cleaned_review
0,A0203183BAH3TR08FZGB,B0043T7FHK,I got this to run as a dual monitor. This is ...
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,"Not as good as I had hoped, music is very low,..."
2,A034116598G557EYZ9BC,B0013FRNKG,Appreciate if product\nNeed to buy one more if...
3,A0404374X0HL5T332XSN,B00MNOPS1C,You get what you pay for Three Stars
4,A0431622H67YR5IPJRN,B0058UUR6E,Arrived in 2 days. working great. Recommend to...


##### Label Data

In [9]:
# Single analyzer instance shared by every call to get_sentiment_matrix.
sia = SentimentIntensityAnalyzer()

def get_sentiment_matrix(text: str, threshold: float = 0.05) -> pd.Series:
    """Score ``text`` with VADER and bucket the compound score into a label.

    The whole text is scored in one ``polarity_scores`` call (not sentence by
    sentence).

    Args:
        text: Text to score.
        threshold: Compound score at or above which the text is labelled
            positive; at or below ``-threshold`` it is labelled negative.
            The default of 0.05 keeps the original hard-coded cut-offs.

    Returns:
        A two-element ``pd.Series`` ``[compound_score, sentiment]`` where
        ``sentiment`` is 1 (positive), -1 (negative) or 0 (neutral). Both
        values share one Series, so the label comes back as a float and the
        caller casts it to an integer dtype afterwards.
    """
    compound_score = sia.polarity_scores(text)["compound"]

    if compound_score >= threshold:
        sentiment = 1
    elif compound_score <= -threshold:
        sentiment = -1
    else:
        sentiment = 0

    return pd.Series([compound_score, sentiment])

In [10]:
# Each call returns a two-element Series, so apply() yields a two-column frame
# that is assigned position-wise to the new columns.
df_preprocessing_data[['compound_score', 'sentiment']] = (
    df_preprocessing_data["cleaned_review"].apply(get_sentiment_matrix)
)

# The label arrives as a float; store it as a nullable integer.
df_preprocessing_data = df_preprocessing_data.astype({'sentiment': 'Int64'})

In [11]:
# Inspect the first rows together with their compound score and sentiment label.
df_preprocessing_data.head()

Unnamed: 0,user_id,product_id,cleaned_review,compound_score,sentiment
0,A0203183BAH3TR08FZGB,B0043T7FHK,I got this to run as a dual monitor. This is ...,0.937,1
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,"Not as good as I had hoped, music is very low,...",-0.104,-1
2,A034116598G557EYZ9BC,B0013FRNKG,Appreciate if product\nNeed to buy one more if...,0.8874,1
3,A0404374X0HL5T332XSN,B00MNOPS1C,You get what you pay for Three Stars,-0.1027,-1
4,A0431622H67YR5IPJRN,B0058UUR6E,Arrived in 2 days. working great. Recommend to...,0.9217,1


In [12]:
# Count how many reviews received each sentiment label.
print(df_preprocessing_data["sentiment"].value_counts())

1     50394
-1     6850
0      2317
Name: sentiment, dtype: Int64


In [13]:
# Attach the sentiment label to the original reviews via (user_id, product_id).
# validate="one_to_one" makes pandas raise if that key pair is duplicated on
# either side, so labels can never be fanned out onto the wrong rows.
df_export = pd.merge(
    df_reviews,
    df_preprocessing_data.drop(columns=['cleaned_review', 'compound_score']),
    on=['user_id', 'product_id'],
    how='inner',
    validate="one_to_one",
)

In [14]:
# Check that the merged frame carries the original columns plus 'sentiment'.
df_export.head()

Unnamed: 0,user_id,product_id,ratings,review_text,summary,created_at,sentiment
0,A0203183BAH3TR08FZGB,B0043T7FHK,5,I got this to run as a dual monitor. This is ...,This is my second time purchasing this monitor...,2015-06-30,1
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,4,"Not as good as I had hoped, music is very low,...",Bluetooth headset,2014-08-03,-1
2,A034116598G557EYZ9BC,B0013FRNKG,5,Appreciate if product\nNeed to buy one more if...,great value,2012-11-28,1
3,A0404374X0HL5T332XSN,B00MNOPS1C,3,You get what you pay for,Three Stars,2016-02-02,-1
4,A0431622H67YR5IPJRN,B0058UUR6E,5,Arrived in 2 days. working great. Recommend to...,working great. Recommend to others,2015-03-11,1


##### Export as JSON

In [15]:
# Write the labelled reviews as gzip-compressed, records-oriented JSON.
df_export.to_json(
    '../../data/interim/final/labelled_reviews_by_models.json.gz',
    orient="records",
    indent=2,
    compression="gzip",
)