In [1]:
import json
import nltk
import pandas as pd
import spacy
import string

from collections import namedtuple
from nltk.corpus import wordnet as wn, sentiwordnet as swn, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Data Reading

In [2]:
# Execution-environment switch:
#   True  -> run on Google Colab (Google Drive is mounted and used as the project root)
#   False -> run on the local machine (paths are resolved relative to '.')
USE_COLAB=False

In [3]:
# Import the Google Drive helper only when running on Colab; the guard keeps
# local runs from importing google.colab at all.
if USE_COLAB:
  from google.colab import drive

In [4]:
# When running on Colab, mount Google Drive at /content/gdrive/ so the
# Drive-based root_path used by the following cells is reachable.
# `drive` only exists when USE_COLAB is True (it is imported under the same guard).
if USE_COLAB:
  drive.mount('/content/gdrive/')

In [5]:
# Root of the working directory: the project folder on the mounted Drive when
# running on Colab, otherwise the current directory.
if USE_COLAB:
    root_path = '/content/gdrive/MyDrive/Master-Thesis/master-thesis-sentiment-analysis'
else:
    root_path = '.'

In [6]:
# Raw input file: the Sports_and_Outdoors review dump (.jsonl), read line by line below.
data_path = f"{root_path}/datasets/temp/Amazon Reviews 2023/Sports_and_Outdoors.jsonl"

In [7]:
# Folder for saved results; the final to_csv cell writes into this same directory.
data_saved_path = f"{root_path}/datasets/temp/Amazon Reviews 2023"

In [8]:
# Peek at the first record to see which fields a review carries.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default text encoding.
with open(data_path, 'r', encoding='utf-8') as fs:
    # json.loads tolerates the trailing newline, so no strip() is needed.
    first_review = [json.loads(next(fs))]
first_review

[{'rating': 5.0,
  'title': 'Crazy comfy!',
  'text': 'Not gonna lie- they are not much to look at. Lol. Luckily I’m one of those ppl that values things for function over looks & these function well so far. They are seriously one of the most comfortable pairs of socks I’ve owned in 5 decades.  I have not tried to wash them yet, so fingers crossed on that rn.  They feel very cushiony.  I wear them in my winter boots & just on my feet shoeless around my home.  I wish they came in more colors.  I’m one of those ppl that absolutely cannot stand toe seams on socks, but these have not bothered me at all.  I have super high arches so the only change I would make to the socks would be some compression there.  However, the socks fit perfectly as-is which really surprised me given my arches.  I just like having compression at my arches bc it feels good on them.  I wear a ladies 10-1/2 shoe- mens 8-1/2 and I bought the medium socks. They fit perfectly.  That’s never happened.  I had honestly expe

In [9]:
def read_reviews(path=None):
    """Load a JSON-Lines review file into a DataFrame.

    Every non-blank line is parsed as one JSON object; its "title", "text"
    and "rating" fields become the ``review_title``, ``review_text`` and
    ``rating`` columns. Any other fields in the record are ignored.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the notebook-level ``data_path`` is used,
        which matches the original zero-argument call.

    Returns
    -------
    pandas.DataFrame
        One row per review, in file order.

    Raises
    ------
    KeyError
        If a record lacks one of the three expected fields.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if path is None:
        path = data_path

    rows = []
    # Be explicit about the encoding instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as fs:
        for line in fs:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a stray empty line at the end of the file).
                continue
            record = json.loads(line)
            rows.append((record["title"], record["text"], record["rating"]))

    return pd.DataFrame(rows, columns=["review_title", "review_text", "rating"])

In [10]:
%%time
# Parse the whole file into memory; the recorded run below took several minutes.
df = read_reviews()

CPU times: total: 4min 47s
Wall time: 6min 25s


In [11]:
# Row count, column dtypes and memory footprint of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   review_title  object 
 1   review_text   object 
 2   rating        float64
dtypes: float64(1), object(2)
memory usage: 448.5+ MB


In [12]:
# Number of missing values per column.
df.isnull().sum()

review_title    0
review_text     0
rating          0
dtype: int64

In [13]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,review_title,review_text,rating
0,Crazy comfy!,Not gonna lie- they are not much to look at. L...,5.0
1,Excellent!,I love it. Pretty!,5.0
2,Best saddle pads,Huge fan of B Vertigo and this dressage pad do...,5.0
3,Perfect repair kit,"I have a great Weaver halter. Recently, the Ch...",5.0
4,Works great,This was great for a slightly too-short girth!...,5.0


In [14]:
# Number of reviews per star rating.
df["rating"].value_counts()

rating
5.0    12981998
4.0     2518170
1.0     1836990
3.0     1324911
2.0      933101
Name: count, dtype: int64

# Data Cleansing

In [15]:
# Fix missing values

In [16]:
# Drop rows in which every column is missing. The result is rebound to df
# instead of using inplace=True, which keeps the step explicit and chainable.
df = df.dropna(how="all")

In [17]:
# Drop columns in which every value is missing. The result is rebound to df
# instead of using inplace=True, which keeps the step explicit and chainable.
df = df.dropna(how="all", axis=1)

In [18]:
# Replace missing titles/texts with empty strings so the string concatenation
# in the next step never has to deal with NaN. Rebinding replaces inplace=True.
df = df.fillna({"review_title": "", "review_text": ""})

In [19]:
# Combine the "review_title" and "review_text" columns into a single "review" column

In [20]:
# Build one "review" string per row in the form "<title>. <text>":
#   - rstrip removes trailing '.', '!', '?' and whitespace from the title so the
#     ". " separator does not pile up on existing punctuation;
#   - the final lstrip removes leading punctuation/whitespace from the combined
#     string, which also removes the dangling ". " left when the title is empty.
df["review"] = (df["review_title"].str.rstrip('.!? \n\t') +  ". " +  df["review_text"]).str.lstrip('.!? \n\t')

In [21]:
# Remove 'review_title' and 'review_text' columns

In [22]:
# The two source columns are now redundant with "review"; drop them.
# The result is rebound to df instead of mutating it with inplace=True.
df = df.drop(columns=["review_title", "review_text"])

In [23]:
# Re-check the schema after cleaning: only "rating" and "review" should remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   rating  float64
 1   review  object 
dtypes: float64(1), object(1)
memory usage: 299.0+ MB


In [24]:
# Preview the combined "review" strings.
df.head()

Unnamed: 0,rating,review
0,5.0,Crazy comfy. Not gonna lie- they are not much ...
1,5.0,Excellent. I love it. Pretty!
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...
3,5.0,Perfect repair kit. I have a great Weaver halt...
4,5.0,Works great. This was great for a slightly too...


# Text Processing

In [25]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [26]:
# Single VADER analyzer instance reused by calcualte_sentiment_score.
# NOTE(review): NLTK's VADER needs the "vader_lexicon" resource to be installed
# locally — confirm it has been downloaded before running on a fresh machine.
sentiment_analyzer = SentimentIntensityAnalyzer()

In [27]:
# Small English spaCy pipeline with the dependency parser and NER components
# disabled; clean_review only reads token lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [28]:
def clean_review(document):
    """Normalise a review into a space-separated string of lemmas.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, and the result is run through the module-level spaCy pipeline
    ``nlp``. Lemmas shorter than three characters and tokens flagged as stop
    words are discarded; the remaining lemmas are joined with single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = document.lower().translate(strip_punctuation)

    kept_lemmas = []
    for token in nlp(normalised):
        lemma = token.lemma_
        if len(lemma) < 3 or token.is_stop:
            continue
        kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

In [29]:
def calcualte_sentiment_score(document):
    """Return the compound polarity score of ``document``.

    The value is the "compound" entry of the dictionary returned by the
    module-level ``sentiment_analyzer.polarity_scores``.
    """
    # NOTE: the function name is misspelled ("calcualte"); it is kept as-is
    # because later cells call it by this name.
    scores = sentiment_analyzer.polarity_scores(document)
    return scores["compound"]

In [30]:
def detect_sentiment(sentiment_score, positive_threshold=0.5, negative_threshold=-0.5):
    """Map a compound sentiment score to a categorical label.

    Parameters
    ----------
    sentiment_score : float
        Score to classify (e.g. a VADER compound score).
    positive_threshold : float, default 0.5
        Scores greater than or equal to this value are labelled "positive".
    negative_threshold : float, default -0.5
        Scores less than or equal to this value are labelled "negative".

    Returns
    -------
    str
        "positive", "negative" or "neutral".

    Notes
    -----
    The defaults reproduce the cut-offs this notebook has always used. The
    VADER documentation suggests +/-0.05 as the typical thresholds; pass them
    explicitly to use that convention. A NaN score fails both comparisons and
    is therefore labelled "neutral".
    """
    if sentiment_score >= positive_threshold:
        return "positive"
    if sentiment_score <= negative_threshold:
        return "negative"
    return "neutral"

In [31]:
# Experiment on an independent copy of the first 10,000 reviews so that the
# new columns added below do not touch df.
df_test = df.head(10000).copy()

In [32]:
%%time
# Run the spaCy-based cleaning row by row over the sample.
df_test["review_cleaned"] = df_test["review"].apply(clean_review)

CPU times: total: 2min 17s
Wall time: 2min 17s


In [33]:
# Compound score computed on the original (uncleaned) review text.
df_test["sentiment_score"] = df_test["review"].apply(calcualte_sentiment_score)

In [34]:
# Label derived from the raw-text score.
df_test["sentiment"] = df_test["sentiment_score"].apply(detect_sentiment)

In [35]:
# Compound score computed on the cleaned (lemmatised, stop-word-free) text, for comparison.
df_test["sentiment_score_cleaned"] = df_test["review_cleaned"].apply(calcualte_sentiment_score)

In [36]:
# Label derived from the cleaned-text score.
df_test["sentiment_cleaned"] = df_test["sentiment_score_cleaned"].apply(detect_sentiment)

In [37]:
# Compare raw-text and cleaned-text scores/labels side by side.
df_test.head(20)

Unnamed: 0,rating,review,review_cleaned,sentiment_score,sentiment,sentiment_score_cleaned,sentiment_cleaned
0,5.0,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy lie look lol luckily ppl value thi...,0.9974,positive,0.9961,positive
1,5.0,Excellent. I love it. Pretty!,excellent love pretty,0.908,positive,0.9022,positive
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...,good saddle pad huge fan vertigo dressage pad ...,0.9143,positive,0.8298,positive
3,5.0,Perfect repair kit. I have a great Weaver halt...,perfect repair kit great weaver halter recentl...,0.9576,positive,0.9485,positive
4,5.0,Works great. This was great for a slightly too...,work great great slightly tooshort girth sturd...,0.9323,positive,0.9246,positive
5,5.0,Great stirrups with bar none grip. I have to s...,great stirrup bar grip grip pretty great great...,0.9348,positive,0.9313,positive
6,5.0,Perfect. Perfect for my boy! He looks adorable...,perfect perfect boy look adorable love pad ins...,0.9549,positive,0.9413,positive
7,5.0,Awesome leathers. These are amazing. My traine...,awesome leather amazing trainer notice right a...,0.827,positive,0.9451,positive
8,4.0,"Nice product, sizing off. Product is well made...",nice product size product size way woman purch...,0.9579,positive,0.8519,positive
9,4.0,"Mixed feelings. I loved this chalk bag online,...",mix feeling love chalk bag online person like ...,0.9947,positive,0.9912,positive


In [38]:
# Label distribution when scoring the raw text.
df_test["sentiment"].value_counts()

sentiment
positive    7750
neutral     1818
negative     432
Name: count, dtype: int64

In [39]:
# Label distribution when scoring the cleaned text.
df_test["sentiment_cleaned"].value_counts()

sentiment_cleaned
positive    7852
neutral     1902
negative     246
Name: count, dtype: int64

# Save Cleaned Data

In [41]:
# Persist the scored sample. The target folder comes from data_saved_path
# (defined alongside data_path) instead of repeating the same literal here;
# the resulting file path is unchanged.
# NOTE(review): the row index is written as the first, unnamed CSV column —
# pass index=False if downstream readers do not need it.
df_test.to_csv(f"{data_saved_path}/Reviews_With_Sentiment.csv")