# Colab Configuration

In [2]:
# Set to True to run this notebook on Google Colab.
# Set to False to run it on a local machine.
USE_COLAB=True

# GPU Configuration

In [3]:
# !nvidia-smi

In [4]:
# if USE_COLAB:
#   !pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

In [5]:
# If Google Colab is used, add cudf library
# if USE_COLAB:
#   import cudf

In [6]:
# if USE_COLAB:
#   %load_ext cudf.pandas

# Libraries Import

In [7]:
import json
import nltk
import pandas as pd
import spacy
import string

from collections import namedtuple
from nltk.corpus import wordnet as wn, sentiwordnet as swn, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [8]:
pd

<module 'pandas' from '/usr/local/lib/python3.10/dist-packages/pandas/__init__.py'>

# Data Reading

In [9]:
# If Google Colab is used, add google.colab library
if USE_COLAB:
  from google.colab import drive

In [10]:
# If Google Colab is used, mount Google Drive to Colab System
if USE_COLAB:
  drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [11]:
# Define the root path for working directory
root_path = '/content/gdrive/MyDrive/Master-Thesis/master-thesis-sentiment-analysis' if USE_COLAB else '.'

In [12]:
data_path = f"{root_path}/datasets/temp/Amazon Reviews 2023/Sports_and_Outdoors.jsonl"

In [13]:
data_saved_path = f"{root_path}/datasets/temp/Amazon Reviews 2023"

In [14]:
# Peek at the first record to inspect the JSONL schema before loading the
# whole file. The file is opened explicitly as UTF-8 because the review text
# contains non-ASCII characters (e.g. typographic apostrophes) and the
# platform default encoding is not guaranteed to be UTF-8 on a local machine.
with open(data_path, 'r', encoding='utf-8') as fs:
  # Kept as a one-element list so the displayed value has the same shape as before.
  first_review = [json.loads(next(fs).strip())]
first_review

[{'rating': 5.0,
  'title': 'Crazy comfy!',
  'text': 'Not gonna lie- they are not much to look at. Lol. Luckily I’m one of those ppl that values things for function over looks & these function well so far. They are seriously one of the most comfortable pairs of socks I’ve owned in 5 decades.  I have not tried to wash them yet, so fingers crossed on that rn.  They feel very cushiony.  I wear them in my winter boots & just on my feet shoeless around my home.  I wish they came in more colors.  I’m one of those ppl that absolutely cannot stand toe seams on socks, but these have not bothered me at all.  I have super high arches so the only change I would make to the socks would be some compression there.  However, the socks fit perfectly as-is which really surprised me given my arches.  I just like having compression at my arches bc it feels good on them.  I wear a ladies 10-1/2 shoe- mens 8-1/2 and I bought the medium socks. They fit perfectly.  That’s never happened.  I had honestly expe

In [15]:
def read_reviews(path=None):
  """Load the review JSONL file into a DataFrame.

  Every non-blank line is parsed as one JSON object, from which the "title",
  "text" and "rating" fields are kept.

  Args:
    path: Path of the JSONL file to read. When omitted, the notebook-level
      `data_path` is used, so existing `read_reviews()` calls keep working.

  Returns:
    A DataFrame with the columns "review_title", "review_text" and "rating",
    one row per parsed line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
    KeyError: If a record lacks one of the three expected fields.
  """
  if path is None:
    path = data_path

  records = []
  # Explicit UTF-8 so the result does not depend on the platform's default
  # encoding (the review text contains non-ASCII characters).
  with open(path, 'r', encoding='utf-8') as fs:
    for line in fs:
      line = line.strip()
      if not line:
        # Tolerate blank lines (e.g. an extra newline at the end of the file)
        # instead of failing inside json.loads.
        continue
      review = json.loads(line)
      records.append((review["title"], review["text"], review["rating"]))

  return pd.DataFrame(records, columns=["review_title", "review_text", "rating"])

In [16]:
%%time
df = read_reviews()

CPU times: user 1min 33s, sys: 9.44 s, total: 1min 43s
Wall time: 1min 59s


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   review_title  object 
 1   review_text   object 
 2   rating        float64
dtypes: float64(1), object(2)
memory usage: 448.5+ MB


In [18]:
df.isnull().sum()

review_title    0
review_text     0
rating          0
dtype: int64

In [19]:
df.head()

Unnamed: 0,review_title,review_text,rating
0,Crazy comfy!,Not gonna lie- they are not much to look at. L...,5.0
1,Excellent!,I love it. Pretty!,5.0
2,Best saddle pads,Huge fan of B Vertigo and this dressage pad do...,5.0
3,Perfect repair kit,"I have a great Weaver halter. Recently, the Ch...",5.0
4,Works great,This was great for a slightly too-short girth!...,5.0


In [20]:
df["rating"].value_counts()

rating
5.0    12981998
4.0     2518170
1.0     1836990
3.0     1324911
2.0      933101
Name: count, dtype: int64

# Data Cleansing

In [21]:
# Fix missing values

In [22]:
df.dropna(how="all", inplace=True)

In [23]:
df.dropna(how="all",axis=1, inplace=True)

In [24]:
%%time
df.fillna({"review_title": "", "review_text": ""}, inplace=True)

CPU times: user 7.18 s, sys: 519 ms, total: 7.7 s
Wall time: 7.6 s


In [25]:
# Combine the "review_title" and "review_text" columns into a single "review" column

In [26]:
%%time
# Build "review" as "<title>. <text>".
# - rstrip removes trailing '.', '!', '?', spaces, newlines and tabs from the
#   title so that the inserted ". " does not follow existing end punctuation.
# - the final lstrip removes the same characters from the start of the combined
#   string, e.g. the ". " that would otherwise lead the text when the title is empty.
df["review"] = (df["review_title"].str.rstrip('.!? \n\t') +  ". " +  df["review_text"]).str.lstrip('.!? \n\t')

CPU times: user 19.5 s, sys: 3.99 s, total: 23.5 s
Wall time: 23.2 s


In [27]:
# Remove 'review_title' and 'review_text' columns

In [28]:
df.drop(columns=["review_title", "review_text"], inplace=True)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   rating  float64
 1   review  object 
dtypes: float64(1), object(1)
memory usage: 299.0+ MB


In [30]:
df.head()

Unnamed: 0,rating,review
0,5.0,Crazy comfy. Not gonna lie- they are not much ...
1,5.0,Excellent. I love it. Pretty!
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...
3,5.0,Perfect repair kit. I have a great Weaver halt...
4,5.0,Works great. This was great for a slightly too...


# Text Processing

In [31]:
# Translation table that deletes every ASCII punctuation character. It is built
# once here rather than on every call, because clean_review is applied row by
# row to the whole review column.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_review(document):
  """Lower-case a review and delete ASCII punctuation.

  Punctuation characters are removed, not replaced by spaces, so hyphenated
  words are joined ("too-short" becomes "tooshort"). Only the characters in
  `string.punctuation` are affected; non-ASCII punctuation such as typographic
  apostrophes is left in place.

  Args:
    document: The review text as a `str`.

  Returns:
    The lower-cased text with ASCII punctuation removed.
  """
  return document.lower().translate(_PUNCTUATION_TABLE)

In [32]:
%%time
df["review_cleaned"] = df["review"].apply(clean_review)

CPU times: user 2min 1s, sys: 2.94 s, total: 2min 4s
Wall time: 2min 4s


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 3 columns):
 #   Column          Dtype  
---  ------          -----  
 0   rating          float64
 1   review          object 
 2   review_cleaned  object 
dtypes: float64(1), object(2)
memory usage: 448.5+ MB


In [34]:
df.head()

Unnamed: 0,rating,review,review_cleaned
0,5.0,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy not gonna lie they are not much to...
1,5.0,Excellent. I love it. Pretty!,excellent i love it pretty
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...,best saddle pads huge fan of b vertigo and thi...
3,5.0,Perfect repair kit. I have a great Weaver halt...,perfect repair kit i have a great weaver halte...
4,5.0,Works great. This was great for a slightly too...,works great this was great for a slightly toos...


# Test

In [35]:
# gpu = spacy.prefer_gpu()
# print('GPU:', gpu)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [36]:
def lemmatize_review(document):
  """Return the space-joined lemmas of `document`, skipping unwanted tokens.

  The text is processed with the notebook-level spaCy pipeline `nlp`. A token
  contributes its lemma only when that lemma has at least three characters and
  the token is not flagged as a stop word.
  """
  kept_lemmas = []
  for token in nlp(document):
    lemma = token.lemma_
    if token.is_stop or len(lemma) < 3:
      continue
    kept_lemmas.append(lemma)
  return " ".join(kept_lemmas)

In [37]:
%%time
df["review_cleaned"].head(10_000).apply(lemmatize_review)

CPU times: user 58.9 s, sys: 146 ms, total: 59.1 s
Wall time: 58.9 s


0       crazy comfy lie look lol luckily ppl value thi...
1                                   excellent love pretty
2       good saddle pad huge fan vertigo dressage pad ...
3       perfect repair kit great weaver halter recentl...
4       work great great slightly tooshort girth sturd...
                              ...                        
9995    price right necessarily good quality price har...
9996    decent decent tent set time plenty room star b...
9997    terrible bow bad bow look nice image user frie...
9998    good bike overall good bike assembly pretty ea...
9999    decent lock work use little quality update com...
Name: review_cleaned, Length: 10000, dtype: object

In [38]:
df.head()

Unnamed: 0,rating,review,review_cleaned
0,5.0,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy not gonna lie they are not much to...
1,5.0,Excellent. I love it. Pretty!,excellent i love it pretty
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...,best saddle pads huge fan of b vertigo and thi...
3,5.0,Perfect repair kit. I have a great Weaver halt...,perfect repair kit i have a great weaver halte...
4,5.0,Works great. This was great for a slightly too...,works great this was great for a slightly toos...


In [39]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   96
  On-line CPU(s) list:    0-95
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   24
    Socket(s):            2
    Stepping:             3
    BogoMIPS:             4000.35
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 cl
                          flush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc re
                          p_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3
                           fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand
                           hypervisor lahf_lm abm 3dnowprefetc

In [40]:
nlp2 = spacy.load("en_core_web_sm")

In [41]:
nlp2.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [42]:
%%time
_ = [nlp2(t) for t in df["review_cleaned"].head(10_000)]

CPU times: user 2min 5s, sys: 432 ms, total: 2min 5s
Wall time: 2min 5s


In [43]:
%%time
_ = [nlp2(t, disable=["parser", "ner"]) for t in df["review_cleaned"].head(10_000)]

CPU times: user 52.9 s, sys: 326 ms, total: 53.2 s
Wall time: 53.1 s


In [44]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000))]

CPU times: user 1min, sys: 558 ms, total: 1min 1s
Wall time: 1min 1s


In [45]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), batch_size=500)]

CPU times: user 1min 2s, sys: 773 ms, total: 1min 3s
Wall time: 1min 3s


In [46]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), disable=["parser", "ner"])]

CPU times: user 34.4 s, sys: 61.1 ms, total: 34.5 s
Wall time: 34.4 s


In [47]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), disable=["parser", "ner"], batch_size=500)]

CPU times: user 34.8 s, sys: 221 ms, total: 35 s
Wall time: 34.9 s


In [48]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1)]

CPU times: user 9.63 s, sys: 29.5 s, total: 39.1 s
Wall time: 40.8 s


In [49]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, batch_size=500)]

CPU times: user 9.74 s, sys: 30.7 s, total: 40.4 s
Wall time: 41.7 s


In [50]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, batch_size=1_000)]

CPU times: user 10 s, sys: 30.4 s, total: 40.4 s
Wall time: 41.5 s


In [51]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, batch_size=1_500)]

CPU times: user 9.66 s, sys: 32.6 s, total: 42.2 s
Wall time: 43.3 s


In [52]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, batch_size=2_000)]

CPU times: user 10.1 s, sys: 30.4 s, total: 40.5 s
Wall time: 41.5 s


In [53]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, batch_size=2_500)]

CPU times: user 9.93 s, sys: 30.5 s, total: 40.4 s
Wall time: 41.5 s


In [54]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, batch_size=5_000)]

CPU times: user 9.76 s, sys: 36.2 s, total: 45.9 s
Wall time: 56.9 s


In [55]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"])]

CPU times: user 10.2 s, sys: 30.4 s, total: 40.6 s
Wall time: 41.7 s


In [56]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"], batch_size=500)]

CPU times: user 10.2 s, sys: 31.5 s, total: 41.7 s
Wall time: 42.7 s


In [57]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"], batch_size=1_000)]

CPU times: user 9.14 s, sys: 30.5 s, total: 39.6 s
Wall time: 40.5 s


In [58]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"], batch_size=1_500)]

CPU times: user 9.53 s, sys: 30.8 s, total: 40.3 s
Wall time: 41.3 s


In [59]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"], batch_size=2_000)]

CPU times: user 9.06 s, sys: 30.9 s, total: 39.9 s
Wall time: 40.9 s


In [60]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"], batch_size=2_500)]

CPU times: user 9.54 s, sys: 32.2 s, total: 41.8 s
Wall time: 42.7 s


In [61]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"], batch_size=5_000)]

CPU times: user 9.11 s, sys: 33.1 s, total: 42.2 s
Wall time: 43.1 s


In [63]:
# use tpu

In [64]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), disable=["parser", "ner"])]

CPU times: user 33.4 s, sys: 443 ms, total: 33.9 s
Wall time: 33.8 s


In [62]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(100_000), disable=["parser", "ner"])]

CPU times: user 5min 7s, sys: 3.16 s, total: 5min 10s
Wall time: 5min 9s


In [66]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(100_000), disable=["parser", "ner"], batch_size=10_000)]

CPU times: user 4min 56s, sys: 57.9 s, total: 5min 54s
Wall time: 5min 53s


In [68]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(100_000), disable=["parser", "ner"], batch_size=50_000)]

CPU times: user 4min 57s, sys: 57.5 s, total: 5min 55s
Wall time: 5min 52s


In [67]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(10_000), n_process=-1, disable=["parser", "ner"])]

CPU times: user 10.9 s, sys: 38.2 s, total: 49.1 s
Wall time: 51 s


In [65]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(100_000), n_process=-1, disable=["parser", "ner"])]

CPU times: user 1min 32s, sys: 40.8 s, total: 2min 12s
Wall time: 2min 14s


In [87]:
%%time
_ = [d for d in nlp2.pipe(df["review_cleaned"].head(100_000), n_process=20, disable=["parser", "ner"], batch_size=1_000)]

CPU times: user 1min 36s, sys: 19.4 s, total: 1min 55s
Wall time: 1min 55s


In [None]:
%%time
# Full-dataset run: streams every row of df["review_cleaned"] through nlp2 with
# the parser and NER components disabled, using multiple worker processes.
# NOTE(review): the list comprehension keeps every returned Doc object alive in
# `test_1`. With the ~19.6M rows reported by df.info() earlier in this notebook
# that is likely to exhaust memory — consider reducing each Doc to the string
# that is actually needed while iterating. TODO confirm on the target machine.
test_1 = [d for d in nlp2.pipe(df["review_cleaned"], n_process=-1, disable=["parser", "ner"], batch_size=100_000)]

# Sentiment Score Calculation

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
sentiment_analyzer = SentimentIntensityAnalyzer()

In [None]:
def calculate_sentiment_score(document):
    """Return the VADER compound polarity score of `document`.

    The text is scored with the notebook-level `sentiment_analyzer`, and the
    "compound" entry of its `polarity_scores` result is returned.

    Args:
        document: The review text to score.

    Returns:
        The compound score (per the VADER documentation, a value normalised
        to the range -1 to 1).
    """
    return sentiment_analyzer.polarity_scores(document)["compound"]


# Backward-compatible alias: the function was originally defined under this
# misspelled name, which existing cells still call.
calcualte_sentiment_score = calculate_sentiment_score

In [None]:
def detect_sentiment(sentiment_score, positive_threshold=0.5, negative_threshold=-0.5):
    """Map a compound sentiment score to a sentiment label.

    Args:
        sentiment_score: Numeric score to classify.
        positive_threshold: Scores greater than or equal to this value are
            labelled "positive". Defaults to 0.5, the value that was
            previously hard-coded.
        negative_threshold: Scores less than or equal to this value are
            labelled "negative". Defaults to -0.5, the value that was
            previously hard-coded.

    Returns:
        "positive", "negative" or "neutral".

    Note:
        The VADER documentation suggests +/-0.05 as typical cut-offs for the
        compound score. The defaults here keep this notebook's stricter
        +/-0.5; pass other thresholds explicitly to change that.
    """
    if sentiment_score >= positive_threshold:
        return "positive"
    if sentiment_score <= negative_threshold:
        return "negative"
    # Everything in between, and NaN (for which both comparisons are False).
    return "neutral"

In [None]:
# Reload the cleaned reviews from disk when no DataFrame is available in this
# session. `globals().get("df")` is used because referencing an undefined `df`
# directly (`if df is None`) raises NameError on a fresh kernel, so the
# fallback would never run in exactly the case it exists for.
# NOTE(review): no cell visible in this notebook writes reviews_cleaned.csv;
# it is assumed to have been produced elsewhere — confirm.
if globals().get("df") is None:
  df = pd.read_csv(f"{data_saved_path}/reviews_cleaned.csv")

In [None]:
df_test = df.head(10000).copy()

In [None]:
%%time
df_test["review_cleaned"] = df_test["review"].apply(clean_review)

In [None]:
df_test["sentiment_score"] = df_test["review"].apply(calcualte_sentiment_score)

In [None]:
df_test["sentiment"] = df_test["sentiment_score"].apply(detect_sentiment)

In [None]:
df_test["sentiment_score_cleaned"] = df_test["review_cleaned"].apply(calcualte_sentiment_score)

In [None]:
df_test["sentiment_cleaned"] = df_test["sentiment_score_cleaned"].apply(detect_sentiment)

In [None]:
df_test.head(20)

In [None]:
df_test["sentiment"].value_counts()

In [None]:
df_test["sentiment_cleaned"].value_counts()

# Save Cleaned Data

In [None]:
# Write the scored frame to CSV inside the data_saved_path directory.
# Note that df_test is the 10,000-row sample taken with df.head(10000), not the
# full dataset.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading column — confirm that this is intended.
df_test.to_csv(f"{data_saved_path}/Reviews_With_Sentiment.csv")