# Libraries Import

In [1]:
import json
import math
import numpy as np
import os
import pandas as pd
import re
import spacy
import string

from datetime import datetime
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

import multiprocessing
from multiprocessing import Manager

In [2]:
# Fetch the NLTK resources used later in this notebook.
nltk.download("stopwords")      # English stop-word list loaded by CleanReviewCommandHandler
nltk.download('punkt')          # presumably the tokenizer data behind word_tokenize — TODO confirm
nltk.download('vader_lexicon')  # presumably the lexicon SentimentIntensityAnalyzer needs — TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

# Colab Configuration

In [3]:
# Set to True to run this notebook on Google Colab
# Set to False to run it on a local machine
USE_COLAB=True

In [4]:
# If Google Colab is used, add google.colab library
if USE_COLAB:
  from google.colab import drive

In [5]:
# If Google Colab is used, mount Google Drive to Colab System
if USE_COLAB:
  drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [6]:
# Define the root path for working directory
root_path = "/content/gdrive/MyDrive/Master-Thesis/master-thesis-sentiment-analysis" if USE_COLAB else '.'

In [7]:
data_path = f"{root_path}/datasets/temp/Amazon Reviews 2023"

# Data Reading

In [None]:
dataset_path = f"{data_path}/Sports_and_Outdoors.jsonl"

In [None]:
# Peek at the first JSON line to see which fields a review record carries.
# The encoding is pinned because the review text contains non-ASCII characters
# (e.g. the typographic apostrophe); without it decoding depends on the
# machine's locale when the notebook runs outside Colab.
with open(dataset_path, 'r', encoding="utf-8") as fs:
  first_review = [json.loads(next(fs).strip()) for _ in range(1)]

first_review

[{'rating': 5.0,
  'title': 'Crazy comfy!',
  'text': 'Not gonna lie- they are not much to look at. Lol. Luckily I’m one of those ppl that values things for function over looks & these function well so far. They are seriously one of the most comfortable pairs of socks I’ve owned in 5 decades.  I have not tried to wash them yet, so fingers crossed on that rn.  They feel very cushiony.  I wear them in my winter boots & just on my feet shoeless around my home.  I wish they came in more colors.  I’m one of those ppl that absolutely cannot stand toe seams on socks, but these have not bothered me at all.  I have super high arches so the only change I would make to the socks would be some compression there.  However, the socks fit perfectly as-is which really surprised me given my arches.  I just like having compression at my arches bc it feels good on them.  I wear a ladies 10-1/2 shoe- mens 8-1/2 and I bought the medium socks. They fit perfectly.  That’s never happened.  I had honestly expe

In [None]:
def read_reviews(path=None):
  """Load the review title, text and rating from a JSON Lines file.

  Args:
    path: File to read. When omitted, the notebook-level ``dataset_path``
      is used, which keeps the original zero-argument call working.

  Returns:
    A DataFrame with the columns ``review_title``, ``review_text`` and
    ``rating``, one row per non-blank line of the file.

  Raises:
    KeyError: If a record lacks the "title", "text" or "rating" field.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  source_path = dataset_path if path is None else path

  records = []
  # Explicit encoding: the reviews contain non-ASCII characters, so relying on
  # the platform default would make decoding locale-dependent.
  with open(source_path, 'r', encoding="utf-8") as fs:
    for line in fs:
      line = line.strip()
      if not line:
        # A blank line (e.g. a trailing newline at EOF) is not a record.
        continue
      json_text = json.loads(line)
      records.append((json_text["title"], json_text["text"], json_text["rating"]))

  return pd.DataFrame(records, columns=["review_title", "review_text", "rating"])

In [None]:
%%time
df = read_reviews()

CPU times: user 1min 34s, sys: 9.82 s, total: 1min 44s
Wall time: 2min 16s


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   review_title  object 
 1   review_text   object 
 2   rating        float64
dtypes: float64(1), object(2)
memory usage: 448.5+ MB


In [None]:
df.head()

Unnamed: 0,review_title,review_text,rating
0,Crazy comfy!,Not gonna lie- they are not much to look at. L...,5.0
1,Excellent!,I love it. Pretty!,5.0
2,Best saddle pads,Huge fan of B Vertigo and this dressage pad do...,5.0
3,Perfect repair kit,"I have a great Weaver halter. Recently, the Ch...",5.0
4,Works great,This was great for a slightly too-short girth!...,5.0


In [None]:
df["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,12981998
4.0,2518170
1.0,1836990
3.0,1324911
2.0,933101


# Data Cleansing

#### Fix missing values

In [None]:
df.dropna(how="all", inplace=True)

In [None]:
df.dropna(how="all",axis=1, inplace=True)

In [None]:
%%time
df.fillna({"review_title": "", "review_text": ""}, inplace=True)

CPU times: user 6.89 s, sys: 491 ms, total: 7.38 s
Wall time: 7.28 s


#### Combine the "review_title" and "review_text" columns into a single "review" column

In [None]:
%%time
df["review"] = (df["review_title"].str.rstrip('.!? \n\t') +  ". " +  df["review_text"]).str.lstrip('.!? \n\t')

CPU times: user 20 s, sys: 3.97 s, total: 23.9 s
Wall time: 23.7 s


In [None]:
# Remove 'review_title' and 'review_text' columns
df.drop(columns=["review_title", "review_text"], inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   rating  float64
 1   review  object 
dtypes: float64(1), object(1)
memory usage: 299.0+ MB


In [None]:
df.head()

Unnamed: 0,rating,review
0,5.0,Crazy comfy. Not gonna lie- they are not much ...
1,5.0,Excellent. I love it. Pretty!
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...
3,5.0,Perfect repair kit. I have a great Weaver halt...
4,5.0,Works great. This was great for a slightly too...


# Text Processing

In [8]:
from abc import ABC, abstractmethod

In [9]:
class SentimentTaskHandler(ABC):
  """Interface for the three-phase handlers run by SentimentTaskManager.execute.

  Every hook is abstract; the base implementations do nothing and return None.
  """

  @abstractmethod
  def before_handle(self, data_frame, shared_dict=None):
    """Preparation hook, called with the whole frame before any page is processed."""

  @abstractmethod
  def on_handle(self, data_frame, shared_dict=None, task_number=1):
    """Per-page hook; `task_number` is the 1-based page number of `data_frame`."""

  @abstractmethod
  def after_handle(self, data_frame, shared_dict=None):
    """Merge hook, called with the whole frame after a batch of pages has finished."""

In [10]:
class SentimentTaskManager:
  """Runs a handler over a DataFrame page by page, using one process per page."""

  @staticmethod
  def execute(data_frame, page_size=100, cpu_count = 1, command_handler: "SentimentTaskHandler" = None):
    """Split `data_frame` into pages and process them in batches of worker processes.

    `command_handler.before_handle` is called once up front. The pages are then
    processed in batches of at most `cpu_count` processes, each running
    `command_handler.on_handle` on one page. After every batch has been joined,
    `command_handler.after_handle` is called and the shared dict is cleared.

    Args:
      data_frame: Frame to process; pages are taken with `iloc[start:end]`.
      page_size: Number of rows per page (must be >= 1).
      cpu_count: Maximum number of pages processed concurrently (must be >= 1).
      command_handler: Object providing before_handle / on_handle / after_handle.
        The annotation is a forward reference so this class does not depend on
        the order in which the notebook cells were defined.

    Raises:
      ValueError: If `command_handler` is None, or if `page_size` or
        `cpu_count` is smaller than 1.
    """
    # Validate everything before any process or manager is created.
    if command_handler is None:
      raise ValueError("'command_handler' object not constructed. Cannot access a 'None' object.")
    if page_size < 1:
      raise ValueError(f"'page_size' must be at least 1, got {page_size}.")
    if cpu_count < 1:
      raise ValueError(f"'cpu_count' must be at least 1, got {cpu_count}.")

    total_records = len(data_frame.index)
    total_pages = math.ceil(total_records / page_size)
    failed_pages = []

    print(f"Page Size    : {page_size}")
    print(f"Total records: {total_records}")
    print(f"Total pages  : {total_pages}")
    with Manager() as manager:
      shared_dict = manager.dict()

      command_handler.before_handle(data_frame, shared_dict)

      # Each iteration launches one batch of up to `cpu_count` pages.
      for batch_start in range(0, total_pages, cpu_count):
        batch_end = min(batch_start + cpu_count, total_pages)
        processes = []

        for page_index in range(batch_start, batch_end):
          page_number = page_index + 1
          start = page_index * page_size
          end = min(start + page_size, total_records)

          print("-" * 16)
          print(f"Page: {page_number} - From {start} to {end}")

          worker = multiprocessing.Process(
              target=command_handler.on_handle,
              args=(data_frame.iloc[start:end], shared_dict, page_number)
          )
          processes.append((page_number, worker))

        print("-" * 16)

        for _, worker in processes:
          worker.start()

        for page_number, worker in processes:
          worker.join()
          # A non-zero exit code means the worker died (exception, kill, ...);
          # its page contributes nothing to shared_dict, so make that visible.
          if worker.exitcode != 0:
            failed_pages.append(page_number)
            print(f"WARNING: task {page_number} exited with code {worker.exitcode}; its results are missing")

        command_handler.after_handle(data_frame, shared_dict)
        shared_dict.clear()

      print("All tasks are completed")
      if failed_pages:
        print(f"WARNING: {len(failed_pages)} task(s) did not finish cleanly: {failed_pages}")

#### Convert text to lowercase - Remove punctuation, tokens containing digits, short tokens and stop words

In [None]:
class CleanReviewCommandHandler(SentimentTaskHandler):
  """Fills the 'review_cleaned' column with a normalised version of 'review'.

  Cleaning lower-cases the text, strips ASCII punctuation, removes every token
  that contains a digit, and keeps only tokens of at least three characters
  that are not in the English stop-word list.
  """

  def __init__(self):
    # A set makes the per-token membership test O(1) instead of scanning a list.
    self.__stop_words = set(stopwords.words('english'))
    # Built once here instead of once per document.
    self.__punctuation_table = str.maketrans('', '', string.punctuation)


  def __clean_review_core(self, document):
    """Return the cleaned form of one review; non-string input yields ""."""
    if not isinstance(document, str):
      return ""

    # NOTE(review): punctuation is removed before the stop-word filter, so a
    # stop word spelled with an apostrophe would no longer match — confirm
    # whether that is intended.
    document = document.lower().translate(self.__punctuation_table)
    document = re.sub(r'\w*\d\w*', '', document)
    return " ".join(token for token in word_tokenize(document) if len(token) >= 3 and token not in self.__stop_words)


  def before_handle(self, data_frame, shared_dict=None):
    """Reset the 'review_cleaned' column to empty strings."""
    print("Set empty value for 'review_cleaned' column")
    data_frame.loc[:,"review_cleaned"] = ""


  def on_handle(self, data_frame, shared_dict=None, task_number=1):
    """Clean every review in `data_frame`.

    Without a `shared_dict` the result is written straight into the frame.
    Otherwise the results are stored in `shared_dict`, keyed by index label.
    """
    print(f"Task {task_number} is started at {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")

    # Only the 'review' column is needed, so iterate over it directly rather
    # than materialising a Series per row with iterrows().
    reviews = data_frame["review"]

    if shared_dict is None:
      for i, review in reviews.items():
        data_frame.at[i, "review_cleaned"] = self.__clean_review_core(review)
    else:
      # Collect locally and publish once: every write to the shared dict is a
      # round trip to the manager process, which is costly when done per row.
      shared_dict.update({i: self.__clean_review_core(review) for i, review in reviews.items()})

    print(f"Task {task_number} is completed at {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")


  def after_handle(self, data_frame, shared_dict=None):
    """Copy the results collected in `shared_dict` into 'review_cleaned'."""
    print("Merge results from all tasks")
    if shared_dict is None:
      # Nothing was collected; on_handle wrote into the frame directly.
      return

    for k, v in shared_dict.items():
      data_frame.at[k, "review_cleaned"] = v

In [None]:
%%time
# Run configurations; the trailing comment is the measured wall time.
cleaned_tasks = [
    { "id": 0, "count": -1, "page_size": 1_000_000, "cpu_count": 10 },  # 33min 17s
    { "id": 1, "count": 100, "page_size": 10, "cpu_count": 10 }
]

# Pick a configuration by its "id"; an unknown id falls back to the 100-row run.
selected_id = 0
selected_cleaned_task = next(
    (task for task in cleaned_tasks if task["id"] == selected_id),
    cleaned_tasks[1]
)

df["review_cleaned"] = ""

# A positive "count" limits the run to the first rows. The handler writes into
# the frame it receives, and writes to a slice of `df` are not guaranteed to
# reach `df` itself, so sample runs use an explicit copy that is merged back.
clean_on_sample = selected_cleaned_task["count"] > 0
cleaned_frame = df.iloc[: selected_cleaned_task["count"]].copy() if clean_on_sample else df

SentimentTaskManager.execute(
    data_frame=cleaned_frame,
    page_size=selected_cleaned_task["page_size"],
    cpu_count=selected_cleaned_task["cpu_count"],
    command_handler=CleanReviewCommandHandler()
)

if clean_on_sample:
  df.loc[cleaned_frame.index, "review_cleaned"] = cleaned_frame["review_cleaned"]

Page Size    : 1000000
Total records: 19595170
Total pages  : 20
Set empty value for 'review_cleaned' column
----------------
Page: 1 - From 0 to 1000000
----------------
Page: 2 - From 1000000 to 2000000
----------------
Page: 3 - From 2000000 to 3000000
----------------
Page: 4 - From 3000000 to 4000000
----------------
Page: 5 - From 4000000 to 5000000
----------------
Page: 6 - From 5000000 to 6000000
----------------
Page: 7 - From 6000000 to 7000000
----------------
Page: 8 - From 7000000 to 8000000
----------------
Page: 9 - From 8000000 to 9000000
----------------
Page: 10 - From 9000000 to 10000000
----------------
Task 1 is started at 04/09/2024 07:20:43
Task 2 is started at 04/09/2024 07:20:43
Task 3 is started at 04/09/2024 07:20:43
Task 4 is started at 04/09/2024 07:20:44
Task 5 is started at 04/09/2024 07:20:44
Task 6 is started at 04/09/2024 07:20:44
Task 7 is started at 04/09/2024 07:20:45
Task 8 is started at 04/09/2024 07:20:45
Task 9 is started at 04/09/2024 07:20:45

In [None]:
(df["review_cleaned"].values == '').sum()

23153

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 3 columns):
 #   Column          Dtype  
---  ------          -----  
 0   rating          float64
 1   review          object 
 2   review_cleaned  object 
dtypes: float64(1), object(2)
memory usage: 448.5+ MB


In [None]:
df.head()

Unnamed: 0,rating,review,review_cleaned
0,5.0,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy gon lie much look lol luckily one ...
1,5.0,Excellent. I love it. Pretty!,excellent love pretty
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...,best saddle pads huge fan vertigo dressage pad...
3,5.0,Perfect repair kit. I have a great Weaver halt...,perfect repair kit great weaver halter recentl...
4,5.0,Works great. This was great for a slightly too...,works great great slightly tooshort girth stur...


#### Lemmatize text

In [None]:
class LemmatizeReviewCommandHandler(SentimentTaskHandler):
  """Fills the 'review_lemmatized' column with the lemmas of 'review_cleaned'.

  Lemmas shorter than three characters and tokens spaCy flags as stop words
  are dropped.
  """

  def __init__(self):
    # The parser and NER components are switched off when loading the pipeline.
    self.__nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


  def __lemmatize_review_core(self, document):
    """Return the space-joined lemmas of one document; non-string input yields ""."""
    if not isinstance(document, str):
      return ""

    return " ".join(token.lemma_ for token in self.__nlp(document) if len(token.lemma_) >= 3 and not token.is_stop)


  def before_handle(self, data_frame, shared_dict=None):
    """Reset the 'review_lemmatized' column to empty strings."""
    print("Set empty value for 'review_lemmatized' column")
    data_frame.loc[:,"review_lemmatized"] = ""


  def on_handle(self, data_frame, shared_dict=None, task_number=1):
    """Lemmatize every cleaned review in `data_frame`.

    Without a `shared_dict` the result is written straight into the frame.
    Otherwise the results are stored in `shared_dict`, keyed by index label.
    """
    print(f"Task {task_number} is started at {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")

    # Only the 'review_cleaned' column is needed, so iterate over it directly
    # rather than materialising a Series per row with iterrows().
    cleaned_reviews = data_frame["review_cleaned"]

    if shared_dict is None:
      for i, cleaned in cleaned_reviews.items():
        data_frame.at[i, "review_lemmatized"] = self.__lemmatize_review_core(cleaned)
    else:
      # Collect locally and publish once: every write to the shared dict is a
      # round trip to the manager process, which is costly when done per row.
      shared_dict.update({i: self.__lemmatize_review_core(cleaned) for i, cleaned in cleaned_reviews.items()})

    print(f"Task {task_number} is completed at {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")


  def after_handle(self, data_frame, shared_dict=None):
    """Copy the results collected in `shared_dict` into 'review_lemmatized'."""
    print("Merge results from all tasks")
    if shared_dict is None:
      # Nothing was collected; on_handle wrote into the frame directly.
      return

    for k, v in shared_dict.items():
      data_frame.at[k, "review_lemmatized"] = v

In [None]:
%%time
# Run configurations; the trailing comment is the measured wall time.
lemmatized_tasks = [
    { "id": 0, "count": -1, "page_size": 200_000, "cpu_count": 50 },         # 1h 43s
    { "id": 1, "count": 4_000_000, "page_size": 200_000, "cpu_count": 20 },  # 16mins
    { "id": 2, "count": 5_000_000, "page_size": 200_000, "cpu_count": 25 },  # 17mins 7s
    { "id": 3, "count": 10_000_000, "page_size": 200_000, "cpu_count": 50 }  # 31min 32s
]

# Pick a configuration by its "id"; an unknown id falls back to configuration 1.
selected_id = 0
selected_lemmatized_task = next(
    (task for task in lemmatized_tasks if task["id"] == selected_id),
    lemmatized_tasks[1]
)

df["review_lemmatized"] = ""

# A positive "count" limits the run to the first rows. The handler writes into
# the frame it receives, and writes to a slice of `df` are not guaranteed to
# reach `df` itself, so sample runs use an explicit copy that is merged back.
lemmatize_on_sample = selected_lemmatized_task["count"] > 0
lemmatized_frame = df.iloc[: selected_lemmatized_task["count"]].copy() if lemmatize_on_sample else df

SentimentTaskManager.execute(
    data_frame=lemmatized_frame,
    page_size=selected_lemmatized_task["page_size"],
    cpu_count=selected_lemmatized_task["cpu_count"],
    command_handler=LemmatizeReviewCommandHandler()
)

if lemmatize_on_sample:
  df.loc[lemmatized_frame.index, "review_lemmatized"] = lemmatized_frame["review_lemmatized"]

Page Size    : 200000
Total records: 19595170
Total pages  : 98
Set empty value for 'review_lemmatized' column
----------------
Page: 1 - From 0 to 200000
----------------
Page: 2 - From 200000 to 400000
----------------
Page: 3 - From 400000 to 600000
----------------
Page: 4 - From 600000 to 800000
----------------
Page: 5 - From 800000 to 1000000
----------------
Page: 6 - From 1000000 to 1200000
----------------
Page: 7 - From 1200000 to 1400000
----------------
Page: 8 - From 1400000 to 1600000
----------------
Page: 9 - From 1600000 to 1800000
----------------
Page: 10 - From 1800000 to 2000000
----------------
Page: 11 - From 2000000 to 2200000
----------------
Page: 12 - From 2200000 to 2400000
----------------
Page: 13 - From 2400000 to 2600000
----------------
Page: 14 - From 2600000 to 2800000
----------------
Page: 15 - From 2800000 to 3000000
----------------
Page: 16 - From 3000000 to 3200000
----------------
Page: 17 - From 3200000 to 3400000
----------------
Page: 18 - 

In [None]:
(df["review_lemmatized"].values == '').sum()

27863

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 4 columns):
 #   Column             Dtype  
---  ------             -----  
 0   rating             float64
 1   review             object 
 2   review_cleaned     object 
 3   review_lemmatized  object 
dtypes: float64(1), object(3)
memory usage: 598.0+ MB


In [None]:
df.head()

Unnamed: 0,rating,review,review_cleaned,review_lemmatized
0,5.0,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy gon lie much look lol luckily one ...,crazy comfy gon lie look lol luckily ppl value...
1,5.0,Excellent. I love it. Pretty!,excellent love pretty,excellent love pretty
2,5.0,Best saddle pads. Huge fan of B Vertigo and th...,best saddle pads huge fan vertigo dressage pad...,good saddle pad huge fan vertigo dressage pad ...
3,5.0,Perfect repair kit. I have a great Weaver halt...,perfect repair kit great weaver halter recentl...,perfect repair kit great weaver halter recentl...
4,5.0,Works great. This was great for a slightly too...,works great great slightly tooshort girth stur...,work great great slightly tooshort girth sturd...


In [None]:
# df.drop(columns=["review"], inplace=True)

# Backup Lemmatized Step

In [None]:
# df.to_json(f"{data_path}/Reviews_Lemmatized_Min.zip",  orient="records", lines=True, compression="zip")

# Restore Lemmatized Step

In [None]:
%%time
restore_path_1 = f"{data_path}/Reviews_Lemmatized_Min.zip"

# Reload the lemmatized backup only when this session has no `df` yet.
if 'df' in globals():
  pass  # a frame is already in memory; keep it
elif os.path.exists(restore_path_1):
  print(f"Read data from Reviews_Lemmatized_Min.zip")
  df = pd.read_json(restore_path_1, orient="records", lines=True, compression="zip")
else:
  print(f"{restore_path_1} not found")

Read data from Reviews_Lemmatized_Min.zip
CPU times: user 2min 18s, sys: 1min 30s, total: 3min 48s
Wall time: 4min


In [None]:
df.head()

Unnamed: 0,rating,review,review_cleaned,review_lemmatized
0,5,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy gon lie much look lol luckily one ...,crazy comfy gon lie look lol luckily ppl value...
1,5,Excellent. I love it. Pretty!,excellent love pretty,excellent love pretty
2,5,Best saddle pads. Huge fan of B Vertigo and th...,best saddle pads huge fan vertigo dressage pad...,good saddle pad huge fan vertigo dressage pad ...
3,5,Perfect repair kit. I have a great Weaver halt...,perfect repair kit great weaver halter recentl...,perfect repair kit great weaver halter recentl...
4,5,Works great. This was great for a slightly too...,works great great slightly tooshort girth stur...,work great great slightly tooshort girth sturd...


# Sentiment Score

In [None]:
class DetectSentimentCommandHandler(SentimentTaskHandler):
  """Scores the three text variants of each review and labels them.

  For 'review', 'review_cleaned' and 'review_lemmatized' the analyzer's
  "compound" score is stored together with a label derived from it
  (1 / 0 / -1). 'rating_sentiment' holds the label derived from 'rating'.
  """

  # (source text column, score column, label column), in the order in which
  # the scores appear in each result tuple.
  _TEXT_OUTPUTS = (
      ("review", "review_raw_sentiment_score", "review_raw_sentiment"),
      ("review_cleaned", "review_cleaned_sentiment_score", "review_cleaned_sentiment"),
      ("review_lemmatized", "review_lemmatized_sentiment_score", "review_lemmatized_sentiment"),
  )
  _RATING_LABEL_COLUMN = "rating_sentiment"

  def __init__(self):
    self.__sentiment_analyzer = SentimentIntensityAnalyzer()


  def __calculate_sentiment_score(self, document):
    """Return the analyzer's "compound" score; non-string input is scored as ""."""
    if not isinstance(document, str):
      document = ""
    return self.__sentiment_analyzer.polarity_scores(document)["compound"]


  def __detect_sentiment_based_on_review_score(self, score):
    """Map a score to 1 (>= 0.5), -1 (<= -0.5) or 0 (anything in between)."""
    # NOTE(review): the 0.5 cut-offs are this notebook's own choice — confirm
    # they are intended rather than a tighter threshold.
    return 1 if score >= 0.5 else -1 if score <= -0.5 else 0


  def __detect_sentiment_based_on_review_rating(self, rating):
    """Map a rating to 1 (> 3), -1 (< 3) or 0 (exactly 3 or not comparable)."""
    return 1 if rating > 3 else -1 if rating < 3 else 0


  def __score_rows(self, data_frame):
    """Return {index label: (raw score, cleaned score, lemmatized score, rating label)}."""
    rows = zip(
        data_frame.index,
        data_frame["review"],
        data_frame["review_cleaned"],
        data_frame["review_lemmatized"],
        data_frame["rating"]
    )
    return {
        i: (
            self.__calculate_sentiment_score(raw),
            self.__calculate_sentiment_score(cleaned),
            self.__calculate_sentiment_score(lemmatized),
            self.__detect_sentiment_based_on_review_rating(rating)
        )
        for i, raw, cleaned, lemmatized, rating in rows
    }


  def __store_results(self, data_frame, results):
    """Write (index label, result tuple) pairs into the output columns."""
    for k, v in results:
      # zip stops after the three text scores; v[3] is the rating label.
      for (_, score_column, label_column), score in zip(self._TEXT_OUTPUTS, v):
        data_frame.at[k, score_column] = score
        data_frame.at[k, label_column] = self.__detect_sentiment_based_on_review_score(score)
      data_frame.at[k, self._RATING_LABEL_COLUMN] = v[3]


  def before_handle(self, data_frame, shared_dict=None):
    """Reset all seven output columns to NaN."""
    output_columns = [column for _, score_column, label_column in self._TEXT_OUTPUTS for column in (score_column, label_column)]
    output_columns.append(self._RATING_LABEL_COLUMN)

    for position, column in enumerate(output_columns):
      if position:
        print()
      print(f"Set empty value for '{column}' column")
      data_frame.loc[:, column] = np.nan


  def on_handle(self, data_frame, shared_dict=None, task_number=1):
    """Score every row of `data_frame`.

    Without a `shared_dict` the scores and labels are written straight into
    the frame. Otherwise the result tuples are stored in `shared_dict`, keyed
    by index label, and the labels are derived later in `after_handle`.
    """
    print(f"Task {task_number} is started at {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")

    results = self.__score_rows(data_frame)

    if shared_dict is None:
      self.__store_results(data_frame, results.items())
    else:
      # Publish once: every write to the shared dict is a round trip to the
      # manager process, which is costly when done per row.
      shared_dict.update(results)

    print(f"Task {task_number} is completed at {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}")


  def after_handle(self, data_frame, shared_dict=None):
    """Copy the results collected in `shared_dict` into the output columns."""
    print("Merge results from all tasks")
    if shared_dict is None:
      # Nothing was collected; on_handle wrote into the frame directly.
      return

    self.__store_results(data_frame, shared_dict.items())


In [None]:
%%time
# Run configurations; the trailing comment is the measured wall time.
sentiment_tasks = [
    { "id": 0, "count": -1, "page_size": 100_000, "cpu_count": 10 },            # 1h 14min 37s
    { "id": 1, "count": 100, "page_size": 100, "cpu_count": 1 },                # 2.72 s
    { "id": 2, "count": 10_000, "page_size": 10_000, "cpu_count": 1 },          # 24.1 s
    { "id": 3, "count": 100_000, "page_size": 100_000, "cpu_count": 1 },        # 2min 33s
    { "id": 4, "count": 100_000, "page_size": 10_000, "cpu_count": 10 },        # 21.4 s
    { "id": 5, "count": 1_000_000, "page_size": 1_000_000, "cpu_count": 1 },    # 13min 29s
    { "id": 6, "count": 1_000_000, "page_size": 100_000, "cpu_count": 10 },     # 2min 50s
    { "id": 7, "count": 2_000_000, "page_size": 100_000, "cpu_count": 20 },     # 6min 20s
    { "id": 8, "count": 2_000_000, "page_size": 200_000, "cpu_count": 10 },     # 5min 30s
    { "id": 9, "count": 10_000_000, "page_size": 1_000_000, "cpu_count": 10 }   # 27min
]

# Pick a configuration by its "id"; an unknown id falls back to the 100-row run.
selected_id = 0
selected_sentiment_task = next(
    (task for task in sentiment_tasks if task["id"] == selected_id),
    sentiment_tasks[1]
)

# Output columns filled by DetectSentimentCommandHandler.
sentiment_result_columns = [
    "review_raw_sentiment_score",
    "review_raw_sentiment",
    "review_cleaned_sentiment_score",
    "review_cleaned_sentiment",
    "review_lemmatized_sentiment_score",
    "review_lemmatized_sentiment",
    "rating_sentiment",
]
for result_column in sentiment_result_columns:
  df[result_column] = np.nan

# A positive "count" limits the run to the first rows. The handler writes into
# the frame it receives, and writes to a slice of `df` are not guaranteed to
# reach `df` itself, so sample runs use an explicit copy that is merged back.
score_on_sample = selected_sentiment_task["count"] > 0
sentiment_frame = df.iloc[: selected_sentiment_task["count"]].copy() if score_on_sample else df

SentimentTaskManager.execute(
    data_frame=sentiment_frame,
    page_size=selected_sentiment_task["page_size"],
    cpu_count=selected_sentiment_task["cpu_count"],
    command_handler=DetectSentimentCommandHandler()
)

if score_on_sample:
  df.loc[sentiment_frame.index, sentiment_result_columns] = sentiment_frame[sentiment_result_columns]

Page Size    : 100000
Total records: 19595170
Total pages  : 196
Set empty value for 'review_raw_sentiment_score' column
Set empty value for 'review_raw_sentiment' column
Set empty value for 'review_cleaned_sentiment_score' column
Set empty value for 'review_cleaned_sentiment' column
Set empty value for 'review_lemmatized_sentiment_score' column
Set empty value for 'review_lemmatized_sentiment' column
Set empty value for 'rating_sentiment' column
----------------
Page: 1 - From 0 to 100000
----------------
Page: 2 - From 100000 to 200000
----------------
Page: 3 - From 200000 to 300000
----------------
Page: 4 - From 300000 to 400000
----------------
Page: 5 - From 400000 to 500000
----------------
Page: 6 - From 500000 to 600000
----------------
Page: 7 - From 600000 to 700000
----------------
Page: 8 - From 700000 to 800000
----------------
Page: 9 - From 800000 to 900000
----------------
Page: 10 - From 900000 to 1000000
----------------
Task 1 is started at 04/09/2024 12:13:21
Task

In [None]:
# Count the rows that were never labelled. NaN never compares equal to
# anything (itself included), so `== np.nan` would always report 0.
df["review_cleaned_sentiment"].isna().sum()

0

In [None]:
# Count the rows that were never labelled. NaN never compares equal to
# anything (itself included), so `== np.nan` would always report 0.
df["review_lemmatized_sentiment"].isna().sum()

0

In [None]:
# Count the rows that were never labelled. NaN never compares equal to
# anything (itself included), so `== np.nan` would always report 0.
df["rating_sentiment"].isna().sum()

0

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19595170 entries, 0 to 19595169
Data columns (total 11 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   rating                             int64  
 1   review                             object 
 2   review_cleaned                     object 
 3   review_lemmatized                  object 
 4   review_raw_sentiment_score         float64
 5   review_raw_sentiment               float64
 6   review_cleaned_sentiment_score     float64
 7   review_cleaned_sentiment           float64
 8   review_lemmatized_sentiment_score  float64
 9   review_lemmatized_sentiment        float64
 10  rating_sentiment                   float64
dtypes: float64(7), int64(1), object(3)
memory usage: 1.6+ GB


In [None]:
df.head()

Unnamed: 0,rating,review,review_cleaned,review_lemmatized,review_raw_sentiment_score,review_raw_sentiment,review_cleaned_sentiment_score,review_cleaned_sentiment,review_lemmatized_sentiment_score,review_lemmatized_sentiment,rating_sentiment
0,5,Crazy comfy. Not gonna lie- they are not much ...,crazy comfy gon lie much look lol luckily one ...,crazy comfy gon lie look lol luckily ppl value...,0.9974,1.0,0.9951,1.0,0.9959,1.0,1.0
1,5,Excellent. I love it. Pretty!,excellent love pretty,excellent love pretty,0.908,1.0,0.9022,1.0,0.9022,1.0,1.0
2,5,Best saddle pads. Huge fan of B Vertigo and th...,best saddle pads huge fan vertigo dressage pad...,good saddle pad huge fan vertigo dressage pad ...,0.9143,1.0,0.9091,1.0,0.8298,1.0,1.0
3,5,Perfect repair kit. I have a great Weaver halt...,perfect repair kit great weaver halter recentl...,perfect repair kit great weaver halter recentl...,0.9576,1.0,0.9496,1.0,0.9485,1.0,1.0
4,5,Works great. This was great for a slightly too...,works great great slightly tooshort girth stur...,work great great slightly tooshort girth sturd...,0.9323,1.0,0.9246,1.0,0.9246,1.0,1.0


In [None]:
df.drop(columns=["rating", "review", "review_cleaned", "review_raw_sentiment_score", "review_cleaned_sentiment_score", "review_lemmatized_sentiment_score"], inplace=True)

In [None]:
df.head()

Unnamed: 0,review_lemmatized,review_raw_sentiment,review_cleaned_sentiment,review_lemmatized_sentiment,rating_sentiment
0,crazy comfy gon lie look lol luckily ppl value...,1.0,1.0,1.0,1.0
1,excellent love pretty,1.0,1.0,1.0,1.0
2,good saddle pad huge fan vertigo dressage pad ...,1.0,1.0,1.0,1.0
3,perfect repair kit great weaver halter recentl...,1.0,1.0,1.0,1.0
4,work great great slightly tooshort girth sturd...,1.0,1.0,1.0,1.0


# Backup Sentiment Score Step

In [None]:
# df.to_json(f"{data_path}/Reviews_Sentiment_Min.zip",  orient="records", lines=True, compression="zip")

# Restore Sentiment Score Step

In [11]:
%%time
restore_path_2 = f"{data_path}/Reviews_Sentiment_Min.zip"

# Reload the sentiment backup only when this session has no `df` yet.
if 'df' in globals():
  pass  # a frame is already in memory; keep it
elif os.path.exists(restore_path_2):
  print(f"Read data from Reviews_Sentiment_Min.zip")
  df = pd.read_json(restore_path_2, orient="records", lines=True, compression="zip")
else:
  print(f"{restore_path_2} not found")

Read data from Reviews_Sentiment_Min.zip
CPU times: user 1min 13s, sys: 38.1 s, total: 1min 52s
Wall time: 2min


In [12]:
df.head()

Unnamed: 0,review_lemmatized,review_raw_sentiment,review_cleaned_sentiment,review_lemmatized_sentiment,rating_sentiment
0,crazy comfy gon lie look lol luckily ppl value...,1,1,1,1
1,excellent love pretty,1,1,1,1
2,good saddle pad huge fan vertigo dressage pad ...,1,1,1,1
3,perfect repair kit great weaver halter recentl...,1,1,1,1
4,work great great slightly tooshort girth sturd...,1,1,1,1


# Sentiment Detection

In [28]:
sentiment_columns = ["review_raw_sentiment", "review_cleaned_sentiment", "review_lemmatized_sentiment", "rating_sentiment"]

In [29]:
%%time
df["total_points"] = df[sentiment_columns].sum(axis="columns")

CPU times: user 1.74 s, sys: 408 ms, total: 2.15 s
Wall time: 2.12 s


In [30]:
%time
df["neutral_count"] = 4 - (df[sentiment_columns] ** 2).sum(axis="columns")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 12.2 µs


In [33]:
penalty = -0.1

In [38]:
%time
df["adjusted_total_points"] = df["total_points"] * (1 + df["neutral_count"] * penalty)

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 11.7 µs


In [39]:
%time
df["final_sentiment"] = df["adjusted_total_points"].map(lambda x: 1 if x >= 0.9 else -1 if x <= -0.9 else 0)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 12.9 µs


In [41]:
df.head(20)

Unnamed: 0,review_lemmatized,review_raw_sentiment,review_cleaned_sentiment,review_lemmatized_sentiment,rating_sentiment,total_points,neutral_count,adjusted_total_points,final_sentiment
0,crazy comfy gon lie look lol luckily ppl value...,1,1,1,1,4,0,4.0,1
1,excellent love pretty,1,1,1,1,4,0,4.0,1
2,good saddle pad huge fan vertigo dressage pad ...,1,1,1,1,4,0,4.0,1
3,perfect repair kit great weaver halter recentl...,1,1,1,1,4,0,4.0,1
4,work great great slightly tooshort girth sturd...,1,1,1,1,4,0,4.0,1
5,great stirrup bar grip grip pretty great great...,1,1,1,1,4,0,4.0,1
6,perfect perfect boy look adorable love padding...,1,1,1,1,4,0,4.0,1
7,awesome leather amazing trainer notice right a...,1,1,1,1,4,0,4.0,1
8,nice product size product sizing way woman pur...,1,1,1,1,4,0,4.0,1
9,mixed feeling love chalk bag online person lik...,1,1,1,1,4,0,4.0,1
