In [44]:
# import warnings
# warnings.filterwarnings("ignore")

import pandas as pd
import nltk
import re
import random
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Single seed reused by the sampling and splitting calls in this notebook.
random_state = 42
# Seeds Python's built-in `random` module only; the pandas / scikit-learn calls
# in later cells are given `random_state` explicitly instead.
random.seed(random_state)
from nltk.corpus import stopwords

# Fetch the stop-word corpus that `stopwords.words("english")` reads later on.
# NOTE(review): the "lem" option relies on WordNetLemmatizer, which presumably
# needs the "wordnet" corpus too — it is not downloaded here; confirm it is installed.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
# Load the combined review CSV and print the percentage of missing values per column.
data_path = "./data/combined.csv"
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(data_path, low_memory=False)
print(df.isna().sum() / len(df) * 100)

overall            0.000000
verified           0.000000
reviewTime         0.000000
reviewerID         0.000000
asin               0.000000
style             50.710965
reviewerName       0.016280
reviewText         0.049506
summary            0.025620
unixReviewTime     0.000000
vote              84.972458
image             97.894193
dtype: float64


In [46]:
# Absolute number of missing values in each column.
print(df.isna().sum())

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             380030
reviewerName         122
reviewText           371
summary              192
unixReviewTime         0
vote              636787
image             733623
dtype: int64


In [47]:
# Share of missing cells across the whole frame, in percent.
# (`shpae` keeps its original spelling in case other cells refer to it.)
shpae = df.shape
n_rows, n_cols = shpae
missing_cells = df.isna().sum().sum()
print(missing_cells / (n_rows * n_cols) * 100)

19.47241852636673


In [48]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [49]:
# Restrict the frame to these four columns; every other column is discarded.
df = df[["overall", "reviewText", "summary", "verified"]]

In [50]:
# Percentage of missing values per column after the column selection.
print(df.isna().sum() / len(df) * 100)

overall       0.000000
reviewText    0.049506
summary       0.025620
verified      0.000000
dtype: float64


In [51]:
# Remove every row that has a missing value in any of the remaining columns.
df = df.dropna()

In [52]:
# Re-check the missing-value percentages after dropping incomplete rows.
print(df.isna().sum() / len(df) * 100)

overall       0.0
reviewText    0.0
summary       0.0
verified      0.0
dtype: float64


In [53]:
# Derive a three-class label from `overall` (> 3 -> 1, < 3 -> -1, otherwise 0)
# and prepend the summary to the review body, separated by one space.
# A new frame is built with `assign` / `drop` instead of writing into `df` in
# place: `df` is the result of earlier filtering, where in-place column writes
# can raise pandas' SettingWithCopyWarning.
df = df.assign(
    # Boolean arithmetic gives the same mapping as a per-row lambda, vectorised.
    sentiment=(df["overall"] > 3).astype("int64") - (df["overall"] < 3).astype("int64"),
    reviewText=df["summary"] + " " + df["reviewText"],
).drop(columns=["overall", "summary"])
df.head()

Unnamed: 0,reviewText,verified,sentiment
0,Five Stars As advertised. Reasonably priced,True,1
1,Good for the face Like the oder and the feel w...,True,1
2,Smells awful I bought this to smell nice after...,True,-1
3,Truth is There IS Nothing Like an AQUA VELVA M...,False,1
4,Bvlgari Shampoo If you ever want to feel pampe...,True,1


In [54]:
# Number of rows per sentiment label.
df["sentiment"].value_counts()

sentiment
 1    657241
 0     47109
-1     44501
Name: count, dtype: int64

In [55]:
# Preview the labelled frame.
df.head()

Unnamed: 0,reviewText,verified,sentiment
0,Five Stars As advertised. Reasonably priced,True,1
1,Good for the face Like the oder and the feel w...,True,1
2,Smells awful I bought this to smell nice after...,True,-1
3,Truth is There IS Nothing Like an AQUA VELVA M...,False,1
4,Bvlgari Shampoo If you ever want to feel pampe...,True,1


In [56]:
# Balance the classes by drawing the same number of rows for every sentiment label.
SAMPLES_PER_CLASS = 100_000
df = pd.concat(
    [
        # Sample with replacement only when a class has fewer rows than requested,
        # so larger classes are not padded with needless duplicate reviews.
        group.sample(
            n=SAMPLES_PER_CLASS,
            replace=len(group) < SAMPLES_PER_CLASS,
            random_state=random_state,
        )
        # Iterating the groups (sorted by label) avoids `groupby.apply` acting on
        # the grouping column, which newer pandas releases warn about.
        for _, group in df.groupby("sentiment")
    ],
    ignore_index=True,
)
# NOTE(review): classes smaller than SAMPLES_PER_CLASS are still oversampled here,
# i.e. before any train/test split, so copies of the same review can end up on
# both sides of a later split and inflate test scores. Consider splitting first
# and oversampling only the training part.
df["sentiment"].value_counts()

sentiment
-1    100000
 0    100000
 1    100000
Name: count, dtype: int64

In [57]:
# Preview the resampled frame.
df.head()

Unnamed: 0,reviewText,verified,sentiment
0,Intuit is One Unethical Company. Any alternati...,False,-1
1,Very disappointed Very disappointed guitar cam...,True,-1
2,Not what I was hoping for The rings did not pe...,True,-1
3,Mine broke. Very light use. Padded case. Maybe...,False,-1
4,"Two Stars Not the greatest, really flimsy.",True,-1


In [58]:
# English stop words from NLTK, stored as a set for fast membership tests.
STOP_WORDS = set(stopwords.words("english"))

In [59]:
# Compiled once; raw strings avoid the invalid-escape warnings that "\d" and
# "\w" cause inside ordinary string literals.
_DIGITS_RE = re.compile(r"\d")
# Everything that is neither a word character, whitespace nor an apostrophe.
_PUNCT_EXCEPT_APOSTROPHE_RE = re.compile(r"[^\w\s']")


def preprocess_text(sentence, stop, type_proc=None):
    """Lower-case, tokenise and clean a sentence, dropping stop words.

    The sentence is split on whitespace. Digits and punctuation are removed
    from every token; tokens that are empty afterwards or that appear in
    `stop` are discarded.

    Args:
        sentence: Text to clean.
        stop: Collection of lower-case stop words; membership is tested with `in`.
        type_proc: Normalisation passed on to `preprocess_type` for every kept
            token. `None` (the default) keeps the cleaned token unchanged.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    words = []
    for token in sentence.lower().split():
        token = _DIGITS_RE.sub("", token)
        # Keep apostrophes for the first stop-word test so that contractions in
        # the stop list (e.g. "don't") are recognised before they are mangled.
        token = _PUNCT_EXCEPT_APOSTROPHE_RE.sub("", token)
        if token in stop:
            continue

        word = token.replace("'", "")
        if word == "" or word in stop:
            continue

        # Without a normalisation mode the cleaned token is used directly, so
        # the default argument can never put `None` entries into the join below.
        words.append(word if type_proc is None else preprocess_type(word, type_proc))

    return " ".join(words)

In [60]:
def preprocess_type(word, type_proc):
    match type_proc:
        case "word":
            return word
        case "stem":
            return PorterStemmer().stem(word)
        case "lem":
            return WordNetLemmatizer().lemmatize(word)

In [61]:
def my_train_test_split(cols, test_size, df=df, random_state=random_state):
    """Split `df[cols]` and the `sentiment` column into train and test parts.

    Note that the `df` and `random_state` defaults are the objects that the
    module-level names referred to when this function was defined.

    Args:
        cols: Column selection used as features (`df[cols]`).
        test_size: Forwarded to `train_test_split`.
        df: Frame holding the feature columns and a `sentiment` column.
        random_state: Forwarded to `train_test_split`.

    Returns:
        The tuple `(x_train, x_test, y_train, y_test)`.
    """
    features = df[cols]
    labels = df["sentiment"]
    parts = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return tuple(parts)

In [62]:
def apply_preprocessing(proc, x_train, x_test):
    """Clean the `reviewText` column of the train and test frames.

    Args:
        proc: Normalisation mode handed to `preprocess_text`; `None` skips
            preprocessing altogether.
        x_train: Training frame with a `reviewText` column.
        x_test: Test frame with a `reviewText` column.

    Returns:
        `(x_train, x_test)`. With `proc=None` the input objects are returned
        as they are; otherwise new frames are returned and the inputs are left
        unmodified.
    """
    if proc is None:
        return x_train, x_test

    def clean_reviews(frame):
        # `assign` builds a new frame, so the caller's frame is not mutated and
        # no write happens on a possible slice of another frame.
        return frame.assign(
            reviewText=frame["reviewText"].apply(
                lambda text: preprocess_text(text, STOP_WORDS, proc)
            )
        )

    return clean_reviews(x_train), clean_reviews(x_test)

In [63]:
def pipeline(cols, test_size, proc, vectorizer, df=df, random_state=random_state):
    """Split the data, preprocess the review text and vectorise it.

    Note that the `df` and `random_state` defaults are the objects that the
    module-level names referred to when this function was defined.

    Args:
        cols: Feature columns to split; must contain "reviewText". Only the
            `reviewText` column is vectorised and returned.
        test_size: Forwarded to `my_train_test_split`.
        proc: Normalisation mode for `apply_preprocessing` (`None` skips it).
        vectorizer: "bow" for `CountVectorizer` or "tfidf" for `TfidfVectorizer`.
        df: Frame to split.
        random_state: Seed forwarded to `my_train_test_split`.

    Returns:
        `(x_train, x_test, y_train, y_test)` where the two `x_*` values are the
        outputs of the vectorizer's `fit_transform` / `transform`.

    Raises:
        AssertionError: If "reviewText" is not in `cols`.
        ValueError: If `vectorizer` is neither "bow" nor "tfidf".
    """
    assert "reviewText" in cols

    # Resolve the vectorizer before the split and the (potentially slow) text
    # preprocessing, so an invalid name fails immediately. A separate local is
    # used so the `vectorizer` argument is not rebound to a different type.
    if vectorizer == "bow":
        text_vectorizer = CountVectorizer()
    elif vectorizer == "tfidf":
        text_vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid Vectorizer")

    x_train, x_test, y_train, y_test = my_train_test_split(
        cols, test_size, df, random_state
    )
    x_train, x_test = apply_preprocessing(proc, x_train, x_test)

    # The vocabulary is fitted on the training text only and reused for the test text.
    train_matrix = text_vectorizer.fit_transform(x_train["reviewText"])
    test_matrix = text_vectorizer.transform(x_test["reviewText"])
    return train_matrix, test_matrix, y_train, y_test

In [64]:
# Baseline run: review text only, 20% of the rows held out for testing,
# no text preprocessing, CountVectorizer ("bow") features.
cols = ["reviewText"]
test_size = 0.2
proc = None
vectorizer = "bow"
x_train, x_test, y_train, y_test = pipeline(cols, test_size, proc, vectorizer)