In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.functions import filter, col, first, round, concat, lit, to_date, when
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
import pandas as pd
import numpy as np
import string
import re



df_reviews = pd.read_excel('../RetailsReviews.xlsx')

spark = SparkSession.builder.appName('pyspark').getOrCreate()




23/08/12 20:18:21 WARN Utils: Your hostname, Woohyuns-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 100.95.192.34 instead (on interface en0)
23/08/12 20:18:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/12 20:18:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
sdf_preprocessed = spark.createDataFrame(df_reviews)

In [3]:
# Filtering out just to show an example of "CONTENT" with "[This review was collected as part of a promotion.]"
sdf_hd = sdf_preprocessed.filter(
    (col('RETAILER') == 'THE HOME DEPOT') &
    (col('CONTENT').contains('This review was collected as part of a promotion'))
)


In [4]:
# Function to display df with limited text length when setting .show(truncate=False)
def show_df_with_max_length(df, column_name, max_length=100):
    df.withColumn(column_name, col(column_name).substr(1, max_length)).show(5, truncate=False)

In [5]:
# Display the 'CONTENT' column with limited text length by appying the function just created
show_df_with_max_length(sdf_hd.select('RETAILER','REVIEWER_NAME','TITLE', 'CONTENT'), 'CONTENT', max_length=52)

                                                                                

+--------------+-------------+---------------------------------------+----------------------------------------------------+
|RETAILER      |REVIEWER_NAME|TITLE                                  |CONTENT                                             |
+--------------+-------------+---------------------------------------+----------------------------------------------------+
|THE HOME DEPOT|Mo87         |Nice and handy Vacuum                  |[This review was collected as part of a promotion.] |
|THE HOME DEPOT|Liz68        |On of the best an the market ...       |[This review was collected as part of a promotion.] |
|THE HOME DEPOT|HeatherRN    |Love this Vac!                         |[This review was collected as part of a promotion.] |
|THE HOME DEPOT|Jamieberg1451|It’s a good vacuum                     |[This review was collected as part of a promotion.] |
|THE HOME DEPOT|Fayt413      |It works great, when its not plugged up|[This review was collected as part of a promotion.] |
+-------

In [6]:
''' 1 - Removing "[This review was collected as part of a promotion."] from Home Depot reviews
        Escaped the special characters in the regular expression pattern using backslashes,
        Added the r prefix before the pattern to indicate a raw string literal'''

sdf_hd = sdf_hd.withColumn('CONTENT', regexp_replace('CONTENT', r'\[This review was collected as part of a promotion\.\]', ''))

sdf_hd = sdf_hd.filter(
    (col('RETAILER') == 'THE HOME DEPOT')
)
show_df_with_max_length(sdf_hd.select('RETAILER','REVIEWER_NAME','TITLE', 'CONTENT'), 'CONTENT', max_length=52)

# Applying to the main df
sdf_preprocessed = sdf_preprocessed.withColumn('CONTENT', regexp_replace('CONTENT', r'\[This review was collected as part of a promotion\.\]', ''))

+--------------+-------------+---------------------------------------+------------------------------------------------------+
|RETAILER      |REVIEWER_NAME|TITLE                                  |CONTENT                                               |
+--------------+-------------+---------------------------------------+------------------------------------------------------+
|THE HOME DEPOT|Mo87         |Nice and handy Vacuum                  | I bought this vacuum 2 weeks ago. In summary:\r\n1. E|
|THE HOME DEPOT|Liz68        |On of the best an the market ...       | I intended to buy this LG CORDZERO ,because I have   |
|THE HOME DEPOT|HeatherRN    |Love this Vac!                         | I recently recieved this vacuum in the mail. It is   |
|THE HOME DEPOT|Jamieberg1451|It’s a good vacuum                     | I wanted to love this. I’d been so excited for it t  |
|THE HOME DEPOT|Fayt413      |It works great, when its not plugged up| It's easy to charge and add attachments. I don'

In [7]:
# Just to show the number of dropped rows later
original_count = sdf_preprocessed.count()

# 2 - Dropping duplicate rows with the same 'REVIEWER_NAME', 'TITLE', 'CONTENT' values
sdf_preprocessed = sdf_preprocessed.dropDuplicates(subset=['REVIEWER_NAME', 'TITLE', 'CONTENT'])

new_count = sdf_preprocessed.count()
dropped_count = original_count - new_count

# Show the counts
display(f"Original count: {original_count}")
display(f"New count after dropping duplicates: {new_count}")
display(f"Number of dropped duplicates: {dropped_count}")


'Original count: 359'

'New count after dropping duplicates: 359'

'Number of dropped duplicates: 0'

In [8]:
# 3 - Combine title and content columns
sdf_preprocessed = sdf_preprocessed.withColumn('REVIEW', concat(col('TITLE'), lit(' '), col('CONTENT')))
sdf_preprocessed.show(10)

+--------------+-----------+------+--------------------+---------------+--------------------+--------------------+--------------------+
|      RETAILER|    PRODUCT|RATING|           POST_DATE|  REVIEWER_NAME|               TITLE|             CONTENT|              REVIEW|
+--------------+-----------+------+--------------------+---------------+--------------------+--------------------+--------------------+
|      BEST BUY|LG A939KBGS|     5|May 26, 2022 10:3...|          2leo2|Sweet, Light, Qui...|The LG vacuum is ...|Sweet, Light, Qui...|
|      BEST BUY|LG A939KBGS|     5|Dec 16, 2022 12:4...|      88XOHOX88|          Love this!|Auto-empty leaves...|Love this! Auto-e...|
|THE HOME DEPOT|LG A939KBGS|     4|         Apr 1, 2023|     AA23250706|     It really sucks| We’ve had this v...|It really sucks  ...|
|THE HOME DEPOT|LG A939KBGS|     5|                 NaN|         AMANDA|                 NaN|                 NaN|             NaN NaN|
|THE HOME DEPOT|LG A939KBGS|     5|             

In [9]:
# 4 - Drop rows with empty 'REVIEW' column
sdf_preprocessed = sdf_preprocessed.filter(col('REVIEW') != 'NaN NaN')
sdf_preprocessed.show(10)

display(f"Previous Row Count: {new_count}")
new_count = sdf_preprocessed.count()
display(f"New Row Count: {new_count}")

+--------------+-----------+------+--------------------+---------------+--------------------+--------------------+--------------------+
|      RETAILER|    PRODUCT|RATING|           POST_DATE|  REVIEWER_NAME|               TITLE|             CONTENT|              REVIEW|
+--------------+-----------+------+--------------------+---------------+--------------------+--------------------+--------------------+
|      BEST BUY|LG A939KBGS|     5|May 26, 2022 10:3...|          2leo2|Sweet, Light, Qui...|The LG vacuum is ...|Sweet, Light, Qui...|
|      BEST BUY|LG A939KBGS|     5|Dec 16, 2022 12:4...|      88XOHOX88|          Love this!|Auto-empty leaves...|Love this! Auto-e...|
|THE HOME DEPOT|LG A939KBGS|     4|         Apr 1, 2023|     AA23250706|     It really sucks| We’ve had this v...|It really sucks  ...|
|      BEST BUY|LG A939KBGS|     5|Jun 18, 2023 4:37 PM|         AaKomo|Worth the High Cost!|We love our LG an...|Worth the High Co...|
|      BEST BUY|LG A939KBGS|     4|Jan 23, 2023 

'Previous Row Count: 359'

'New Row Count: 353'

In [10]:
''' 5 - 'POST_DATE' column contains different formats from each retailer 
         Converting them to 'yyyy-MM-dd' format'''

# Identify distinct date formats
distinct_retailer_date = sdf_preprocessed.select('RETAILER', 'POST_DATE').distinct()
distinct_retailer_date.show()

+--------------+--------------------+
|      RETAILER|           POST_DATE|
+--------------+--------------------+
|        AMAZON|       July 13, 2023|
|        AMAZON|        July 3, 2023|
|      BEST BUY|May 27, 2022 9:09 PM|
|        AMAZON|         May 8, 2022|
|THE HOME DEPOT|         Jun 7, 2023|
|      BEST BUY|May 25, 2022 10:0...|
|      BEST BUY|Jan 23, 2023 11:0...|
|THE HOME DEPOT|        Aug 31, 2022|
|THE HOME DEPOT|         Sep 4, 2022|
|        AMAZON|        June 3, 2023|
|THE HOME DEPOT|        Nov 26, 2021|
|THE HOME DEPOT|        Dec 18, 2022|
|      BEST BUY|Jun 3, 2022 11:19 PM|
|      BEST BUY|May 24, 2022 3:58 PM|
|        AMAZON|        June 7, 2023|
|        AMAZON|    October 22, 2022|
|        AMAZON|       June 21, 2023|
|THE HOME DEPOT|         Apr 8, 2022|
|THE HOME DEPOT|        Jun 28, 2022|
|      BEST BUY|Jul 15, 2022 10:1...|
+--------------+--------------------+
only showing top 20 rows



In [11]:
# POST_DATE formats
# Amazon: MMM dd, yyyy 
# Best Buy: MMM dd, yyyy hh:mm a
# The Home Depot: MM dd, yyyy

# Set the legacy time parser policy
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Assuming 'POST_DATE' is a string column containing date information
sdf_preprocessed = sdf_preprocessed.withColumn(
    "POST_DATE",
    when(to_date('POST_DATE', 'MMM dd, yyyy hh:mm a').isNotNull(),
         to_date('POST_DATE', 'MMM dd, yyyy hh:mm a'))
    .when(to_date('POST_DATE', 'MMM dd, yyyy').isNotNull(),
         to_date('POST_DATE', 'MMM dd, yyyy'))
    .otherwise(None)  # If no format matches, set the column to None
)

# Show the DataFrame with the new date column
sdf_preprocessed.show()

+--------------+-----------+------+----------+---------------+--------------------+--------------------+--------------------+
|      RETAILER|    PRODUCT|RATING| POST_DATE|  REVIEWER_NAME|               TITLE|             CONTENT|              REVIEW|
+--------------+-----------+------+----------+---------------+--------------------+--------------------+--------------------+
|      BEST BUY|LG A939KBGS|     5|2022-05-26|          2leo2|Sweet, Light, Qui...|The LG vacuum is ...|Sweet, Light, Qui...|
|      BEST BUY|LG A939KBGS|     5|2022-12-16|      88XOHOX88|          Love this!|Auto-empty leaves...|Love this! Auto-e...|
|THE HOME DEPOT|LG A939KBGS|     4|2023-04-01|     AA23250706|     It really sucks| We’ve had this v...|It really sucks  ...|
|      BEST BUY|LG A939KBGS|     5|2023-06-18|         AaKomo|Worth the High Cost!|We love our LG an...|Worth the High Co...|
|      BEST BUY|LG A939KBGS|     4|2023-01-23|          Aaron|       Great product|Its great for wha...|Great product 

In [12]:
df = sdf_preprocessed.toPandas()
print(df.head())


         RETAILER      PRODUCT  RATING   POST_DATE REVIEWER_NAME  \
0        BEST BUY  LG A939KBGS       5  2022-05-26         2leo2   
1        BEST BUY  LG A939KBGS       5  2022-12-16     88XOHOX88   
2  THE HOME DEPOT  LG A939KBGS       4  2023-04-01    AA23250706   
3        BEST BUY  LG A939KBGS       5  2023-06-18        AaKomo   
4        BEST BUY  LG A939KBGS       4  2023-01-23         Aaron   

                             TITLE  \
0  Sweet, Light, Quiet, and Smooth   
1                       Love this!   
2                  It really sucks   
3             Worth the High Cost!   
4                    Great product   

                                             CONTENT  \
0  The LG vacuum is great, lightweight and cleans...   
1  Auto-empty leaves the canister like-new. Very ...   
2   We’ve had this vacuum a few weeks and are com...   
3  We love our LG and with the suction base, it m...   
4  Its great for what it’s made for. Hard floors....   

                         

In [13]:
df = df.drop(columns=['TITLE', 'CONTENT'])
df.insert(0, 'ID', range(0, 0 + len(df)))

In [14]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [15]:
def clean_text(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [16]:
df['review_clean'] = df['REVIEW'].apply(lambda x: clean_text(x.lower()))

In [17]:
df

Unnamed: 0,ID,RETAILER,PRODUCT,RATING,POST_DATE,REVIEWER_NAME,REVIEW,review_clean
0,0,BEST BUY,LG A939KBGS,5,2022-05-26,2leo2,"Sweet, Light, Quiet, and Smooth The LG vacuum ...","[sweet, light, quiet, smooth, lg, vacuum, grea..."
1,1,BEST BUY,LG A939KBGS,5,2022-12-16,88XOHOX88,Love this! Auto-empty leaves the canister like...,"[love, autoempty, leaves, canister, likenew, i..."
2,2,THE HOME DEPOT,LG A939KBGS,4,2023-04-01,AA23250706,It really sucks We’ve had this vacuum a few w...,"[really, sucks, vacuum, weeks, completely, sat..."
3,3,BEST BUY,LG A939KBGS,5,2023-06-18,AaKomo,Worth the High Cost! We love our LG and with t...,"[worth, high, cost, love, lg, suction, base, m..."
4,4,BEST BUY,LG A939KBGS,4,2023-01-23,Aaron,Great product Its great for what it’s made for...,"[great, product, great, made, hard, floors, ex..."
...,...,...,...,...,...,...,...,...
348,348,THE HOME DEPOT,LG A939KBGS,5,,tmle,NaN Very nice station comes with accessories ...,"[nan, nice, station, comes, accessories, fit, ..."
349,349,THE HOME DEPOT,LG A939KBGS,1,2022-06-19,tt226,Brand New Unit Came Broken I was very happy an...,"[brand, new, unit, came, broken, happy, excite..."
350,350,BEST BUY,LG A939KBGS,5,2022-05-30,underdog217,This certainly doesn’t suck This vacuum is ama...,"[certainly, suck, vacuum, amazing, lot, great,..."
351,351,THE HOME DEPOT,LG A939KBGS,5,2021-10-06,wocket,Love this vacuum! This is my first wireless v...,"[love, vacuum, first, wireless, vacuum, life, ..."


In [18]:
# Lemmatize

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text


df['lemmatized'] = df['review_clean'].apply(lambda x: lemmatizing(x))

df.head(10)


Unnamed: 0,ID,RETAILER,PRODUCT,RATING,POST_DATE,REVIEWER_NAME,REVIEW,review_clean,lemmatized
0,0,BEST BUY,LG A939KBGS,5,2022-05-26,2leo2,"Sweet, Light, Quiet, and Smooth The LG vacuum ...","[sweet, light, quiet, smooth, lg, vacuum, grea...","[sweet, light, quiet, smooth, lg, vacuum, grea..."
1,1,BEST BUY,LG A939KBGS,5,2022-12-16,88XOHOX88,Love this! Auto-empty leaves the canister like...,"[love, autoempty, leaves, canister, likenew, i...","[love, autoempty, leaf, canister, likenew, imp..."
2,2,THE HOME DEPOT,LG A939KBGS,4,2023-04-01,AA23250706,It really sucks We’ve had this vacuum a few w...,"[really, sucks, vacuum, weeks, completely, sat...","[really, suck, vacuum, week, completely, satis..."
3,3,BEST BUY,LG A939KBGS,5,2023-06-18,AaKomo,Worth the High Cost! We love our LG and with t...,"[worth, high, cost, love, lg, suction, base, m...","[worth, high, cost, love, lg, suction, base, m..."
4,4,BEST BUY,LG A939KBGS,4,2023-01-23,Aaron,Great product Its great for what it’s made for...,"[great, product, great, made, hard, floors, ex...","[great, product, great, made, hard, floor, exp..."
5,5,THE HOME DEPOT,LG A939KBGS,5,2023-01-11,Aaron68174,"The best All In One Had it for a while now, it...","[best, one, first, vacuum, ive, bought, since,...","[best, one, first, vacuum, ive, bought, since,..."
6,6,THE HOME DEPOT,LG A939KBGS,5,2022-09-17,AceofSpades1996,10/10 Would Recommend! I have owned this vacu...,"[1010, would, recommend, owned, vacuum, 6, mon...","[1010, would, recommend, owned, vacuum, 6, mon..."
7,7,BEST BUY,LG A939KBGS,5,2022-05-29,AcurNet,Sleek Powerful Lightweight Vacuum The LG - Cor...,"[sleek, powerful, lightweight, vacuum, lg, cor...","[sleek, powerful, lightweight, vacuum, lg, cor..."
8,8,THE HOME DEPOT,LG A939KBGS,5,2022-07-26,Adele2301,Best vacuum I’ve had this vacuum for a few mo...,"[best, vacuum, vacuum, months, love, house, fr...","[best, vacuum, vacuum, month, love, house, fre..."
9,9,THE HOME DEPOT,LG A939KBGS,5,2021-11-15,Adolin,Good vacuum This thing is so cool it boxed up...,"[good, vacuum, thing, cool, boxed, perfectly, ...","[good, vacuum, thing, cool, boxed, perfectly, ..."


In [19]:
df['review_clean'] = df.review_clean.apply(' '.join)
df.head()

Unnamed: 0,ID,RETAILER,PRODUCT,RATING,POST_DATE,REVIEWER_NAME,REVIEW,review_clean,lemmatized
0,0,BEST BUY,LG A939KBGS,5,2022-05-26,2leo2,"Sweet, Light, Quiet, and Smooth The LG vacuum ...",sweet light quiet smooth lg vacuum great light...,"[sweet, light, quiet, smooth, lg, vacuum, grea..."
1,1,BEST BUY,LG A939KBGS,5,2022-12-16,88XOHOX88,Love this! Auto-empty leaves the canister like...,love autoempty leaves canister likenew impressed,"[love, autoempty, leaf, canister, likenew, imp..."
2,2,THE HOME DEPOT,LG A939KBGS,4,2023-04-01,AA23250706,It really sucks We’ve had this vacuum a few w...,really sucks vacuum weeks completely satisfied...,"[really, suck, vacuum, week, completely, satis..."
3,3,BEST BUY,LG A939KBGS,5,2023-06-18,AaKomo,Worth the High Cost! We love our LG and with t...,worth high cost love lg suction base makes muc...,"[worth, high, cost, love, lg, suction, base, m..."
4,4,BEST BUY,LG A939KBGS,4,2023-01-23,Aaron,Great product Its great for what it’s made for...,great product great made hard floors expect is...,"[great, product, great, made, hard, floor, exp..."


In [20]:
# N-grams

from nltk.util import ngrams


def extract_ngrams(data, num):
    n_grams = ngrams(nltk.word_tokenize(data), num)
    return [' '.join(grams) for grams in n_grams]

In [21]:
df['ngram2'] = df['review_clean'].apply(lambda x: extract_ngrams(x, 2))
# df['ngram3'] = df['review_clean'].apply(lambda x: extract_ngrams(x, 3))
# df['ngram4'] = df['review_clean'].apply(lambda x: extract_ngrams(x, 4))

df.head()

Unnamed: 0,ID,RETAILER,PRODUCT,RATING,POST_DATE,REVIEWER_NAME,REVIEW,review_clean,lemmatized,ngram2
0,0,BEST BUY,LG A939KBGS,5,2022-05-26,2leo2,"Sweet, Light, Quiet, and Smooth The LG vacuum ...",sweet light quiet smooth lg vacuum great light...,"[sweet, light, quiet, smooth, lg, vacuum, grea...","[sweet light, light quiet, quiet smooth, smoot..."
1,1,BEST BUY,LG A939KBGS,5,2022-12-16,88XOHOX88,Love this! Auto-empty leaves the canister like...,love autoempty leaves canister likenew impressed,"[love, autoempty, leaf, canister, likenew, imp...","[love autoempty, autoempty leaves, leaves cani..."
2,2,THE HOME DEPOT,LG A939KBGS,4,2023-04-01,AA23250706,It really sucks We’ve had this vacuum a few w...,really sucks vacuum weeks completely satisfied...,"[really, suck, vacuum, week, completely, satis...","[really sucks, sucks vacuum, vacuum weeks, wee..."
3,3,BEST BUY,LG A939KBGS,5,2023-06-18,AaKomo,Worth the High Cost! We love our LG and with t...,worth high cost love lg suction base makes muc...,"[worth, high, cost, love, lg, suction, base, m...","[worth high, high cost, cost love, love lg, lg..."
4,4,BEST BUY,LG A939KBGS,4,2023-01-23,Aaron,Great product Its great for what it’s made for...,great product great made hard floors expect is...,"[great, product, great, made, hard, floor, exp...","[great product, product great, great made, mad..."


In [22]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [23]:
model_name = f'cardiffnlp/twitter-roberta-base-sentiment'

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def polarity_scores_roberta(review):
    encoded_text = tokenizer(review, padding=True, truncation=True, max_length=512, return_tensors='pt')
    encoded_text
    # print(encoded_text)
    output = model(**encoded_text)
    # output
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores
    scores_dict = {
        'negative': scores[0],
        'neutral': scores[1],
        'positive': scores[2]
    }
    return scores_dict


res = {}

for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['REVIEW']
        myid = row['ID']
        # vader_results = sia.polarity_scores(text)
        roberta_result = polarity_scores_roberta(text)
        res[myid] = {**roberta_result}
    except RuntimeError:
        print(f'Broke for id {myid}')

df_sentiment = pd.DataFrame(res).T
df_sentiment = df_sentiment.reset_index().rename(columns={'index': 'ID'})
df_sentiment = df_sentiment.merge(df, how='left')
df_sentiment.head()
sns.pairplot(data=df_sentiment,
             vars=['negative', 'neutral', 'positive'],
             hue='RATING',
             palette='tab10'
             )


In [24]:
# [Adding 'positivity' column to sort positive, neutral and negative reviews]

# SORTING
# METHOD
# - positivity == '1'  ->  {Reviews
# with user ratings 4 and 5} AND {'positive'} sentiment is greater than {'negative' + 'neutral'}
# - positivity == '0'  ->  {Reviews
# with user ratings 3} AND {'positive' nor 'negative'} is larger than 0.5 (ambiguous)
# - positivity == '-1'  ->  {Reviews
# with user ratings 4 and 5} AND {'negative'} sentiment is greater than {'positive' + 'neutral'}

df_sentiment['positivity'] = np.where((df_sentiment['RATING'] >= 4) & (df_sentiment['positive'] > 0.5), 1, 0)
df_sentiment['positivity'] = np.where((df_sentiment['RATING'] <= 2) & (df_sentiment['negative'] > 0.5), -1,
                                    df_sentiment['positivity'])
# results_df['hi']= results_df.loc[(results_df['RATING'] >= 4) & (results_df['positive'] > 0.5)]

# df_sentiment = df_sentiment.reindex(columns=['ID', 'negative', 'neutral', 'positive', 'RATING', 'positivity', 'POST_DATE',
#                                          'AUTHOR', 'REVIEW', 'review_clean', 'lemmatized', 'ngram2'])
df_sentiment.head()



In [25]:
df_sentiment['lemmatized_s'] = [', '.join(map(str, l)) for l in df_sentiment['lemmatized']]
df_sentiment['ngram2_s'] = [', '.join(map(str, l)) for l in df_sentiment['ngram2']]

d = df_sentiment.groupby(df_sentiment['positivity']).agg({'lemmatized_s': lambda x: ', '.join(x),
                                                      'ngram2_s': lambda x: ', '.join(x)})

lem_pos = d['lemmatized_s'][1]
lem_neu = d['lemmatized_s'][0]
lem_neg = d['lemmatized_s'][-1]

tags_pos = lem_pos.split(', ')  # Positivity [1]
tags_neu = lem_neu.split(', ')  # Positivity [0]
tags_neg = lem_neg.split(', ')  # Positivity [-1]
res_pos = {}
res_neu = {}
res_neg = {}


def word_count(tags, res):
    for i in tags:
        res[i] = tags.count(i)
    return res


res_pos = word_count(tags_pos, res_pos)
res_neu = word_count(tags_neu, res_neu)
res_neg = word_count(tags_neg, res_neg)

In [26]:
lemmatized_count = pd.DataFrame([res_pos, res_neu, res_neg]).astype('Int64').T.fillna(0)
lemmatized_count.columns = ['POS(1)', 'NEU(0)', 'NEG(-1)']
lemmatized_count = lemmatized_count.sort_values(by='POS(1)', ascending=False)
lemmatized_count.name = 'Word Count by Sentiment'
lemmatized_count  # sorted by Most Frequent in 'positive'


In [27]:
ngram2_pos = d['ngram2_s'][1]
ngram2_neu = d['ngram2_s'][0]
ngram2_neg = d['ngram2_s'][-1]

tags_bi_pos = ngram2_pos.split(', ')  # Positive Bi-gram
tags_bi_neu = ngram2_neu.split(', ')  # Neutral Bi-gram
tags_bi_neg = ngram2_neg.split(', ')  # Negative Bi-gram

res_bi_pos = {}
res_bi_neu = {}
res_bi_neg = {}


def word_count(tags, res):
    for i in tags:
        res[i] = tags.count(i)
    return res


res_bi_pos = word_count(tags_bi_pos, res_bi_pos)
res_bi_neu = word_count(tags_bi_neu, res_bi_neu)
res_bi_neg = word_count(tags_bi_neg, res_bi_neg)

In [28]:
bigram_count = pd.DataFrame([res_bi_pos, res_bi_neu, res_bi_neg]).astype('Int64').T.fillna(0)
bigram_count.columns = ['POS(1)', 'NEU(0)', 'NEG(-1)']
bigram_count = bigram_count.sort_values(by='POS(1)', ascending=False)
bigram_count.name = 'Bigram (2 adjacent words) Count by Sentiment'
bigram_count  # Sorted by Most Frequent in 'positive'

In [29]:
from wordcloud import WordCloud

# word = lem_pos
stopwords_c = ['vacuum', 'x000d', 'love','good','great','product','get']
wordcloud_pos = WordCloud(stopwords=stopwords_c, width=1000, height=500).generate(lem_pos)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud_pos)

plt.axis("off")
# plt.savefig("your_file_name"+".png", bbox_inches='tight')
plt.show()

In [30]:
wordcloud_neg = WordCloud(stopwords=stopwords_c, width=1000, height=500, colormap='RdPu').generate(lem_neg)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud_neg)
plt.axis("off")
plt.show()


In [31]:
wordcloud_bi_neg = WordCloud(stopwords=stopwords_c, width=1000, height=500).generate_from_frequencies(res_bi_pos)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud_bi_neg)
plt.axis("off")
plt.show()

In [32]:
# Keywords Extraction
df['REVIEW'] = df['REVIEW'].apply(
    lambda x: x.replace(":   ", ":").replace(":  ", ":").replace(": ", ":").replace(":\n\n", ": ").replace(":\n",": ").replace("\t", "").replace("\n-", "").replace("\n ", "\n"))

In [33]:
df = sdf_preprocessed.toPandas()
df.insert(0, 'ID', range(0 + len(df)))

print(df.head())


   ID        RETAILER      PRODUCT  RATING   POST_DATE REVIEWER_NAME  \
0   0        BEST BUY  LG A939KBGS       5  2022-05-26         2leo2   
1   1        BEST BUY  LG A939KBGS       5  2022-12-16     88XOHOX88   
2   2  THE HOME DEPOT  LG A939KBGS       4  2023-04-01    AA23250706   
3   3        BEST BUY  LG A939KBGS       5  2023-06-18        AaKomo   
4   4        BEST BUY  LG A939KBGS       4  2023-01-23         Aaron   

                             TITLE  \
0  Sweet, Light, Quiet, and Smooth   
1                       Love this!   
2                  It really sucks   
3             Worth the High Cost!   
4                    Great product   

                                             CONTENT  \
0  The LG vacuum is great, lightweight and cleans...   
1  Auto-empty leaves the canister like-new. Very ...   
2   We’ve had this vacuum a few weeks and are com...   
3  We love our LG and with the suction base, it m...   
4  Its great for what it’s made for. Hard floors....   

 

In [34]:
df['REVIEW'] = df['REVIEW'].apply(
    lambda x: x.replace(":   ", ":").replace(":  ", ":").replace(": ", ":").replace(":\n\n", ": ").replace(":\n",
                                                                                                           ": ").replace(
        "\t", "").replace("\n-", "").replace("\n ", "\n").replace("NaN", ""))

In [35]:
df['REVIEW']

0      Sweet, Light, Quiet, and Smooth The LG vacuum ...
1      Love this! Auto-empty leaves the canister like...
2      It really sucks  We’ve had this vacuum a few w...
3      Worth the High Cost! We love our LG and with t...
4      Great product Its great for what it’s made for...
                             ...                        
348      Very nice station comes with accessories tha...
349    Brand New Unit Came Broken I was very happy an...
350    This certainly doesn’t suck This vacuum is ama...
351    Love this vacuum!  This is my first wireless v...
352    Replaced my upright! My upright is too heavy f...
Name: REVIEW, Length: 353, dtype: object

In [36]:
# Separate REVIEW into PARAGRAPHS
def separate_paragraphs(review):
    para_list = []
    paragraphs = review.split('\n\n')
    para_list.extend(paragraphs)
    return para_list


df['PARAGRAPHS'] = df['REVIEW'].apply(lambda x: separate_paragraphs(x))


In [37]:
df['PARAGRAPHS'][0]
# Drop df columns: REVIEW
df = df.drop(columns=['RETAILER', 'PRODUCT', 'POST_DATE', 'REVIEWER_NAME', 'TITLE', 'CONTENT','REVIEW'])

# Make PARAGRAPHS list to string
# df['PARAGRAPHS'] = df['PARAGRAPHS'].agg(lambda x: ','.join(map(str, x)))
df.head()

Unnamed: 0,ID,RATING,PARAGRAPHS
0,0,5,"[Sweet, Light, Quiet, and Smooth The LG vacuum..."
1,1,5,[Love this! Auto-empty leaves the canister lik...
2,2,4,[It really sucks We’ve had this vacuum a few ...
3,3,5,[Worth the High Cost! We love our LG and with ...
4,4,4,[Great product Its great for what it’s made fo...


In [38]:
# explode the PARAGRAPHS Column
df = df.explode('PARAGRAPHS')
df = df.reset_index(drop=True)
# remove empty rows from PARAGRAPHS
df_paragraphs = df[df['PARAGRAPHS'] != '']

In [39]:
df_paragraphs

Unnamed: 0,ID,RATING,PARAGRAPHS
0,0,5,"Sweet, Light, Quiet, and Smooth The LG vacuum ..."
1,0,5,The mop attachment is not better than a steam ...
2,0,5,"Overall, the vacuum is a well needed product a..."
3,1,5,Love this! Auto-empty leaves the canister like...
4,2,4,It really sucks We’ve had this vacuum a few w...
...,...,...,...
487,350,5,LG has produced a package of additional attach...
488,350,5,The biggest highlight of the system is the aut...
489,350,5,"Overall, I highly recommend this vacuum! It’s ..."
490,351,5,Love this vacuum! This is my first wireless v...


In [40]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from scipy.special import softmax

MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
from tqdm.notebook import tqdm


def polarity_scores_roberta(review):
    encoded_text = tokenizer(review, padding=True, truncation=True, max_length=512, return_tensors='pt')
    encoded_text
    # print(encoded_text)
    output = model(**encoded_text)
    # output
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores
    scores_dict = {
        'negative': scores[0],
        'neutral': scores[1],
        'positive': scores[2]
    }
    return scores_dict


# polarity_scores_roberta(review)
df_paragraphs.insert(1, 'P_ID', range(0 + len(df_paragraphs)))

df_paragraphs

res = {}
for i, row in tqdm(df_paragraphs.iterrows(), total=len(df_paragraphs)):
    try:
        text = row['PARAGRAPHS']
        myid = row['P_ID']
        # vader_results = sia.polarity_scores(text)
        roberta_result = polarity_scores_roberta(text)
        res[myid] = {**roberta_result}
    except RuntimeError:
        print(f'Broke for id {myid}')

df_para_sentiment = pd.DataFrame(res).T
df_para_sentiment = df_para_sentiment.reset_index().rename(columns={'index': 'P_ID'})
df_para_sentiment = df_para_sentiment.merge(df_paragraphs, how='left')
df_para_sentiment.head()

  0%|          | 0/487 [00:00<?, ?it/s]

Unnamed: 0,P_ID,negative,neutral,positive,ID,RATING,PARAGRAPHS
0,0,0.003129,0.037256,0.959615,0,5,"Sweet, Light, Quiet, and Smooth The LG vacuum ..."
1,1,0.042009,0.137438,0.820553,0,5,The mop attachment is not better than a steam ...
2,2,0.018682,0.179131,0.802187,0,5,"Overall, the vacuum is a well needed product a..."
3,3,0.002969,0.01579,0.981241,1,5,Love this! Auto-empty leaves the canister like...
4,4,0.022946,0.069928,0.907127,2,4,It really sucks We’ve had this vacuum a few w...


In [41]:

# copy results_df to keywords_df
df_keywords = df_para_sentiment.copy()
df_keywords['PARAGRAPHS'][0]
import string, re
from nltk import word_tokenize
from nltk.corpus import stopwords

for i in range(0, len(df_keywords['PARAGRAPHS'])):
    df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].translate(str.maketrans('', '', string.punctuation))
    df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].replace('\n', '. ')
    df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].lower()
    # keywords_df['PARAGRAPHS'][i] = re.sub("['\"]","",keywords_df['PARAGRAPHS'][i])
    for j in re.findall('"([^"]*)"', df_keywords['PARAGRAPHS'][i]):
        df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].replace('"{}"'.format(j), j.replace(' ', '_'))



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].translate(str.maketrans('', '', string.punctuation))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].replace('\n', '. ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords['PARAGRAPHS'][i] = df_keywords['PARAGRAPHS'][i].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pa

In [42]:
df_keywords['PARAGRAPHS'][0]

'sweet light quiet and smooth the lg vacuum is great lightweight and cleans not only hard floor and carpet but also itself the battery lasts about 15 hours which should be enough to clean the it comes with a second battery that can be charged while the primary battery is being used the charging dock however has a place to store and charge both batteries at the same time if needed  all accessories have storage space in the base so you dont have to find a place for them has awesome wifi features to detect cleaning efficiency'

In [43]:
df_keywords['KEYWORD'] = df_keywords['PARAGRAPHS'].apply(lambda x: word_tokenize(x))
df_keywords['KEYWORD']

0      [sweet, light, quiet, and, smooth, the, lg, va...
1      [the, mop, attachment, is, not, better, than, ...
2      [overall, the, vacuum, is, a, well, needed, pr...
3      [love, this, autoempty, leaves, the, canister,...
4      [it, really, sucks, we, ’, ve, had, this, vacu...
                             ...                        
482    [lg, has, produced, a, package, of, additional...
483    [the, biggest, highlight, of, the, system, is,...
484    [overall, i, highly, recommend, this, vacuum, ...
485    [love, this, vacuum, this, is, my, first, wire...
486    [replaced, my, upright, my, upright, is, too, ...
Name: KEYWORD, Length: 487, dtype: object

In [44]:
english_stopwords = stopwords.words('english')
for i in range(0, len(df_keywords['KEYWORD'])):
    df_keywords['KEYWORD'][i] = [w for w in df_keywords['KEYWORD'][i] if w.lower() not in english_stopwords]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords['KEYWORD'][i] = [w for w in df_keywords['KEYWORD'][i] if w.lower() not in english_stopwords]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords['KEYWORD'][i] = [w for w in df_keywords['KEYWORD'][i] if w.lower() not in english_stopwords]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords['KEYWORD'][i] = [w for w in df_keywords['KEYWORD'][i] if w.lower() not in english_stopwords]
A value is trying to be set on a copy of a slice from a DataF

In [45]:
# remove duplicate df_keywords['KEYWORD']

df_keywords['KEYWORD'] = df_keywords['KEYWORD'].apply(lambda x: list(dict.fromkeys(x)))
df_keywords['KEYWORD']

0      [sweet, light, quiet, smooth, lg, vacuum, grea...
1      [mop, attachment, better, steam, job, motor, q...
2      [overall, vacuum, well, needed, product, conve...
3      [love, autoempty, leaves, canister, likenew, i...
4      [really, sucks, ’, vacuum, weeks, completely, ...
                             ...                        
482    [lg, produced, package, additional, attachment...
483    [biggest, highlight, system, automatic, dispen...
484    [overall, highly, recommend, vacuum, ’, sleek,...
485    [love, vacuum, first, wireless, life, corded, ...
486    [replaced, upright, heavy, use, cleaning, anot...
Name: KEYWORD, Length: 487, dtype: object

In [46]:
from rake_nltk import Metric, Rake


r = Rake(include_repeated_phrases=False,
         min_length=2,
         ranking_metric=Metric.WORD_DEGREE)
keywords = []
for i in range(0, len(df_keywords['PARAGRAPHS'])):
    keyword = r.extract_keywords_from_text(df_keywords['PARAGRAPHS'][i])
    keyword = r.get_ranked_phrases()
    keywords.append(keyword)
df_keywords['KEYWORDS'] = keywords
len(keywords)
df_keywords['KEYWORD'][0]
df_keywords['KEYWORDS'][0]
# save to csv
# df_keywords.to_csv('keywords_df_RAKE.csv', encoding='utf-8-sig', index=False)

['sweet light quiet',
 'detect cleaning efficiency',
 'charging dock however',
 'awesome wifi features',
 'second battery',
 'primary battery',
 'battery lasts',
 'storage space',
 'lg vacuum',
 'hard floor',
 'great lightweight',
 '15 hours']

In [47]:
df_keywords

Unnamed: 0,P_ID,negative,neutral,positive,ID,RATING,PARAGRAPHS,KEYWORD,KEYWORDS
0,0,0.003129,0.037256,0.959615,0,5,sweet light quiet and smooth the lg vacuum is ...,"[sweet, light, quiet, smooth, lg, vacuum, grea...","[sweet light quiet, detect cleaning efficiency..."
1,1,0.042009,0.137438,0.820553,0,5,the mop attachment is not better than a steam ...,"[mop, attachment, better, steam, job, motor, q...","[self cleaning feature saves, lasts around 20 ..."
2,2,0.018682,0.179131,0.802187,0,5,overall the vacuum is a well needed product an...,"[overall, vacuum, well, needed, product, conve...","[well needed product, prevent major maintenanc..."
3,3,0.002969,0.015790,0.981241,1,5,love this autoempty leaves the canister likene...,"[love, autoempty, leaves, canister, likenew, i...","[canister likenew, autoempty leaves]"
4,4,0.022946,0.069928,0.907127,2,4,it really sucks we’ve had this vacuum a few w...,"[really, sucks, ’, vacuum, weeks, completely, ...","[limited time helps us clean faster, battery h..."
...,...,...,...,...,...,...,...,...,...
482,482,0.003889,0.060876,0.935235,350,5,lg has produced a package of additional attach...,"[lg, produced, package, additional, attachment...","[pet hair attachment, mattress attachment, mat..."
483,483,0.098048,0.179565,0.722387,350,5,the biggest highlight of the system is the aut...,"[biggest, highlight, system, automatic, dispen...","[getting everything, getting dust, automatical..."
484,484,0.001873,0.007052,0.991075,350,5,overall i highly recommend this vacuum it’s sl...,"[overall, highly, recommend, vacuum, ’, sleek,...","[sleek lightweight easy, keeps everything clea..."
485,485,0.012779,0.032729,0.954492,351,5,love this vacuum this is my first wireless va...,"[love, vacuum, first, wireless, life, corded, ...","[corded vacuum like, rechargeable stand vacuum..."


In [59]:
from pke.unsupervised import YAKE

stopwords = nltk.corpus.stopwords.words('english')

document = "Machine learning (ML) is the study of computer algorithms that improve automatically through experience. It is seen as a subset of artificial intelligence."

# 1. Create YAKE keyword extractor
extractor = YAKE()

# 2. Load document
extractor.load_document(input=document,
                        language='en',
                        normalization=None)

# stoplist = stopwords.words('english')



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [61]:
# 3. Generate candidate 1-gram and 2-gram keywords
extractor.candidate_selection(n=2,)

# 4. Calculate scores for the candidate keywords
extractor.candidate_weighting(window=2,
                              
                              use_stems=False)

# 5. Select 10 highest ranked keywords
# Remove redundant keywords with similarity above 80%
key_phrases = extractor.get_n_best(n=10, threshold=0.8)
print(key_phrases)

[('machine learning', 0.033894667362119324), ('computer algorithms', 0.05534498969087407), ('improve automatically', 0.05534498969087407), ('machine', 0.14315718557674914), ('learning', 0.2290033993608783), ('study', 0.2290033993608783), ('computer', 0.2290033993608783), ('algorithms', 0.2290033993608783), ('improve', 0.2290033993608783), ('automatically', 0.2290033993608783)]


In [66]:
import yake

# document = "Machine learning (ML) is the study of computer algorithms that improve automatically through experience. It is seen as a subset of artificial intelligence."
# kw_extractor = yake.KeywordExtractor()
# keywords = kw_extractor.extract_keywords(document)

# for kw in keywords:
#     print(kw)

language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 20

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold,
                                            dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords,
                                            features=None)
keywords = custom_kw_extractor.extract_keywords(df_keywords['KEYWORDS'])

for kw in keywords:
    print(kw)

1      [self cleaning feature saves, lasts around 20 ...
2      [well needed product, prevent major maintenanc...
3                   [canister likenew, autoempty leaves]
4      [limited time helps us clean faster, battery h...
                             ...                        
482    [pet hair attachment, mattress attachment, mat...
483    [getting everything, getting dust, automatical...
484    [sleek lightweight easy, keeps everything clea...
485    [corded vacuum like, rechargeable stand vacuum...
486    [change filters also, every time, empty ’, ano...
Name: KEYWORDS, Length: 487, dtype: object' 


In [63]:
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 20

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
keywords = custom_kw_extractor.extract_keywords(text)

for kw in keywords:
    print(kw)

('Replaced my upright', 0.037084618279794264)
('Replaced', 0.12696931063105557)
('upright', 0.14081563009048734)
('change filters', 0.17375126351151512)
('return to port', 0.17375126351151512)
('time', 0.24853867623174491)
('choir', 0.33559480906340394)
('filters', 0.33559480906340394)
('port', 0.33559480906340394)
('heavy', 0.44109960011701166)
('cleaning', 0.44109960011701166)
('empty', 0.44109960011701166)
('change', 0.44109960011701166)
('empties', 0.44109960011701166)
('return', 0.44109960011701166)
('time to change', 0.513733510979686)
('time you return', 0.513733510979686)


In [67]:
spark.stop()