In [124]:
import pandas as pd
import re
import string

In [None]:
import sys
sys.path.append("..")  # Path to the parent directory of the src folder.

In [74]:
# Import text preprocessing functions: 
# - clean_text: removes unwanted characters like numbers, punctuation, and special characters
# - tokenize_and_remove_stopwords: splits text into tokens and removes stopwords
# - lemmatize_text: reduces words to their base or root form
from src.text_preprocessing import clean_text, tokenize_and_remove_stopwords, lemmatize_text

In [None]:
# import dataframe
df=pd.read_csv('../data/cleaned_data.csv')

In [25]:
df.head()

Unnamed: 0,id,asins,brand,categories,keys,name,prices,reviews.rating,reviews.text,reviews.title
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!"
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,Allow me to preface this with a little history...,One Simply Could Not Ask For More
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",4.0,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I bought one of the first Paperwhites and have...,Love / Hate relationship
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I have to say upfront - I don't like coroporat...,I LOVE IT


In [None]:
# create a copy of dataset
data=df.copy()

In [None]:
# Clean the text by removing numbers, punctuation, special characters, and extra spaces
data['Cleaned_Review'] = data['reviews.text'].apply(clean_text)
# Tokenize the cleaned text and remove stopwords
data['Tokenized_NoStopwords_review'] = data['Cleaned_Review'].apply(tokenize_and_remove_stopwords)
# Apply lemmatization to the tokens to get their base forms
data['Lemmatized_Review'] = data['Tokenized_NoStopwords_review'].apply(lemmatize_text)

In [None]:
# same for 'reviews.title' column
data['Cleaned_title'] = data['reviews.title'].apply(clean_text)
data['Tokenized_NoStopwords_title'] = data['Cleaned_title'].apply(tokenize_and_remove_stopwords)
data['Lemmatized_title'] = data['Tokenized_NoStopwords_title'].apply(lemmatize_text)

In [29]:
data.head()

Unnamed: 0,id,asins,brand,categories,keys,name,prices,reviews.rating,reviews.text,reviews.title,Cleaned_Review,Tokenized_NoStopwords_review,Lemmatized_Review,Cleaned_title,Tokenized_NoStopwords_title,Lemmatized_title
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",i initially had trouble deciding between the p...,"[initially, trouble, deciding, paperwhite, voy...","[initially, trouble, decide, paperwhite, voyag...",paperwhite voyage no regrets,"[paperwhite, voyage, regrets]","[paperwhite, voyage, regret]"
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,Allow me to preface this with a little history...,One Simply Could Not Ask For More,allow me to preface this with a little history...,"[allow, preface, little, history, casual, read...","[allow, preface, little, history, casual, read...",one simply could not ask for more,"[one, simply, could, ask]","[one, simply, could, ask]"
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",4.0,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,i am enjoying it so far great for reading had ...,"[enjoying, far, great, reading, original, fire...","[enjoy, far, great, read, original, fire, sinc...",great for those that just want an ereader,"[great, want, ereader]","[great, want, ereader]"
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I bought one of the first Paperwhites and have...,Love / Hate relationship,i bought one of the first paperwhites and have...,"[bought, one, first, paperwhites, pleased, con...","[buy, one, first, paperwhite, please, constant...",love hate relationship,"[love, hate, relationship]","[love, hate, relationship]"
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I have to say upfront - I don't like coroporat...,I LOVE IT,i have to say upfront i dont like coroporate h...,"[say, upfront, dont, like, coroporate, hermeti...","[say, upfront, do, not, like, coroporate, herm...",i love it,[love],[love]


In [None]:
import re

# Use a regex to extract the 'amountMax' value from the 'prices' column
# - The regex '"amountMax":\s*(\d+)' captures the numeric value after the 'amountMax' key
# - Apply this extraction to each row in 'prices', handling null values with a conditional check
data['price'] = data['prices'].apply(lambda x: re.search(r'"amountMax":\s*(\d+)', x).group(1) if pd.notnull(x) else None)

# Convert the extracted 'price' values to float for numerical analysis
data['price'] = data['price'].astype(float)


In [31]:
data.head()

Unnamed: 0,id,asins,brand,categories,keys,name,prices,reviews.rating,reviews.text,reviews.title,Cleaned_Review,Tokenized_NoStopwords_review,Lemmatized_Review,Cleaned_title,Tokenized_NoStopwords_title,Lemmatized_title,price
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",i initially had trouble deciding between the p...,"[initially, trouble, deciding, paperwhite, voy...","[initially, trouble, decide, paperwhite, voyag...",paperwhite voyage no regrets,"[paperwhite, voyage, regrets]","[paperwhite, voyage, regret]",139.0
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,Allow me to preface this with a little history...,One Simply Could Not Ask For More,allow me to preface this with a little history...,"[allow, preface, little, history, casual, read...","[allow, preface, little, history, casual, read...",one simply could not ask for more,"[one, simply, could, ask]","[one, simply, could, ask]",139.0
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",4.0,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,i am enjoying it so far great for reading had ...,"[enjoying, far, great, reading, original, fire...","[enjoy, far, great, read, original, fire, sinc...",great for those that just want an ereader,"[great, want, ereader]","[great, want, ereader]",139.0
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I bought one of the first Paperwhites and have...,Love / Hate relationship,i bought one of the first paperwhites and have...,"[bought, one, first, paperwhites, pleased, con...","[buy, one, first, paperwhite, please, constant...",love hate relationship,"[love, hate, relationship]","[love, hate, relationship]",139.0
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I have to say upfront - I don't like coroporat...,I LOVE IT,i have to say upfront i dont like coroporate h...,"[say, upfront, dont, like, coroporate, hermeti...","[say, upfront, do, not, like, coroporate, herm...",i love it,[love],[love],139.0


In [None]:
# checking for dataframe informations
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1551 entries, 0 to 1550
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            1551 non-null   object 
 1   asins                         1551 non-null   object 
 2   brand                         1551 non-null   object 
 3   categories                    1551 non-null   object 
 4   keys                          1551 non-null   object 
 5   name                          1551 non-null   object 
 6   prices                        1551 non-null   object 
 7   reviews.rating                1551 non-null   float64
 8   reviews.text                  1551 non-null   object 
 9   reviews.title                 1551 non-null   object 
 10  Cleaned_Review                1551 non-null   object 
 11  Tokenized_NoStopwords_review  1551 non-null   object 
 12  Lemmatized_Review             1551 non-null   object 
 13  Cle

In [None]:
# save the updated preprocessed dataframe
data.to_csv('../data/preprocessed_data.csv', index=False)

In [36]:
import sys
sys.path.append("..")  # Path to the parent directory of the src folder.

In this part of the project, several text preprocessing steps were performed to clean and prepare the data for analysis. The text was converted to lowercase, with URLs, mentions, punctuation, and numbers removed, along with unnecessary whitespace, to standardize the text. Sentences were tokenized into individual words, and stopwords were removed to reduce noise. Lemmatization was applied to normalize words to their canonical forms (e.g., "running" to "run"), ensuring consistency across the dataset. Additionally, relevant numerical information, such as maximum prices, was extracted using regular expressions to enhance the dataset with meaningful features for further analysis.

In [123]:
from importlib import reload
from src import text_preprocessing
R=reload(text_preprocessing)

In [None]:
# Importing the functions `identify_rare_words` and `remove_rare_words` from the `text_preprocessing` module.
# - `identify_rare_words`: This function identifies words that appear infrequently in the dataset, 
#   which can be considered as rare words based on a defined threshold.
# - `remove_rare_words`: This function removes the identified rare words from the dataset, 
#   reducing noise and improving the focus on more meaningful and commonly occurring words.

from src.text_preprocessing import identify_rare_words,remove_rare_words

In [None]:
# using identify_rare_words function to extract words with freq=1
rare_words = identify_rare_words(data['Lemmatized_Review'], threshold=1)

# Output
print(f"rare words (threshold=1) : {rare_words[:50]}")  # Limited at 50 first rows for lisiblity
print(f"count of rare words : {len(rare_words)}")

rare words (threshold=1) : ['pry', 'fingersfor', 'sundry', 'logistical', 'usability', 'kindleof', 'critique', 'ah', 'bookbub', 'alert', 'accumulative', 'positively', 'glacial', 'consensus', 'flakey', 'upfront', 'coroporate', 'hermetically', 'buti', 'itso', 'screenlight', 'disperse', 'outthe', 'paperwhites', 'guyi', 'funcion', 'quote', 'remains', 'lifeso', 'translate', 'simplify', 'moneytotally', 'loverill', 'mommy', 'feeding', 'cluster', 'growth', 'spurt', 'sidelyingbreastfeedingposition', 'disinfect', 'whenever', 'meanand', 'culture', 'usa', 'earlieri', 'modelpaper', 'canadian', 'belowthis', 'hurry', 'endsummary']
count of rare words : 2483


In [None]:
# Delete rare words with freq=1 from tokens lists
data['Filtered_ReviewTokens'] = remove_rare_words(data['Lemmatized_Review'], rare_words)

# print the output
print(data['Filtered_ReviewTokens'].head())

0    [initially, trouble, decide, paperwhite, voyag...
1    [allow, preface, little, history, casual, read...
2    [enjoy, far, great, read, original, fire, sinc...
3    [buy, one, first, paperwhite, please, constant...
4    [say, do, not, like, closed, stuff, like, anyt...
Name: Filtered_ReviewTokens, dtype: object


In [None]:
# `identify_misspelled_words`: This function detects words in the dataset that are likely misspelled, 
# helping to identify inconsistencies in the text data.
# `correct_spelling`: This function corrects the identified misspelled words, 
# ensuring consistency and improving the quality of the text data for analysis.

reload(text_preprocessing)
from src.text_preprocessing import identify_misspelled_words, correct_spelling

In [120]:
# Identify misspelled words
misspelled_words = identify_misspelled_words(data['Lemmatized_title'])

# Visualize the first few misspelled words
print(f"Sample of misspelled words : {list(misspelled_words)[:50]}")
print(f"Total number of misspelled words : {len(misspelled_words)}")

Sample of misspelled words : ['yr', 'nifty', 'plaisir', 'ips', 'tab', 'wireless', 'luv', 'sooo', 'helpp', 'goto', 'builtin', 'oem', 'amost', 'hulu', 'wow', 'alexea', 'itwm', 'alt', 'comcast', 'hbo', 'kids', 'amazion', 'loos', 'refurb', 'epub', 'inexpensive', 'lineup', 'alexa', 'serviceim', 'trs', 'browse', 'tappy', 'grandma', 'echos', 'portability', 'quirk', 'dosent', 'bt', 'unusable', 'exceptionalan', 'audio', 'complment', 'redux', 'usb', 'warner', 'skip', 'apri', 'hdx', 'crap', 'importantan']
Total number of misspelled words : 133


In [121]:
"""# Identify misspelled words
misspelled_words = identify_misspelled_words(data['Filtered_ReviewTokens'])

# Display the first few misspelled words
print(f"Misspelled words (sample): {list(misspelled_words)[:50]}")
print(f"Total number of misspelled words: {len(misspelled_words)}")"""

'# Identify misspelled words\nmisspelled_words = identify_misspelled_words(data[\'Filtered_ReviewTokens\'])\n\n# Display the first few misspelled words\nprint(f"Misspelled words (sample): {list(misspelled_words)[:50]}")\nprint(f"Total number of misspelled words: {len(misspelled_words)}")'

In [None]:
# Apply spelling correction
data['Corrected_filtred_Tokens'] = correct_spelling(data['Filtered_ReviewTokens'])

# Display a preview of the corrected tokens
print(data[['Filtered_ReviewTokens', 'Corrected_filtred_Tokens']].head())


                               Filtered_ReviewTokens  \
0  [initially, trouble, decide, paperwhite, voyag...   
1  [allow, preface, little, history, casual, read...   
2  [enjoy, far, great, read, original, fire, sinc...   
3  [buy, one, first, paperwhite, please, constant...   
4  [say, do, not, like, closed, stuff, like, anyt...   

                            Corrected_filtred_Tokens  
0  [initially, trouble, decide, paperwhite, voyag...  
1  [allow, preface, little, history, casual, read...  
2  [enjoy, far, great, read, original, fire, sinc...  
3  [buy, one, first, paperwhite, please, constant...  
4  [say, do, not, like, closed, stuff, like, anyt...  


In [None]:
# save the updated preprocessed dataframe
data.to_csv('../data/preprocessed_data.csv', index=False)

In [118]:
import sys
sys.path.append("..")  # Path to the parent directory of the src folder.

In [110]:
reload(text_preprocessing)
from src.text_preprocessing import call_gensim_bigram_trigram

In [122]:
import os

if os.path.exists(script_path):
    print(f"file {script_path} exist.")
else:
    print(f"file {script_path} does not exist. Verify path.")

file ../src/gensim_functions.py exist.


In [None]:
import sys
sys.path.append("..")  # Path to the parent directory of the src folder.
script_path='../src/gensim_functions.py'
# Apply the function on corrected tokens
data['Enriched_Tokens'] = call_gensim_bigram_trigram(data['Corrected_filtred_Tokens'],script_path)

# results
print(data[['Corrected_filtred_Tokens', 'Enriched_Tokens']].head())


                            Corrected_filtred_Tokens  \
0  [initially, trouble, decide, paperwhite, voyag...   
1  [allow, preface, little, history, casual, read...   
2  [enjoy, far, great, read, original, fire, sinc...   
3  [buy, one, first, paperwhite, please, constant...   
4  [say, do, not, like, closed, stuff, like, anyt...   

                                     Enriched_Tokens  
0  [initially, trouble, decide, paperwhite_voyage...  
1  [allow, preface, little, history, casual, read...  
2  [enjoy, far, great, read, original, fire, sinc...  
3  [buy, one, first, paperwhite, please, constant...  
4  [say_do_not, like, closed, stuff, like, anythi...  


In the advanced preprocessing stage, additional steps were performed to enhance the quality of the text data. Words with a frequency of only one were filtered out, as they provide little to no value for analysis. Misspelled words were then identified and corrected using an integrated dictionary to ensure consistency and accuracy. Following this, bigrams and trigrams were generated to capture meaningful multi-word phrases, improving the representation of context and relationships within the text. These steps further refined the dataset, making it more suitable for downstream analysis.

In [None]:
# save the updated preprocessed dataframe
data.to_csv('../data/preprocessed_data.csv', index=False)