In [18]:
import pandas as pd
import re
import string

In [None]:
import sys
sys.path.append("..")  # Path to the parent directory of the src folder.

In [None]:
# Import text preprocessing functions: 
# - clean_text: removes unwanted characters like numbers, punctuation, and special characters
# - tokenize_and_remove_stopwords: splits text into tokens and removes stopwords
# - lemmatize_text: reduces words to their base or root form
from src.text_preprocessing import clean_text, tokenize_and_remove_stopwords, lemmatize_text

In [None]:
# import dataframe
df=pd.read_csv('../data/cleaned_data.csv')

In [25]:
df.head()

Unnamed: 0,id,asins,brand,categories,keys,name,prices,reviews.rating,reviews.text,reviews.title
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!"
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,Allow me to preface this with a little history...,One Simply Could Not Ask For More
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",4.0,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I bought one of the first Paperwhites and have...,Love / Hate relationship
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I have to say upfront - I don't like coroporat...,I LOVE IT


In [None]:
# create a copy of dataset
data=df.copy()

In [None]:
# Clean the text by removing numbers, punctuation, special characters, and extra spaces
data['Cleaned_Review'] = data['reviews.text'].apply(clean_text)
# Tokenize the cleaned text and remove stopwords
data['Tokenized_NoStopwords_review'] = data['Cleaned_Review'].apply(tokenize_and_remove_stopwords)
# Apply lemmatization to the tokens to get their base forms
data['Lemmatized_Review'] = data['Tokenized_NoStopwords_review'].apply(lemmatize_text)

In [None]:
# same for 'reviews.title' column
data['Cleaned_title'] = data['reviews.title'].apply(clean_text)
data['Tokenized_NoStopwords_title'] = data['Cleaned_title'].apply(tokenize_and_remove_stopwords)
data['Lemmatized_title'] = data['Tokenized_NoStopwords_title'].apply(lemmatize_text)

In [29]:
data.head()

Unnamed: 0,id,asins,brand,categories,keys,name,prices,reviews.rating,reviews.text,reviews.title,Cleaned_Review,Tokenized_NoStopwords_review,Lemmatized_Review,Cleaned_title,Tokenized_NoStopwords_title,Lemmatized_title
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",i initially had trouble deciding between the p...,"[initially, trouble, deciding, paperwhite, voy...","[initially, trouble, decide, paperwhite, voyag...",paperwhite voyage no regrets,"[paperwhite, voyage, regrets]","[paperwhite, voyage, regret]"
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,Allow me to preface this with a little history...,One Simply Could Not Ask For More,allow me to preface this with a little history...,"[allow, preface, little, history, casual, read...","[allow, preface, little, history, casual, read...",one simply could not ask for more,"[one, simply, could, ask]","[one, simply, could, ask]"
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",4.0,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,i am enjoying it so far great for reading had ...,"[enjoying, far, great, reading, original, fire...","[enjoy, far, great, read, original, fire, sinc...",great for those that just want an ereader,"[great, want, ereader]","[great, want, ereader]"
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I bought one of the first Paperwhites and have...,Love / Hate relationship,i bought one of the first paperwhites and have...,"[bought, one, first, paperwhites, pleased, con...","[buy, one, first, paperwhite, please, constant...",love hate relationship,"[love, hate, relationship]","[love, hate, relationship]"
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I have to say upfront - I don't like coroporat...,I LOVE IT,i have to say upfront i dont like coroporate h...,"[say, upfront, dont, like, coroporate, hermeti...","[say, upfront, do, not, like, coroporate, herm...",i love it,[love],[love]


In [None]:
import re

# Use a regex to extract the 'amountMax' value from the 'prices' column
# - The regex '"amountMax":\s*(\d+)' captures the numeric value after the 'amountMax' key
# - Apply this extraction to each row in 'prices', handling null values with a conditional check
data['price'] = data['prices'].apply(lambda x: re.search(r'"amountMax":\s*(\d+)', x).group(1) if pd.notnull(x) else None)

# Convert the extracted 'price' values to float for numerical analysis
data['price'] = data['price'].astype(float)


In [31]:
data.head()

Unnamed: 0,id,asins,brand,categories,keys,name,prices,reviews.rating,reviews.text,reviews.title,Cleaned_Review,Tokenized_NoStopwords_review,Lemmatized_Review,Cleaned_title,Tokenized_NoStopwords_title,Lemmatized_title,price
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",i initially had trouble deciding between the p...,"[initially, trouble, deciding, paperwhite, voy...","[initially, trouble, decide, paperwhite, voyag...",paperwhite voyage no regrets,"[paperwhite, voyage, regrets]","[paperwhite, voyage, regret]",139.0
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,Allow me to preface this with a little history...,One Simply Could Not Ask For More,allow me to preface this with a little history...,"[allow, preface, little, history, casual, read...","[allow, preface, little, history, casual, read...",one simply could not ask for more,"[one, simply, could, ask]","[one, simply, could, ask]",139.0
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",4.0,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,i am enjoying it so far great for reading had ...,"[enjoying, far, great, reading, original, fire...","[enjoy, far, great, read, original, fire, sinc...",great for those that just want an ereader,"[great, want, ereader]","[great, want, ereader]",139.0
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I bought one of the first Paperwhites and have...,Love / Hate relationship,i bought one of the first paperwhites and have...,"[bought, one, first, paperwhites, pleased, con...","[buy, one, first, paperwhite, please, constant...",love hate relationship,"[love, hate, relationship]","[love, hate, relationship]",139.0
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",kindlepaperwhite/b00qjdu3ky,Kindle Paperwhite,"[{""amountMax"":139.99,""amountMin"":139.99,""curre...",5.0,I have to say upfront - I don't like coroporat...,I LOVE IT,i have to say upfront i dont like coroporate h...,"[say, upfront, dont, like, coroporate, hermeti...","[say, upfront, do, not, like, coroporate, herm...",i love it,[love],[love],139.0


In [None]:
# checking for dataframe informations
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1551 entries, 0 to 1550
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            1551 non-null   object 
 1   asins                         1551 non-null   object 
 2   brand                         1551 non-null   object 
 3   categories                    1551 non-null   object 
 4   keys                          1551 non-null   object 
 5   name                          1551 non-null   object 
 6   prices                        1551 non-null   object 
 7   reviews.rating                1551 non-null   float64
 8   reviews.text                  1551 non-null   object 
 9   reviews.title                 1551 non-null   object 
 10  Cleaned_Review                1551 non-null   object 
 11  Tokenized_NoStopwords_review  1551 non-null   object 
 12  Lemmatized_Review             1551 non-null   object 
 13  Cle

In [33]:
# save the preprocessed dataframe
data.to_csv('../data/preprocessed_data.csv', index=False)