In [1]:
import re 
import os
import nltk
import numpy as np 
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


In [2]:
# Load the raw reviews CSV into a DataFrame.
df = pd.read_csv(
    '../../data/raw/Reviews.csv',
    encoding='utf-8',
)

In [3]:
# Preview the first four rows of the freshly loaded frame.
df.head(4)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...


In [4]:
# Count missing values per column.
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [5]:
# Columns kept for the rest of the notebook (product id, score and the two text fields).
dataset = ['ProductId', 'Score', 'Summary', 'Text']

# Rebind df to only those columns; every other column is dropped from df from here on.
df = df[dataset]




In [6]:
# Preview the first three rows after the column selection.
df.head(3)

Unnamed: 0,ProductId,Score,Summary,Text
0,B001E4KFG0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,4,"""Delight"" says it all",This is a confection that has been around a fe...


In [7]:
# Count missing values per remaining column.
df.isnull().sum()

ProductId     0
Score         0
Summary      27
Text          0
dtype: int64

In [8]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Score
count,568454.0
mean,4.183199
std,1.310436
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [9]:
# (rows, columns) before rows with a missing Summary are dropped.
df.shape

(568454, 4)

In [10]:
# Drop rows with a missing Summary; no other column is considered.
df = df.dropna(subset=['Summary'])


In [11]:

def _sentiment_label(score):
    """Map a raw score to a class label.

    1 or 2 -> 0 (negative), 3 -> 1 (neutral), any other value -> 2 (positive).
    """
    if score in [1, 2]:
        return 0
    if score == 3:
        return 1
    return 2


# Score is overwritten with the label, so this cell must not be run twice:
# a second run would re-map the labels 0-2 themselves.
df['Score'] = df['Score'].apply(_sentiment_label)


In [12]:
df.head(2)

Unnamed: 0,ProductId,Score,Summary,Text
0,B001E4KFG0,2,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,0,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [13]:
# Rename Score to Rating and build Review as "<Summary>. <Text>".
# rename() returns a new frame that is rebound to df, rather than mutating the
# existing object through inplace=True.
df = df.rename(columns={'Score': 'Rating'})
# Missing parts are replaced by '' so the concatenation never produces NaN.
df['Review'] = df['Summary'].fillna('') + '. ' + df['Text'].fillna('')


In [16]:
# Keep a fixed-size, reproducible random subset and renumber its rows from 0.
SAMPLE_SIZE = 5000
RANDOM_SEED = 42
df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED).reset_index(drop=True)


In [17]:
# (rows, columns) of the sampled frame.
df.shape

(5000, 5)

In [None]:
# Stop words applied by preprocess_text: a small hand-picked set.
# The NLTK English list is not loaded here. It used to be built and then
# immediately overwritten by this set, so it never affected the output and only
# added a dependency on the downloaded stop-word corpus.
# NOTE(review): confirm the short list is what is wanted; it leaves words such
# as "not", "i" and "these" in the processed text.
custom_stopwords = {'a', 'an', 'the', 'and', 'but', 'is', 'are', 'was', 'were', 'in', 'on', 'at'}
stop_words = custom_stopwords

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# False -> tokens are Porter-stemmed; True -> tokens are WordNet-lemmatized.
use_lemmatization = False

# Text Preprocessing Pipeline

# Set to False the first time NLTK reports that its tokenizer data is missing,
# so the failing lookup is not repeated for every review.
_nltk_tokenizer_ok = True


def _tokenize(text):
    """Tokenize with nltk.word_tokenize, or on whitespace when NLTK's tokenizer data is missing."""
    global _nltk_tokenizer_ok
    if _nltk_tokenizer_ok:
        try:
            return nltk.word_tokenize(text)
        except LookupError:
            # NLTK signals a missing resource with LookupError; remember that
            # and use the plain whitespace split from now on.
            _nltk_tokenizer_ok = False
    return text.split()


def preprocess_text(text):
    """Normalize one review into a space-separated string of processed tokens.

    Steps, in order:
      1. coerce the input with str() and delete every character that is not an
         ASCII letter or whitespace (digits and punctuation are removed, not
         replaced by a space);
      2. lowercase;
      3. tokenize (NLTK tokenizer when available, whitespace split otherwise);
      4. drop tokens found in the module-level ``stop_words``;
      5. lemmatize with ``lemmatizer`` when ``use_lemmatization`` is true,
         otherwise stem with the module-level ``stemmer``.

    Args:
        text: The raw review; any object is accepted because it is passed
            through str() first.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    lowered = letters_only.lower()

    tokens = _tokenize(lowered)
    tokens = [word for word in tokens if word not in stop_words]

    if use_lemmatization:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    else:
        # Reuse the shared stemmer instead of constructing a new one per call.
        tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)



# Run the preprocessing pipeline over every combined review.
processed_reviews = df['Review'].apply(preprocess_text)
df['processed_text'] = processed_reviews

# Show the raw and processed text side by side for the first rows.
print("\nBefore and After Preprocessing:")
comparison = df[['Review', 'processed_text']].head()
print(comparison)



Before and After Preprocessing:
                                              Review  \
0  I like these!. These are actually very tasty. ...   
1  Good but subjectively not 5 star. I realize th...   
2  Lipton Cup A Soup, Spring Vegetable.4 oz. This...   
3  Suited to its purpose, if not quite its goal.....   
4  Tastes artificial!. I was willing to give this...   

                                      processed_text  
0  i like these these actual veri tasti pure pota...  
1  good subject not star i realiz that tast matte...  
2  lipton cup soup spring veget oz thi one of my ...  
3  suit to it purpos if not quit it goal if you l...  
4  tast artifici i will to give thi chanc even af...  


In [19]:
# Save under the project data tree (the same ../../data directory the raw CSV is
# loaded from) rather than a data/ folder relative to the working directory, so
# the file lands where ../../data/clean/clean_data.csv is read back from.
clean_path = '../../data/clean/clean_data.csv'
os.makedirs(os.path.dirname(clean_path), exist_ok=True)

# Save cleaned data
df.to_csv(clean_path, index=False)

# Verify output
print("First 3 processed samples:")
print(df[['Review', 'processed_text']].head(3))
print(f"\nFile saved: {os.path.exists(clean_path)}")


First 3 processed samples:
                                              Review  \
0  I like these!. These are actually very tasty. ...   
1  Good but subjectively not 5 star. I realize th...   
2  Lipton Cup A Soup, Spring Vegetable.4 oz. This...   

                                      processed_text  
0  i like these these actual veri tasti pure pota...  
1  good subject not star i realiz that tast matte...  
2  lipton cup soup spring veget oz thi one of my ...  

File saved: True


In [20]:
# Read the cleaned CSV back from disk into its own frame.
clean_data = pd.read_csv(
    '../../data/clean/clean_data.csv',
    encoding='utf-8',
)

In [21]:
# Preview the first two rows of the reloaded cleaned data.
clean_data.head(2)

Unnamed: 0,ProductId,Rating,Summary,Text,Review,processed_text
0,B0088YBUOU,2,I like these!,These are actually very tasty. Pure potatoes ...,I like these!. These are actually very tasty. ...,i like these these actual veri tasti pure pota...
1,B000Q75354,2,Good but subjectively not 5 star,I realize that taste is a matter of personal p...,Good but subjectively not 5 star. I realize th...,good subject not star i realiz that tast matte...
