In [1]:
import pandas as pd
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer

# Create processed folder if missing
os.makedirs("../data/processed", exist_ok=True)

# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Load clean dataset
df = pd.read_csv("../data/processed/clean_reviews.csv")
print("Dataset loaded. Sample:")
print(df.head())

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = ToktokTokenizer()

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = tokenizer.tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['clean_review'] = df['review_text'].apply(preprocess_text)

print("\nSample cleaned reviews:")
print(df[['review_text', 'clean_review', 'sentiment']].head())

# Save preprocessed data
df.to_csv("../data/processed/preprocessed_reviews.csv", index=False)
print("Preprocessed data saved as: ../data/processed/preprocessed_reviews.csv")


[nltk_data] Downloading package stopwords to c:\Users\radhi\AppData\Lo
[nltk_data]     cal\Programs\Python\Python311\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to c:\Users\radhi\AppData\Loca
[nltk_data]     l\Programs\Python\Python311\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Dataset loaded. Sample:
                                         review_text  rating sentiment
0  Absolutely wonderful - silky and sexy and comf...       4  positive
1  Love this dress!  it's sooo pretty.  i happene...       5  positive
2  I love, love, love this jumpsuit. it's fun, fl...       5  positive
3  This shirt is very flattering to all due to th...       5  positive
4  I love tracy reese dresses, but this one is no...       2  negative

Sample cleaned reviews:
                                         review_text  \
0  Absolutely wonderful - silky and sexy and comf...   
1  Love this dress!  it's sooo pretty.  i happene...   
2  I love, love, love this jumpsuit. it's fun, fl...   
3  This shirt is very flattering to all due to th...   
4  I love tracy reese dresses, but this one is no...   

                                        clean_review sentiment  
0        absolutely wonderful silky sexy comfortable  positive  
1  love dress sooo pretty happened find store im ...  posi