In [2]:
#data pre-processing
#import dataset
import os
import pandas as pd

#load dataset
file_path = 'Data/Raw/Womens Clothing E-Commerce Reviews.csv'
if os.path.exists(file_path):
	df = pd.read_csv(file_path)
	print(df.head())
else:
	print(f"File not found: {file_path}")

   Unnamed: 0  Clothing ID  Age                    Title  \
0           0          767   33                      NaN   
1           1         1080   34                      NaN   
2           2         1077   60  Some major design flaws   
3           3         1049   50         My favorite buy!   
4           4          847   47         Flattering shirt   

                                         Review Text  Rating  Recommended IND  \
0  Absolutely wonderful - silky and sexy and comf...       4                1   
1  Love this dress!  it's sooo pretty.  i happene...       5                1   
2  I had such high hopes for this dress and reall...       3                0   
3  I love, love, love this jumpsuit. it's fun, fl...       5                1   
4  This shirt is very flattering to all due to th...       5                1   

   Positive Feedback Count   Division Name Department Name Class Name  
0                        0       Initmates        Intimate  Intimates  
1       

In [3]:
# --- Drop Unnecessary Columns ---
# The first column is just an index from the original data source. It's redundant.
df = df.drop(columns=['Unnamed: 0'])

In [4]:
# --- Handle Missing Values ---
# In our EDA, we found 845 rows where 'Review Text' is missing.
# For sentiment analysis and topic modeling, these rows are unusable. So, we remove them.
print(f"Original dataset shape: {df.shape}")
df.dropna(subset=['Review Text'], inplace=True)
print(f"Shape after dropping missing reviews: {df.shape}")

Original dataset shape: (23486, 10)
Shape after dropping missing reviews: (22641, 10)


In [5]:
# --- Handle Duplicates ---
# Let's check for and remove any completely duplicate reviews.
initial_duplicates = df.duplicated().sum()
if initial_duplicates > 0:
    df.drop_duplicates(inplace=True)
    print(f"Removed {initial_duplicates} duplicate rows.")
    print(f"Final shape after removing duplicates: {df.shape}")
else:
    print("No duplicate rows found.")

Removed 1 duplicate rows.
Final shape after removing duplicates: (22640, 10)


In [6]:
# Resetting the index after dropping rows is good practice.
df.reset_index(drop=True, inplace=True)

In [7]:
# --- Create the Sentiment Column ---
# We'll map the numerical 'Rating' to categorical 'Sentiment' labels.
# This logic is based on our EDA: ratings of 4 and 5 are overwhelmingly positive,
# 3 is neutral, and 1 and 2 are negative.
def map_sentiment(rating):
    if rating in [4, 5]:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

df['Sentiment'] = df['Rating'].apply(map_sentiment)

In [8]:
#Building the Text Preprocessing Pipeline
# --- Download NLTK Resources ---
# We only need to do this once. NLTK (Natural Language Toolkit) is a powerful library.
# 'punkt' is for tokenization, 'stopwords' for the list of common words,
# and 'wordnet' for lemmatization.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('corpus/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("Downloading necessary NLTK data...")
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('stopwords')
    nltk.download('wordnet')
    print("Downloads complete.")

# Initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Add domain-specific words to our stopword list based on EDA word clouds
# These words are too common in clothing reviews to be useful for topic modeling.
custom_stopwords = ['dress', 'top', 'sweater', 'shirt', 'blouse', 'wear', 'color', 'fabric', 'like']
stop_words.update(custom_stopwords)


def preprocess_text(text):
    """
    A complete text preprocessing pipeline.
    - Converts text to lowercase
    - Removes punctuation and numbers
    - Tokenizes text
    - Removes stopwords
    - Lemmatizes tokens
    """
    # 1. Lowercasing
    text = text.lower()
    
    # 2. Remove punctuation and numbers using regex
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. Tokenization
    tokens = word_tokenize(text)
    
    # 4. Remove Stopwords & 5. Lemmatization
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    
    # 6. Join tokens back into a single string
    return " ".join(processed_tokens)

# --- Apply the preprocessing pipeline to our 'Review Text' ---
print("\nApplying text preprocessing pipeline...")
df['processed_text'] = df['Review Text'].apply(preprocess_text)
print("Preprocessing complete.")

# --- Display the results ---
print("\nOriginal vs. Processed Text:")
df[['Review Text', 'processed_text']].head()

Downloading necessary NLTK data...
Downloads complete.

Applying text preprocessing pipeline...


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/huzaifamahmood/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/huzaifamahmood/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huzaifamahmood/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/huzaifamahmood/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing complete.

Original vs. Processed Text:


Unnamed: 0,Review Text,processed_text
0,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable
1,Love this dress! it's sooo pretty. i happene...,love sooo pretty happened find store glad neve...
2,I had such high hopes for this dress and reall...,high hope really wanted work initially ordered...
3,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...
4,This shirt is very flattering to all due to th...,flattering due adjustable front tie perfect le...


In [9]:
# Display the final, cleaned DataFrame with our new features
print("\nFinal DataFrame ready for modeling:")
df[['Review Text', 'processed_text', 'Rating', 'Sentiment']].head(10)


Final DataFrame ready for modeling:


Unnamed: 0,Review Text,processed_text,Rating,Sentiment
0,Absolutely wonderful - silky and sexy and comf...,absolutely wonderful silky sexy comfortable,4,positive
1,Love this dress! it's sooo pretty. i happene...,love sooo pretty happened find store glad neve...,5,positive
2,I had such high hopes for this dress and reall...,high hope really wanted work initially ordered...,3,neutral
3,"I love, love, love this jumpsuit. it's fun, fl...",love love love jumpsuit fun flirty fabulous ev...,5,positive
4,This shirt is very flattering to all due to th...,flattering due adjustable front tie perfect le...,5,positive
5,"I love tracy reese dresses, but this one is no...",love tracy reese dress one petite foot tall us...,2,negative
6,I aded this in my basket at hte last mintue to...,aded basket hte last mintue see would look per...,5,positive
7,"I ordered this in carbon for store pick up, an...",ordered carbon store pick ton stuff always try...,4,positive
8,I love this dress. i usually get an xs but it ...,love usually get run little snug bust ordered ...,5,positive
9,"I'm 5""5' and 125 lbs. i ordered the s petite t...",lb ordered petite make sure length wasnt long ...,5,positive


In [10]:
#Feature Engineering
#Age Groups
age_bins = [0, 29, 39, 49, 59, 69, 100]
age_labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']

# Create the 'age_group' column using pandas.cut()
df['age_group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# Display the result
print("Successfully created 'age_group' feature.")
df[['Age', 'age_group']].head()

Successfully created 'age_group' feature.


Unnamed: 0,Age,age_group
0,33,30-39
1,34,30-39
2,60,60-69
3,50,50-59
4,47,40-49


In [11]:
# Text Feature Engineering(TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
# We set max_features to 2000 to keep only the 2000 most important words.
# This helps prevent having too many features and focuses on the most relevant terms.
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))

# Fit and transform the processed text to create the TF-IDF matrix
# This matrix shows the TF-IDF score for each word in each review.
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text'])

print("TF-IDF matrix created successfully.")
print(f"Shape of the TF-IDF matrix: {tfidf_features.shape}")
print("(Number of Reviews, Number of Features/Words)")

TF-IDF matrix created successfully.
Shape of the TF-IDF matrix: (22640, 2000)
(Number of Reviews, Number of Features/Words)


In [12]:
#Text Feature Engineering(Embeddings)

# Load a pre-trained model. 'all-MiniLM-L6-v2' is a great, fast, all-purpose model.
from sentence_transformers import SentenceTransformer


embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Let's create embeddings for a few sample reviews
sample_reviews = [
    "I absolutely love this dress, the fit is perfect.",
    "The material feels cheap and it runs very small.",
    "This is a gorgeous sweater, great for the holidays."
]

# Generate the embeddings
sentence_embeddings = embedding_model.encode(sample_reviews)

print("Embeddings generated successfully.\n")
for review, embedding in zip(sample_reviews, sentence_embeddings):
    print(f"Review: \"{review}\"")
    # We print the first 5 numbers of the vector for brevity
    print(f"Embedding vector (first 5 dimensions): {embedding[:5]}")
    print(f"Vector shape: {embedding.shape}\n") # This shows it's a dense vector

Embeddings generated successfully.

Review: "I absolutely love this dress, the fit is perfect."
Embedding vector (first 5 dimensions): [-0.07150583  0.0575218   0.04822309  0.00103815  0.00325797]
Vector shape: (384,)

Review: "The material feels cheap and it runs very small."
Embedding vector (first 5 dimensions): [0.01700815 0.10258918 0.07854453 0.05567068 0.08369955]
Vector shape: (384,)

Review: "This is a gorgeous sweater, great for the holidays."
Embedding vector (first 5 dimensions): [-0.05428592  0.09624016  0.01701159  0.02631087  0.04167225]
Vector shape: (384,)



In [13]:
processed_path = 'Data/Processed/Womens_Clothing_E-Commerce_Reviews_Processed.csv'
df.to_csv(processed_path, index=False)
print(f"Processed dataset saved to: {processed_path}")

Processed dataset saved to: Data/Processed/Womens_Clothing_E-Commerce_Reviews_Processed.csv
