In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split

In [3]:
#load dataset
data = pd.read_csv('data/raw/reviews.csv')
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [None]:
# Initial Cleaning (Rows)
# Drop missing text and duplicates as identified in EDA
print(f"Original shape: {data.shape}")
data.dropna(subset=['reviewText', 'rating'], inplace=True)
data.drop_duplicates(subset=['reviewText'], inplace=True)
print(f"Shape after cleaning rows: {data.shape}")

Original shape: (6327, 23)
Shape after cleaning rows: (6232, 23)


In [6]:
# Label Engineering (The Target Variable)
# Filter out 3-star reviews to remove ambiguity
data = data[data['rating'] != 3]

In [7]:
# Create Binary Sentiment: 1 = Positive (4-5 stars), 0 = Negative (1-2 stars)
data['sentiment'] = data['rating'].apply(lambda x: 1 if x > 3 else 0)

print(f"Shape after cleaning: {data.shape}")
print("Class Distribution:\n", data['sentiment'].value_counts())

Shape after cleaning: (5820, 24)
Class Distribution:
 sentiment
1    5607
0     213
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment'] = data['rating'].apply(lambda x: 1 if x > 3 else 0)


In [None]:
#Instead of cleaning the text manually, we will wrap our cleaning logic into a Scikit-Learn Transformer
from sklearn.base import BaseEstimator, TransformerMixin

class TextCleaner(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to clean raw text.
    Compatible with Scikit-Learn Pipelines.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # just return self
        return self

    def transform(self, X):
        # X is a pandas Series or list of text
        # We apply the cleaning function to every item
        return [self._clean_text(text) for text in X]

    def _clean_text(self, text):
        # 1. Lowercase
        text = str(text).lower()

        # 2. Remove HTML tags (e.g., <br /> common in Amazon data)
        text = re.sub(r'<.*?>', '', text)

        # 3. Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # 4. Remove Punctuation
        # This translation table replaces every punctuation mark with None
        text = text.translate(str.maketrans('', '', string.punctuation))

        # 5. Remove numbers (optional, usually good for sentiment)
        text = re.sub(r'\d+', '', text)

        return text

In [12]:
#Text Vectorization (From Words to Numbers)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Define the Preprocessing Pipeline
# Steps:
# 1. 'cleaner': Runs our custom TextCleaner
# 2. 'tfidf': Converts text to numbers
preprocessing_pipeline = Pipeline([
    ('cleaner', TextCleaner()),
    ('tfidf', TfidfVectorizer(
        max_features=5000,      # Only keep top 5000 words (reduces noise)
        stop_words='english',   # Remove common words (the, a, an)
        ngram_range=(1, 2)      # Capture "not good" as a phrase (Bigrams)
    ))
])

print("Pipeline constructed successfully.")

Pipeline constructed successfully.


In [15]:
# Define Features (X) and Target (y)
X = data['reviewText']
y = data['sentiment']

# 2. Stratified Split
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # This ensures balanced classes in split
)

print(f"Training Data: {len(X_train)} reviews")
print(f"Testing Data: {len(X_test)} reviews")

# Verify transformation
# Let's run the training data through the pipeline to check dimensions
# (Note: In Part 4, we will actually 'fit' this. Here we just check.)
print("\nDry run of preprocessing pipeline on sample data...")
sample_vector = preprocessing_pipeline.fit_transform(X_train[:100])
print(f"Vector shape (Rows, Features): {sample_vector.shape}")

Training Data: 4656 reviews
Testing Data: 1164 reviews

Dry run of preprocessing pipeline on sample data...
Vector shape (Rows, Features): (100, 3891)


In [16]:
# Save cleaned & split datasets for modeling
import os
os.makedirs('data/processed', exist_ok=True)

# Use the pipeline's TextCleaner to produce cleaned text columns
cleaner = preprocessing_pipeline.named_steps['cleaner']

# Clean train / test text (X_train and X_test are pandas Series)
clean_train = cleaner.transform(X_train)
clean_test = cleaner.transform(X_test)
clean_full = cleaner.transform(X)  # optional: full dataset cleaned

# Build DataFrames to save
df_train = pd.DataFrame({
    'reviewText': X_train.values,
    'clean_reviewText': clean_train,
    'sentiment': y_train.values
})

df_test = pd.DataFrame({
    'reviewText': X_test.values,
    'clean_reviewText': clean_test,
    'sentiment': y_test.values
})

df_full = pd.DataFrame({
    'reviewText': X.values,
    'clean_reviewText': clean_full,
    'sentiment': y.values
})

# Write CSVs to data/processed
df_train.to_csv('data/processed/preprocessed_train.csv', index=False)
df_test.to_csv('data/processed/preprocessed_test.csv', index=False)
df_full.to_csv('data/processed/preprocessed_full.csv', index=False)

print("Saved: data/processed/preprocessed_train.csv, preprocessed_test.csv, preprocessed_full.csv")
# ...existing code...

Saved: data/processed/preprocessed_train.csv, preprocessed_test.csv, preprocessed_full.csv
