In [1]:
!pip install pandas torch transformers "sentence-transformers>=3.0.1" tqdm



In [7]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-21.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-21.0.0-cp313-cp313-win_amd64.whl (26.1 MB)
   ---------------------------------------- 0.0/26.1 MB ? eta -:--:--
   ------------- -------------------------- 8.9/26.1 MB 51.5 MB/s eta 0:00:01
   ---------------------------------------  26.0/26.1 MB 79.4 MB/s eta 0:00:01
   ---------------------------------------- 26.1/26.1 MB 50.9 MB/s  0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-21.0.0


In [3]:
import pandas as pd
# Render every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

print("Loading dataset from local 'data/' folder...")

# Relative path, consistent with the '../data/' location this notebook writes its
# Parquet output to. An absolute, machine-specific path (D:\...) only works on the
# original author's computer.
file_path = '../data/reviews.csv'
df = pd.read_csv(file_path)

print("Dataset loaded successfully!")
print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")

# Column names, dtypes and non-null counts.
print("\n--- General info about the dataset ---")
df.info()

print("\n--- Preview of the first 5 rows ---")
display(df.head())

# Missing values per column drive the cleaning decisions made in the next cell.
print("\n--- Count of missing values per column ---")
display(df.isnull().sum())

Loading dataset from local 'data/' folder...
Dataset loaded successfully!
The dataset has 23486 rows and 11 columns.

--- General info about the dataset ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses



--- Count of missing values per column ---


Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [6]:
# Build `df_cleaned` from the raw `df` without mutating anything in place, so this
# cell can be re-run at any time and always produces the same result.
print("Starting data cleaning...")

# ----- Action #1: Remove the unnecessary column -----
df_cleaned = df.drop(columns='Unnamed: 0')
print("-> 'Unnamed: 0' column removed.")

# ----- Action #2: Handle missing review text (CRUCIAL) -----
# A row without review text has nothing to run sentiment analysis on.
df_cleaned = df_cleaned.dropna(subset=['Review Text'])
print(f"-> Rows without 'Review Text' removed. Remaining rows: {len(df_cleaned)}")

# ----- Action #3: Combine Title and Review Text -----
# Missing titles become '' so the concatenation never yields NaN. The result is
# stripped so that rows with no title do not start with a stray space.
title_filled = df_cleaned['Title'].fillna('')
df_cleaned = df_cleaned.assign(
    Title=title_filled,
    full_review_text=(title_filled + ' ' + df_cleaned['Review Text']).str.strip(),
)
print("-> 'full_review_text' column created by joining Title and Text.")

# ----- Action #4: Handle missing categorical data -----
# .copy() detaches the result from its parent frame, so later cells can add
# columns to df_cleaned without triggering SettingWithCopyWarning.
df_cleaned = df_cleaned.dropna(
    subset=['Division Name', 'Department Name', 'Class Name']
).copy()
print(f"-> Rows with missing category data removed. Final rows: {len(df_cleaned)}")


# ----- Action #5: Final check! -----
print("\n--- Dataset info after cleaning ---")
df_cleaned.info()

print("\n--- Missing values after cleaning ---")
display(df_cleaned.isnull().sum())

print("\n--- Preview of the cleaned dataset ---")
display(df_cleaned.head())

Starting data cleaning...
-> 'Unnamed: 0' column removed.
-> Rows without 'Review Text' removed. Remaining rows: 22641
-> 'full_review_text' column created by joining Title and Text.
-> Rows with missing category data removed. Final rows: 22628

--- Dataset info after cleaning ---
<class 'pandas.core.frame.DataFrame'>
Index: 22628 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing ID              22628 non-null  int64 
 1   Age                      22628 non-null  int64 
 2   Title                    22628 non-null  object
 3   Review Text              22628 non-null  object
 4   Rating                   22628 non-null  int64 
 5   Recommended IND          22628 non-null  int64 
 6   Positive Feedback Count  22628 non-null  int64 
 7   Division Name            22628 non-null  object
 8   Department Name          22628 non-null  object
 9   Class Name               2

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
full_review_text           0
dtype: int64


--- Preview of the cleaned dataset ---


Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,full_review_text
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps..."
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...


In [7]:
from transformers import pipeline
from tqdm.auto import tqdm

# Instantiate the sentiment classifier once; every later cell reuses this object.
print("Loading Sentiment Analysis model...")
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
print("Model loaded successfully!")

# Draw a 500-row sample for a quick trial run; the fixed seed makes the draw repeatable.
df_sample = df_cleaned.sample(n=500, random_state=42)

print(f"\nStarting sentiment analysis on a sample of {len(df_sample)} reviews...")

def get_sentiment(text):
    """Classify one review and return the pipeline's top prediction.

    Args:
        text: The review text to classify.

    Returns:
        The first element of the pipeline output. Callers in this notebook read
        its 'label' and 'score' entries.
    """
    # Let the tokenizer truncate to the model's maximum input length. Slicing the
    # string to 512 *characters* threw away most of a long review, because the
    # model's limit is counted in tokens, not characters.
    result = sentiment_pipeline(text, truncation=True)
    return result[0]

# Register tqdm's `progress_apply` on pandas objects so the run shows a progress bar.
tqdm.pandas()

# One raw prediction per sampled review, stored alongside the review itself.
sample_predictions = df_sample['full_review_text'].progress_apply(get_sentiment)
df_sample['sentiment'] = sample_predictions

print("\nSentiment analysis on sample completed.")

# Unpack each raw prediction into two flat columns.
df_sample['sentiment_label'] = df_sample['sentiment'].map(lambda pred: pred['label'])
df_sample['sentiment_score'] = df_sample['sentiment'].map(lambda pred: pred['score'])

print("\n--- Preview of the dataset with new sentiment columns ---")
preview_columns = ['Rating', 'full_review_text', 'sentiment_label', 'sentiment_score']
display(df_sample[preview_columns].head(10))

Loading Sentiment Analysis model...


Device set to use cpu


Model loaded successfully!

Starting sentiment analysis on a sample of 500 reviews...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:08<00:00, 58.39it/s]



Sentiment analysis on sample completed.

--- Preview of the dataset with new sentiment columns ---


Unnamed: 0,Rating,full_review_text,sentiment_label,sentiment_score
8329,4,"Change armpits I love, love this dress except ...",NEGATIVE,0.885262
17943,2,Awkward sweater I wanted this sweater to work ...,NEGATIVE,0.995292
2157,5,Best. tee. ever. Oh my! i love this tee. it is...,POSITIVE,0.999676
11456,3,Well-made but lacks structure I love the style...,NEGATIVE,0.99377
14386,2,Strangely cut Was super excited to try this on...,NEGATIVE,0.928478
18681,5,"Feminine & clean Size 8 always, 36c and i have...",POSITIVE,0.998242
4124,5,"So comfortable I love the style of this dress,...",POSITIVE,0.999888
7991,5,Perfect lwd! This is perfect! it fits tts. i a...,POSITIVE,0.999842
8409,5,Yessssss!!!!! A culotte and basketball short h...,POSITIVE,0.997367
21423,5,Love the color Love the color and the design. ...,POSITIVE,0.99819


In [9]:
import numpy as np

print("--- Model Validation on Sample ---")

# Encode the model's prediction and the star-rating proxy as 0/1 flags.
# A rating above 3 stars is treated as the "true" positive class.
predicted_positive = df_sample['sentiment_label'] == 'POSITIVE'
rated_positive = df_sample['Rating'] > 3
df_sample['sentiment_label_numeric'] = np.where(predicted_positive, 1, 0)
df_sample['true_sentiment_numeric'] = np.where(rated_positive, 1, 0)

# Accuracy is the fraction of sampled rows on which the two flags agree.
flags_agree = df_sample['sentiment_label_numeric'] == df_sample['true_sentiment_numeric']
accuracy = np.mean(flags_agree)

print(f"\nModel accuracy on sample: {accuracy:.2%}")
print("This means the model correctly predicted the sentiment in", f"{accuracy:.2%}", "of cases, based on star ratings.")


print("\n\n--- RUNNING ON THE ENTIRE DATASET ---")
print(f"Applying analysis to all {len(df_cleaned)} reviews. This will take several minutes...")

# Classify every review. The raw predictions are parked in a temporary 'sentiment'
# column and immediately popped back out, so the frame ends up holding only the
# flat label/score columns.
df_cleaned['sentiment'] = df_cleaned['full_review_text'].progress_apply(get_sentiment)
raw_predictions = df_cleaned.pop('sentiment')

df_cleaned['sentiment_label'] = raw_predictions.map(lambda pred: pred['label'])
df_cleaned['sentiment_score'] = raw_predictions.map(lambda pred: pred['score'])

print("\nAnalysis on the full dataset is complete!")
preview_columns = ['Rating', 'full_review_text', 'sentiment_label', 'sentiment_score']
display(df_cleaned[preview_columns].head())


--- Model Validation on Sample ---

Model accuracy on sample: 85.40%
This means the model correctly predicted the sentiment in 85.40% of cases, based on star ratings.


--- RUNNING ON THE ENTIRE DATASET ---
Applying analysis to all 22628 reviews. This will take several minutes...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22628/22628 [06:28<00:00, 58.29it/s]



Analysis on the full dataset is complete!


Unnamed: 0,Rating,full_review_text,sentiment_label,sentiment_score
0,4,Absolutely wonderful - silky and sexy and com...,POSITIVE,0.999877
1,5,Love this dress! it's sooo pretty. i happen...,POSITIVE,0.998684
2,3,Some major design flaws I had such high hopes ...,POSITIVE,0.98069
3,5,"My favorite buy! I love, love, love this jumps...",POSITIVE,0.999885
4,5,Flattering shirt This shirt is very flattering...,POSITIVE,0.999418


In [10]:
print("Saving the enriched dataset...")

output_path = "../data/reviews_silver.parquet"

# Columns written to disk. 'Title' and 'Review Text' are omitted because their
# content is already merged into 'full_review_text'.
final_columns = [
    'Clothing ID',
    'Age',
    'Rating',
    'Recommended IND',
    'Positive Feedback Count',
    'Division Name',
    'Department Name',
    'Class Name',
    'full_review_text',
    'sentiment_label',
    'sentiment_score',
]

# Select the output columns, then persist without the (gappy) row index.
df_final = df_cleaned.loc[:, final_columns]
df_final.to_parquet(output_path, index=False)

print(f"Final dataset successfully saved to: '{output_path}'")
print("\n--- Preview of the final dataset we saved ---")
display(df_final.head())

Saving the enriched dataset...
Final dataset successfully saved to: '../data/reviews_silver.parquet'

--- Preview of the final dataset we saved ---


Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,full_review_text,sentiment_label,sentiment_score
0,767,33,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...,POSITIVE,0.999877
1,1080,34,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...,POSITIVE,0.998684
2,1077,60,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...,POSITIVE,0.98069
3,1049,50,5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps...",POSITIVE,0.999885
4,847,47,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...,POSITIVE,0.999418
