# Task 1
Web-Scraping Performed in `NLP_WebScraping.ipynb`, exported to three files
- `./data/scraped/meesho_product_details.csv`
- `./data/scraped/meesho_product_links.csv`
- `./data/scraped/meesho_product_reviews.csv`

`./data/scraped/meesho_product_details.csv` has been translated using Google Translate to `./data/meesho_product_reviews_translated.csv`

# Pre-Preprocess Data for Task 2

Import Libraries

In [None]:
import pandas as pd
import emoji
import re

Read CSV

In [None]:
review_df = pd.read_csv("./data/meesho_product_reviews_translated.csv", encoding="utf-8")
review_df.head()

Unnamed: 0,product_id,username,rating,review,date,helpful_count,review_translated
0,4obtb7,Meesho User,4.0,It's very good but there is very little lipsti...,26-Mar-25,11,It's very good but there is very little lipst...
1,4obtb7,Vikash Paswan,4.0,Bahut achha hai water proof hai ❤️❤️❤️ lekin i...,23-Oct-23,346,"It is very good, it is water proof ❤️❤️❤️ but..."
2,4obtb7,Priyanka Kashyap,4.0,Lipstick ka shade shi h pr 2 kajal hi diya h w...,06-Oct-24,78,The shade of lipstick is good but I have give...
3,4obtb7,usha jaat ghintala Ghintala,4.0,Lipstick shades bhot ache hai kajal liners to ...,21-Mar-25,23,"Lipstick shades are very good, kajal liners a..."
4,4obtb7,Pushpa Marathe,4.0,4 lipstick 💄 nighalaya aani 3 kajal pan eaylin...,25-Nov-24,12,4 lipstick 💄 nighalaya aani 3 kajal pan eyeli...


In [None]:
print(review_df.shape)
print(review_df.dtypes)

(25000, 7)
product_id            object
username              object
rating               float64
review                object
date                  object
helpful_count          int64
review_translated     object
dtype: object


Set Datatypes

In [None]:
dtype_dict = {column_name: "str" for column_name in review_df.columns}
dtype_dict["rating"] = "float64"
dtype_dict["helpful_count"] = "int64"

review_df = review_df.astype(dtype_dict)

review_df["date"] = review_df["date"].apply(pd.to_datetime)

print(review_df.dtypes)

review_df.head()

product_id                   object
username                     object
rating                      float64
review                       object
date                 datetime64[ns]
helpful_count                 int64
review_translated            object
dtype: object


Unnamed: 0,product_id,username,rating,review,date,helpful_count,review_translated
0,4obtb7,Meesho User,4.0,It's very good but there is very little lipsti...,2025-03-26,11,It's very good but there is very little lipst...
1,4obtb7,Vikash Paswan,4.0,Bahut achha hai water proof hai ❤️❤️❤️ lekin i...,2023-10-23,346,"It is very good, it is water proof ❤️❤️❤️ but..."
2,4obtb7,Priyanka Kashyap,4.0,Lipstick ka shade shi h pr 2 kajal hi diya h w...,2024-10-06,78,The shade of lipstick is good but I have give...
3,4obtb7,usha jaat ghintala Ghintala,4.0,Lipstick shades bhot ache hai kajal liners to ...,2025-03-21,23,"Lipstick shades are very good, kajal liners a..."
4,4obtb7,Pushpa Marathe,4.0,4 lipstick 💄 nighalaya aani 3 kajal pan eaylin...,2024-11-25,12,4 lipstick 💄 nighalaya aani 3 kajal pan eyeli...


Replace common misspellings

In [None]:
# your misspellings->pattern map
mis_spellings = [
    ("Meesho", r"\bm[ei]+\s?sh?[ouei]+w?\b"),
    ("awesome", r"\b[ao]+w?e?s+o*m+e*\b"),
    ("wow", r"\bwo[ow]*w\b"),
    ("damn", r"\bd[ae]+[mn]*\b"),
    ("", r"\bnan\b"),  # drop any standalone “nan”
    ("", r"'"),  # remove inverted commas
    ("", r'"'),  # remove inverted commas
    ("I", r"\bi\b"),  # capitalize standalone Is
]


def clean_text(s: str) -> str:
    # 1) lowercase & demojize
    s = s.lower()
    s = emoji.demojize(s)
    # extra idea: strip out URLs & HTML tags
    s = re.sub(r"http\S+", "", s)
    s = re.sub(r"<.*?>", "", s)

    # 2) split on ANY char not in a–z, 0–9, underscore, colon, or inverted commas
    #    this keeps “:red_heart:” intact (we’ll break it apart next)
    tokens = re.split(r'[^a-zA-Z0-9_:,!\?\.\']+', s)
    tokens = [t for t in tokens if t]  # drop empties

    # reconstruct emoji tokens
    expanded = []
    for t in tokens:
        if ':' in t:
            emojis = t.split(":")
            emojis = list(set(f":{e}:" for e in emojis if e))

            if t[0] != ":":
                emojis[0] = emojis[0][1:-1]
            
            if t[-1] != ":":
                emojis[-1] = emojis[-1][1:-1]
                
            expanded.extend(emojis)
        else:
            expanded.append(t)
    
    # 3) trim again & drop any stray empties
    expanded = [t.strip() for t in expanded if t.strip()]

    # 4) apply all your regex-based corrections in one pass
    corrected = []
    for token in expanded:
        replaced = token
        for good, pat in mis_spellings:
            if re.fullmatch(pat, token, flags=re.IGNORECASE):
                replaced = good
                break
        # drop tokens you've mapped to "" (e.g. nan)
        if replaced:
            corrected.append(replaced)
    # finally, re-join with single spaces
    return " ".join(corrected)

In [None]:
# usage on your DataFrame:
review_df["review_clean"] = review_df["review_translated"].apply(clean_text).astype("str")
review_df["review_clean"]

0        it's very good but there is very little lipsti...
1        it is very good, it is water proof :red_heart:...
2        the shade of lipstick is good but I have given...
3        lipstick shades are very good, kajal liners ar...
4        4 lipstick :lipstick: nighalaya aani 3 kajal p...
                               ...                        
24995                                       as the same as
24996                                            very good
24997                                            very good
24998                                            thank you
24999                                                     
Name: review_clean, Length: 25000, dtype: object

Demojize for easy tokenization

In [None]:
review_df.head()

Unnamed: 0,product_id,username,rating,review,date,helpful_count,review_translated,review_clean
0,4obtb7,Meesho User,4.0,It's very good but there is very little lipsti...,2025-03-26,11,It's very good but there is very little lipst...,it's very good but there is very little lipsti...
1,4obtb7,Vikash Paswan,4.0,Bahut achha hai water proof hai ❤️❤️❤️ lekin i...,2023-10-23,346,"It is very good, it is water proof ❤️❤️❤️ but...","it is very good, it is water proof :red_heart:..."
2,4obtb7,Priyanka Kashyap,4.0,Lipstick ka shade shi h pr 2 kajal hi diya h w...,2024-10-06,78,The shade of lipstick is good but I have give...,the shade of lipstick is good but I have given...
3,4obtb7,usha jaat ghintala Ghintala,4.0,Lipstick shades bhot ache hai kajal liners to ...,2025-03-21,23,"Lipstick shades are very good, kajal liners a...","lipstick shades are very good, kajal liners ar..."
4,4obtb7,Pushpa Marathe,4.0,4 lipstick 💄 nighalaya aani 3 kajal pan eaylin...,2024-11-25,12,4 lipstick 💄 nighalaya aani 3 kajal pan eyeli...,4 lipstick :lipstick: nighalaya aani 3 kajal p...


In [None]:
total_chars = review_df["review_clean"].str.len().sum()
print("Total characters:", total_chars)

Total characters: 1018201


In [None]:
cleaned_df = review_df.filter(
    ["product_id", "rating", "date", "helpful_count", "review_clean"]
)
cleaned_df = cleaned_df.rename(columns={"review_clean": "review"})

# drop rows where review is empty or only whitespace
cleaned_df = cleaned_df.loc[
    cleaned_df["review"].str.strip().astype(bool)
]

cleaned_df.head(10)

Unnamed: 0,product_id,rating,date,helpful_count,review
0,4obtb7,4.0,2025-03-26,11,it's very good but there is very little lipsti...
1,4obtb7,4.0,2023-10-23,346,"it is very good, it is water proof :red_heart:..."
2,4obtb7,4.0,2024-10-06,78,the shade of lipstick is good but I have given...
3,4obtb7,4.0,2025-03-21,23,"lipstick shades are very good, kajal liners ar..."
4,4obtb7,4.0,2024-11-25,12,4 lipstick :lipstick: nighalaya aani 3 kajal p...
5,4obtb7,4.0,2024-11-26,6,the lipstick shade is dry :..it: face_with_rol...
6,4obtb7,4.0,2025-04-01,0,wow so good color thankyou :smiling_face_with_...
7,4obtb7,5.0,2025-03-31,0,this is beautiful product and color shade is v...
8,4obtb7,4.0,2024-10-04,9,I have received this product and the liner is ...
9,4obtb7,4.0,2024-12-06,3,the product is good peking is nice but not a k...


In [None]:
cleaned_df.to_csv("./data/meesho_reviews_cleaned.csv", index=False)

# Task 2.1, 2.2

## Task 2.1 - Overall Sentiment Analysis
### Step 1: Finetune Sentiment Model
#### Use `./data/meesho_reviews_cleaned.csv`

### Install and Import Libraries

In [None]:
! pip install datasets evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
from datasets import Dataset
import evaluate

### Read dataset, split into train and testsets

In [None]:
# 1. Load your CSV file
# Make sure the CSV has columns: 'review' (text) and 'rating' (float or int 1-5)
df = pd.read_csv('./data/meesho_reviews_cleaned.csv')
df['review'] = df['review'].astype(str)

# 2. Prepare labels: convert rating (1-5) to 0-4
# This aligns with model's 5 output classes
df['label'] = df['rating'].astype(int) - 1

# 3. Split into train/validation sets
train_df, val_df = train_test_split(
    df[['review', 'label']],
    test_size=0.1,
    random_state=42,
    stratify=df['label']
)

# 4. Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset   = Dataset.from_pandas(val_df.reset_index(drop=True))

In [None]:
train_df.head()

Unnamed: 0,review,label
21911,superb :OK_hand_light_skin_tone:,4
15050,nice,4
9622,my lipstick is broken :frowning_face:,2
11680,worth buying,4
20239,"1 color change, only remaining sabu bhala",4


In [None]:
val_df.head()

Unnamed: 0,review,label
18401,very good product and color :red_heart: :rose:,4
10923,very good product I loved it :smiling_face_wit...,4
5928,nice you can go for it,4
1295,good quality awesome shades,3
16363,superb,3


### Instantiate Model and Trainers

In [None]:
# 5. Load pretrained tokenizer & model
model_name = "LiYuan/amazon-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5  # five classes for 1-5 stars
)

# 6. Tokenization function
def preprocess(examples):
    return tokenizer(
        examples['review'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

# 7. Tokenize datasets
train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset   = val_dataset.map(preprocess, batched=True)

# 8. Format for PyTorch
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label']
)
val_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label']
)

# 9. Define compute_metrics for evaluation
accuracy_metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=preds, references=labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

Map:   0%|          | 0/22025 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Map:   0%|          | 0/2448 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# 10. Set up training arguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir='./results',           # where to save checkpoints
    overwrite_output_dir=True,        # if you want to wipe './results' each run
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,

    # --- legacy names, per your doc ---
    eval_strategy='steps',            # do evaluation every eval_steps
    eval_steps=500,                   # how often to eval
    save_strategy='steps',            # checkpoint save strategy
    save_steps=500,                   # how often to checkpoint
    logging_strategy='steps',         # log every logging_steps
    logging_steps=100,

    load_best_model_at_end=True,      # pick the best checkpoint by your metric
    metric_for_best_model='accuracy', # which metric to compare

    # (optional) explicitly turn on train/eval
    do_train=True,
    do_eval=True,

    report_to="none",       # ← disable all logging backends (including wandb)
    run_name=None,          # ← avoid the default warning
)


# 11. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


### Train and Save Model

In [None]:
# 12. Start training
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.6654,0.663252,0.747549
1000,0.6345,0.644195,0.742647
1500,0.6189,0.655635,0.742647
2000,0.5974,0.65018,0.745507
2500,0.5848,0.641517,0.744281
3000,0.6155,0.658206,0.743056
3500,0.5929,0.661921,0.74469
4000,0.5764,0.652185,0.750817
4500,0.539,0.70988,0.745915
5000,0.5566,0.68041,0.746324


KeyboardInterrupt: 

In [None]:
# 13. Save the fine-tuned model
trainer.save_model('./model/finetuned_meesho_sentiment')

In [None]:
from transformers import pipeline

overall_sentiment_pipe = pipeline(task='text-classification', model="./results/checkpoint-4000")

Device set to use cuda:0


In [None]:
! zip finetuned_meesho_sentiment.zip ./results/checkpoint-4000/*

  adding: results/checkpoint-4000/config.json (deflated 56%)
  adding: results/checkpoint-4000/model.safetensors (deflated 7%)
  adding: results/checkpoint-4000/optimizer.pt (deflated 52%)
  adding: results/checkpoint-4000/rng_state.pth (deflated 25%)
  adding: results/checkpoint-4000/scheduler.pt (deflated 55%)
  adding: results/checkpoint-4000/special_tokens_map.json (deflated 80%)
  adding: results/checkpoint-4000/tokenizer_config.json (deflated 74%)
  adding: results/checkpoint-4000/tokenizer.json (deflated 69%)
  adding: results/checkpoint-4000/trainer_state.json (deflated 76%)
  adding: results/checkpoint-4000/training_args.bin (deflated 51%)
  adding: results/checkpoint-4000/vocab.txt (deflated 48%)


In [None]:
! du -h ./finetuned_meesho_sentiment.zip

1.2G	./finetuned_meesho_sentiment.zip


## Task 2.1, 2.2 - Analyse Overall and Aspect-Based Sentiment and Collect Results
#### Use `./data/meesho_reviews_cleaned.csv`

### Import Libraries, Create Helper Functions and Instantiate Models

In [None]:
from transformers import pipeline

overall_sentiment_pipe = pipeline(task='text-classification', model="./results/checkpoint-4000")

aspect_sentiment_pipe = pipeline(task='ner', aggregation_strategy='simple', model="gauneg/roberta-base-absa-ate-sentiment")

Device set to use cuda:0


config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def get_sentiment(match_str: str, positive: str="pos", negative:str="neg", neutral:str="neu")->str:
  if match_str == positive:
    return "POSITIVE"
  if match_str == negative:
    return "NEGATIVE"
  if match_str == neutral:
    return "NEUTRAL"
  return "None"

### Set Input Dataframe

In [None]:
review_df = pd.read_csv("./data/meesho_reviews_cleaned.csv", encoding="utf-8")
review_df.head()

Unnamed: 0,product_id,rating,date,helpful_count,review
0,4obtb7,4.0,2025-03-26,11,it's very good but there is very little lipsti...
1,4obtb7,4.0,2023-10-23,346,"it is very good, it is water proof :red_heart:..."
2,4obtb7,4.0,2024-10-06,78,the shade of lipstick is good but I have given...
3,4obtb7,4.0,2025-03-21,23,"lipstick shades are very good, kajal liners ar..."
4,4obtb7,4.0,2024-11-25,12,4 lipstick :lipstick: nighalaya aani 3 kajal p...


### Calculate Sentiments

In [None]:
from tqdm import tqdm

# Compute labels
review_df["label"] = [
    int(y["label"][0]) for y in overall_sentiment_pipe(
      x for x in tqdm(review_df["review"])
    )
  ]

100%|██████████| 24473/24473 [04:05<00:00, 99.64it/s] 


In [None]:
review_df["absa_result"] = [
    y for y in aspect_sentiment_pipe(
      x for x in tqdm(review_df["review"])
    )
  ]

100%|██████████| 24473/24473 [04:29<00:00, 90.91it/s] 


In [None]:
from typing import Any

def process_absa_result(word_list: list[dict[str, Any]]) -> dict[str, list[str]]:
  processed_words: dict[str, list[str]] = {
      "pos" : [],
      "neg" : [],
      "neu" : []
  }

  for word_info in word_list:
    sentiment, word = word_info["entity_group"], word_info["word"]
    processed_words[sentiment].append(word)

  return processed_words

In [None]:
review_df["processed_absa_result"] = review_df["absa_result"].apply(process_absa_result)

In [None]:
def get_pos(inp_dict: dict[str, list[str]])-> list[str]:
  return inp_dict["pos"]

def get_neg(inp_dict: dict[str, list[str]])-> list[str]:
  return inp_dict["neg"]

def get_neu(inp_dict: dict[str, list[str]])-> list[str]:
  return inp_dict["neu"]

In [None]:
review_df["pos_words"] = review_df["processed_absa_result"].apply(get_pos)
review_df["neg_words"] = review_df["processed_absa_result"].apply(get_neg)
review_df["neu_words"] = review_df["processed_absa_result"].apply(get_neu)
review_df["label"] = review_df["label"].astype("float")

In [None]:
review_df.head()

Unnamed: 0,product_id,rating,date,helpful_count,review,label,absa_result,processed_absa_result,pos_words,neg_words,neu_words
0,4obtb7,4.0,2025-03-26,11,it's very good but there is very little lipsti...,5.0,"[{'entity_group': 'neg', 'score': 0.77126294, ...","{'pos': [], 'neg': [' lipstick'], 'neu': []}",[],[ lipstick],[]
1,4obtb7,4.0,2023-10-23,346,"it is very good, it is water proof :red_heart:...",4.0,"[{'entity_group': 'neg', 'score': 0.597518, 'w...","{'pos': [], 'neg': [' k'], 'neu': []}",[],[ k],[]
2,4obtb7,4.0,2024-10-06,78,the shade of lipstick is good but I have given...,4.0,"[{'entity_group': 'pos', 'score': 0.5625188, '...","{'pos': [' shade of lipstick'], 'neg': [' pack...",[ shade of lipstick],[ packaging],[]
3,4obtb7,4.0,2025-03-21,23,"lipstick shades are very good, kajal liners ar...",5.0,"[{'entity_group': 'pos', 'score': 0.6172742, '...","{'pos': ['lipstick shades', ' k', 'al liners']...","[lipstick shades, k, al liners]",[ lipstick],[]
4,4obtb7,4.0,2024-11-25,12,4 lipstick :lipstick: nighalaya aani 3 kajal p...,4.0,[],"{'pos': [], 'neg': [], 'neu': []}",[],[],[]


In [None]:
result_df = review_df.drop(["absa_result", "processed_absa_result"], axis = 1)
result_df.head()

Unnamed: 0,product_id,rating,date,helpful_count,review,label,pos_words,neg_words,neu_words
0,4obtb7,4.0,2025-03-26,11,it's very good but there is very little lipsti...,5.0,[],[ lipstick],[]
1,4obtb7,4.0,2023-10-23,346,"it is very good, it is water proof :red_heart:...",4.0,[],[ k],[]
2,4obtb7,4.0,2024-10-06,78,the shade of lipstick is good but I have given...,4.0,[ shade of lipstick],[ packaging],[]
3,4obtb7,4.0,2025-03-21,23,"lipstick shades are very good, kajal liners ar...",5.0,"[lipstick shades, k, al liners]",[ lipstick],[]
4,4obtb7,4.0,2024-11-25,12,4 lipstick :lipstick: nighalaya aani 3 kajal p...,4.0,[],[],[]


In [None]:
result_df.to_csv("./data/meesho_sentiment_results.csv")

# Pre-Process Data for Task 2.3 and 3

In [None]:
import pandas as pd

In [None]:
review_df = pd.read_csv("./data/meesho_sentiment_results.csv", encoding="utf-8")
review_df.head()

Unnamed: 0.1,Unnamed: 0,product_id,rating,date,helpful_count,review,label,pos_words,neg_words,neu_words
0,0,4obtb7,4.0,2025-03-26,11,it's very good but there is very little lipsti...,5.0,[],[' lipstick'],[]
1,1,4obtb7,4.0,2023-10-23,346,"it is very good, it is water proof :red_heart:...",4.0,[],[' k'],[]
2,2,4obtb7,4.0,2024-10-06,78,the shade of lipstick is good but I have given...,4.0,[' shade of lipstick'],[' packaging'],[]
3,3,4obtb7,4.0,2025-03-21,23,"lipstick shades are very good, kajal liners ar...",5.0,"['lipstick shades', ' k', 'al liners']",[' lipstick'],[]
4,4,4obtb7,4.0,2024-11-25,12,4 lipstick :lipstick: nighalaya aani 3 kajal p...,4.0,[],[],[]


Function to lemmatize words

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()


def _treebank_to_wordnet_pos(tb_tag: str) -> str:
    """Map NLTK POS tags to the simpler WordNet set: n, v, a, r."""
    if tb_tag.startswith("J"):
        return wordnet.ADJ
    elif tb_tag.startswith("V"):
        return wordnet.VERB
    elif tb_tag.startswith("N"):
        return wordnet.NOUN
    elif tb_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun


def lemmatize_list(words: list[str]) -> list[str]:
    """
    Given a list of tokens (no context needed),
    POS-tag them, map to WordNet POS, and lemmatize.
    """
    # 1) tag
    tagged = nltk.pos_tag(words)
    # 2) lemmatize with appropriate POS
    lemmas = []
    for word, tb_tag in tagged:
        wn_pos = _treebank_to_wordnet_pos(tb_tag)
        lemmas.append(lemmatizer.lemmatize(word, pos=wn_pos))
    return lemmas

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...


Clean Word Lists

In [None]:
def clean_str(str_l: str) -> set[str]:
    word_list: list[str] = eval(str_l)
    clean_words = []

    for word in word_list:
        split_words = word.split()
        split_words = [w.strip() for w in split_words]
        split_words = [w for w in split_words if len(w) > 2]
        clean_words.extend(split_words)

    clean_words = lemmatize_list(clean_words)

    return set(clean_words)

In [None]:
review_df["pos_words"] = review_df["pos_words"].apply(clean_str)
review_df["neg_words"] = review_df["neg_words"].apply(clean_str)
review_df["neu_words"] = review_df["neu_words"].apply(clean_str)
review_df.head()

Unnamed: 0.1,Unnamed: 0,product_id,rating,date,helpful_count,review,label,pos_words,neg_words,neu_words
0,0,4obtb7,4.0,2025-03-26,11,it's very good but there is very little lipsti...,5.0,{},{lipstick},{}
1,1,4obtb7,4.0,2023-10-23,346,"it is very good, it is water proof :red_heart:...",4.0,{},{},{}
2,2,4obtb7,4.0,2024-10-06,78,the shade of lipstick is good but I have given...,4.0,"{shade, lipstick}",{packaging},{}
3,3,4obtb7,4.0,2025-03-21,23,"lipstick shades are very good, kajal liners ar...",5.0,"{shade, lipstick, liner}",{lipstick},{}
4,4,4obtb7,4.0,2024-11-25,12,4 lipstick :lipstick: nighalaya aani 3 kajal p...,4.0,{},{},{}


## Calculate User Review Scores
$$word\_freq\_{p}(w) = \sum_{r \in R_p, w \in w_r} ln(1 + h_r); w \in \{pos\_words, neg\_words, neu\_words\}$$
$$tot\_freq_p{w} = pos\_freq_p(w) - neg\_freq_p(w) + 0.1 \cdot neu\_freq_p(w)$$
$$score_{p}(w) = \frac{\sum_{r \in R_p} label_r \cdot ln(1 + h_r)}{|R_p|}$$


In [None]:
from collections import defaultdict
import math

pos_freq = {}
neg_freq = {}
neu_freq = {}
tot_freq = {}
score_count = {}
top_reviews = {}

# Group by product
for pid, grp in review_df.groupby("product_id"):
    p_counter = defaultdict(float)
    n_counter = defaultdict(float)
    u_counter = defaultdict(float)  # "u" for neutral
    t_counter = defaultdict(float)  # "t" for total
    score = 0.0
    count = 0

    for _, row in grp.iterrows():
        weight = math.log(1 + row["helpful_count"])
        # add weight for every word in those sets
        for w in row["pos_words"]:
            p_counter[w] += weight
        for w in row["neg_words"]:
            n_counter[w] += weight
        for w in row["neu_words"]:
            u_counter[w] += weight

        score += row["label"] * weight
        count += 1

    for w, val in p_counter.items():
        t_counter[w] += val

    for w, val in n_counter.items():
        t_counter[w] -= val

    for w, val in u_counter.items():
        t_counter[w] += 0.1 * val

    top5 = grp.sort_values("helpful_count", ascending=False).head(5)["review"].tolist()
    # join with a space (or "\n\n" if you prefer paragraphs)
    top_reviews[pid] = " ".join(top5)

    pos_freq[pid] = dict(p_counter)
    neg_freq[pid] = dict(n_counter)
    neu_freq[pid] = dict(u_counter)
    tot_freq[pid] = dict(t_counter)
    score_count[pid] = score / count

# Turn into a small DataFrame for display
result = pd.DataFrame(
    {
        "product_id": list(pos_freq),
        "pos_freq": [pos_freq[pid] for pid in pos_freq],
        "neg_freq": [neg_freq[pid] for pid in pos_freq],
        "neu_freq": [neu_freq[pid] for pid in pos_freq],
        "tot_freq": [tot_freq[pid] for pid in pos_freq],
        "user_score": [score_count[pid] for pid in pos_freq],
        "top_reviews": [top_reviews[pid] for pid in pos_freq],
    }
)

result.head()

Unnamed: 0,product_id,pos_freq,neg_freq,neu_freq,tot_freq,user_score,top_reviews
0,1335kh,"{'packaging': 7.4547199493640015, 'pack': 15.8...","{'lipstick': 24.19884814440708, 'quantity': 1....","{'tone': 0.0, 'price': 6.633318433280377, 'lip...","{'packaging': 7.4547199493640015, 'pack': 15.8...",3.492094,"it is very good, I liked it very much, the siz..."
1,1504gk,"{'shade': 4.564348191467836, 'product': 38.756...","{'color': 8.451053388911692, 'finish': 0.0, 'c...","{'bill': 3.1354942159291497, 'colour': 0.0, 'c...","{'shade': 4.564348191467836, 'product': 38.063...",1.770107,so beautiful so alligator just like wow ??????...
2,157rai,"{'price': 40.84930966632612, 'quality': 26.634...","{'lipstick': 29.89867021784719, 'color': 9.552...","{'price': 2.6390573296152584, 'color': 0.69314...","{'price': 37.781010889112444, 'quality': 24.55...",3.033708,it is okay.... it is soft :thumbs_up: but it i...
3,1axxae,"{'color': 7.847762537473608, 'product': 15.094...","{'color': 0.6931471805599453, 'product': 0.693...","{'bon': 1.791759469228055, 'lipstick': 2.77258...","{'color': 7.154615356913663, 'product': 14.401...",0.604581,"I asked for 2 lipsticks, both were very nice, ..."
4,1blfrk,"{'pigment': 0.0, 'shade': 14.353686722313057, ...","{'lipstick': 13.704024653692851, 'size': 8.494...","{'lipstick': 0.6931471805599453, 'tone': 1.098...","{'pigment': 0.0, 'shade': 14.353686722313057, ...",0.967856,packing was good... new lipstick users can buy...


Extract top 3 and bottom 3 aspects for words

In [None]:
import heapq


def top_and_bottom_3(
    scores: dict[str, float],
) -> tuple[list[str], list[str]]:
    """
    Return two lists:
      - top k keys by value (highest scores)
      - bottom k keys by value (lowest scores)
    If there are fewer than k items, you get as many as exist.
    """
    k: int = 3
    # get the k keys with largest values
    top_k = set(heapq.nlargest(k, scores, key=scores.get))
    # get the k keys with smallest values
    bottom_k = set(heapq.nsmallest(k, scores, key=scores.get)) - top_k

    return list(top_k), list(bottom_k)

In [None]:
buffer = result["tot_freq"].apply(top_and_bottom_3)
result["pos_3"] = buffer.apply(lambda x: x[0])
result["neg_3"] = buffer.apply(lambda x: x[1])
result.head()

Unnamed: 0,product_id,pos_freq,neg_freq,neu_freq,tot_freq,user_score,top_reviews,pos_3,neg_3
0,1335kh,"{'packaging': 7.4547199493640015, 'pack': 15.8...","{'lipstick': 24.19884814440708, 'quantity': 1....","{'tone': 0.0, 'price': 6.633318433280377, 'lip...","{'packaging': 7.4547199493640015, 'pack': 15.8...",3.492094,"it is very good, I liked it very much, the siz...","[lipstick, color, product]","[quantity, lip, size]"
1,1504gk,"{'shade': 4.564348191467836, 'product': 38.756...","{'color': 8.451053388911692, 'finish': 0.0, 'c...","{'bill': 3.1354942159291497, 'colour': 0.0, 'c...","{'shade': 4.564348191467836, 'product': 38.063...",1.770107,so beautiful so alligator just like wow ??????...,"[quality, color, product]","[mate, colour, price]"
2,157rai,"{'price': 40.84930966632612, 'quality': 26.634...","{'lipstick': 29.89867021784719, 'color': 9.552...","{'price': 2.6390573296152584, 'color': 0.69314...","{'price': 37.781010889112444, 'quality': 24.55...",3.033708,it is okay.... it is soft :thumbs_up: but it i...,"[color, product, price]","[quant, transfer, size]"
3,1axxae,"{'color': 7.847762537473608, 'product': 15.094...","{'color': 0.6931471805599453, 'product': 0.693...","{'bon': 1.791759469228055, 'lipstick': 2.77258...","{'color': 7.154615356913663, 'product': 14.401...",0.604581,"I asked for 2 lipsticks, both were very nice, ...","[lipstick, color, product]","[gold, pigmentation, finish]"
4,1blfrk,"{'pigment': 0.0, 'shade': 14.353686722313057, ...","{'lipstick': 13.704024653692851, 'size': 8.494...","{'lipstick': 0.6931471805599453, 'tone': 1.098...","{'pigment': 0.0, 'shade': 14.353686722313057, ...",0.967856,packing was good... new lipstick users can buy...,"[shade, color, product]","[smell, size, price]"


Merge with prod dataset

In [None]:
prod_df = pd.read_csv("./data/scraped/meesho_product_details.csv")
prod_df = prod_df.rename(
    columns={col: col.lower().replace(" ", "_") for col in prod_df.columns}
)
prod_df["price"] = prod_df["price"].apply(lambda x: int(x[1:])).astype("float")
prod_df["overall_rating"] = prod_df["overall_rating"].astype("float")

def get_review_rating_counts(inp: str)-> dict[str, int]:
    buf = inp.split()
    return {"ratings":int(buf[0]),"reviews": int(buf[2])}

buff = prod_df["reviews_count"].apply(get_review_rating_counts)

prod_df["ratings"] = buff.apply(lambda x: x["ratings"]).astype("float")
prod_df["reviews"] = buff.apply(lambda x: x["reviews"]).astype("float")

prod_df.head()

Unnamed: 0,product_id,product_name,product_link,overall_rating,price,reviews_count,product_details,seller_name,ratings,reviews
0,4obtb7,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1",https://www.meesho.com/hipbrat-red-matteliquid...,3.8,103.0,"19451 Ratings, 6351 Reviews","Name : Hipbrat, Red/ MatteLiquid/ LipstickPack...",Not available,19451.0,6351.0
1,5k6fjf,Apple lipstick 2,https://www.meesho.com/apple-lipstick-2/p/5k6fjf,3.7,106.0,"10146 Ratings, 2525 Reviews",Name : Apple lipstick 2|Product Name : Apple l...,Not available,10146.0,2525.0
2,5b49ob,MARS Ultra Pigmented Super Soft Ultra Matte Li...,https://www.meesho.com/mars-ultra-pigmented-su...,4.1,350.0,"23253 Ratings, 8905 Reviews",Name : MARS Ultra Pigmented Super Soft Ultra M...,Not available,23253.0,8905.0
3,5utebe,Ronzille Long Lasting Waterproof Non Transfer ...,https://www.meesho.com/ronzille-long-lasting-w...,4.3,257.0,"86674 Ratings, 32852 Reviews",Name : Ronzille Long Lasting Waterproof Non Tr...,Not available,86674.0,32852.0
4,6jppoo,RED Edition Set of 12 Liquid Lipsticks Matte F...,https://www.meesho.com/red-edition-set-of-12-l...,3.9,197.0,"13710 Ratings, 4802 Reviews",Name : RED Edition Set of 12 Liquid Lipsticks ...,Not available,13710.0,4802.0


In [None]:
merged_df = pd.merge(prod_df, result, on="product_id", how="inner")

merged_df = merged_df.drop(
    labels=["product_link", "seller_name", "pos_freq", "neg_freq", "neu_freq", "tot_freq", "reviews_count"], axis=1
)

merged_df.head()

Unnamed: 0,product_id,product_name,overall_rating,price,product_details,ratings,reviews,user_score,top_reviews,pos_3,neg_3
0,4obtb7,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1",3.8,103.0,"Name : Hipbrat, Red/ MatteLiquid/ LipstickPack...",19451.0,6351.0,1.448782,"it is very good, it is water proof :red_heart:...","[shade, delivery, product]","[packaging, eyeliner, eyel]"
1,5k6fjf,Apple lipstick 2,3.7,106.0,Name : Apple lipstick 2|Product Name : Apple l...,10146.0,2525.0,2.723306,nice ?product beautiful it's really good quali...,"[quality, product, price]","[lipstick, damage, application]"
2,5b49ob,MARS Ultra Pigmented Super Soft Ultra Matte Li...,4.1,350.0,Name : MARS Ultra Pigmented Super Soft Ultra M...,23253.0,8905.0,2.87422,shades are so beautiful and so afortable and p...,"[shade, lipstick, product]","[glass, ote, coverage]"
3,5utebe,Ronzille Long Lasting Waterproof Non Transfer ...,4.3,257.0,Name : Ronzille Long Lasting Waterproof Non Tr...,86674.0,32852.0,2.000668,very nice very beautiful perfume. :cherry_blos...,"[colour, lipstick, product]","[quantity, box, size]"
4,6jppoo,RED Edition Set of 12 Liquid Lipsticks Matte F...,3.9,197.0,Name : RED Edition Set of 12 Liquid Lipsticks ...,13710.0,4802.0,2.064547,wow awesome I am very happy and unacceptable I...,"[colour, product, price]","[quantity, gel, amount]"


Calculate overall product sentiment

In [None]:
import numpy as np

merged_df["score"] = (
    merged_df["overall_rating"]
    * merged_df["user_score"]
    * (
        np.log(1 + merged_df["price"])
        + np.log(1 + merged_df["ratings"])
        + np.log(1 + merged_df["reviews"])
    )
)

merged_df.head()

Unnamed: 0,product_id,product_name,overall_rating,price,product_details,ratings,reviews,user_score,top_reviews,pos_3,neg_3,score,sentiment
0,4obtb7,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1",3.8,103.0,"Name : Hipbrat, Red/ MatteLiquid/ LipstickPack...",19451.0,6351.0,1.448782,"it is very good, it is water proof :red_heart:...","[shade, delivery, product]","[packaging, eyeliner, eyel]",128.146478,0
1,5k6fjf,Apple lipstick 2,3.7,106.0,Name : Apple lipstick 2|Product Name : Apple l...,10146.0,2525.0,2.723306,nice ?product beautiful it's really good quali...,"[quality, product, price]","[lipstick, damage, application]",218.978207,0
2,5b49ob,MARS Ultra Pigmented Super Soft Ultra Matte Li...,4.1,350.0,Name : MARS Ultra Pigmented Super Soft Ultra M...,23253.0,8905.0,2.87422,shades are so beautiful and so afortable and p...,"[shade, lipstick, product]","[glass, ote, coverage]",294.71948,0
3,5utebe,Ronzille Long Lasting Waterproof Non Transfer ...,4.3,257.0,Name : Ronzille Long Lasting Waterproof Non Tr...,86674.0,32852.0,2.000668,very nice very beautiful perfume. :cherry_blos...,"[colour, lipstick, product]","[quantity, box, size]",235.053503,0
4,6jppoo,RED Edition Set of 12 Liquid Lipsticks Matte F...,3.9,197.0,Name : RED Edition Set of 12 Liquid Lipsticks ...,13710.0,4802.0,2.064547,wow awesome I am very happy and unacceptable I...,"[colour, product, price]","[quantity, gel, amount]",187.534707,0


In [None]:
s_mean, s_std, s_max, s_min = merged_df["score"].mean(), merged_df["score"].std(), merged_df["score"].max(), merged_df["score"].min()
s_mean, s_std, s_max, s_min

(134.52813035938556, 86.60946868969714, 383.2617404538208, 37.558121832691135)

In [None]:
merged_df["score"] = (merged_df["score"] - s_mean) / s_std
merged_df["sentiment"] = merged_df["score"] >= 0
merged_df["sentiment"] = merged_df["sentiment"].astype(int)

merged_df.head(10)

Unnamed: 0,product_id,product_name,overall_rating,price,product_details,ratings,reviews,user_score,top_reviews,pos_3,neg_3,score,sentiment
0,4obtb7,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1",3.8,103.0,"Name : Hipbrat, Red/ MatteLiquid/ LipstickPack...",19451.0,6351.0,1.448782,"it is very good, it is water proof :red_heart:...","[shade, delivery, product]","[packaging, eyeliner, eyel]",-0.073683,0
1,5k6fjf,Apple lipstick 2,3.7,106.0,Name : Apple lipstick 2|Product Name : Apple l...,10146.0,2525.0,2.723306,nice ?product beautiful it's really good quali...,"[quality, product, price]","[lipstick, damage, application]",0.975067,1
2,5b49ob,MARS Ultra Pigmented Super Soft Ultra Matte Li...,4.1,350.0,Name : MARS Ultra Pigmented Super Soft Ultra M...,23253.0,8905.0,2.87422,shades are so beautiful and so afortable and p...,"[shade, lipstick, product]","[glass, ote, coverage]",1.849582,1
3,5utebe,Ronzille Long Lasting Waterproof Non Transfer ...,4.3,257.0,Name : Ronzille Long Lasting Waterproof Non Tr...,86674.0,32852.0,2.000668,very nice very beautiful perfume. :cherry_blos...,"[colour, lipstick, product]","[quantity, box, size]",1.160674,1
4,6jppoo,RED Edition Set of 12 Liquid Lipsticks Matte F...,3.9,197.0,Name : RED Edition Set of 12 Liquid Lipsticks ...,13710.0,4802.0,2.064547,wow awesome I am very happy and unacceptable I...,"[colour, product, price]","[quantity, gel, amount]",0.612018,1
5,4oywba,BEAUTY Professional Color Sensational Liquid L...,4.1,102.0,Name : BEAUTY Professional Color Sensational L...,12359.0,4882.0,2.082318,omg this is a great product I never thought th...,"[colour, quality, product]","[quantity, mast, size]",0.669633,1
6,6t642z,Seven Seas 2 in 1 Lipstick | Full Coverage | C...,4.2,146.0,Name : Seven Seas 2 in 1 Lipstick | Full Cover...,5232.0,2280.0,0.729163,it's good nude shade very nice I am happy good...,"[colour, lipstick, product]","[quantity, del, ivery]",-0.800623,0
7,5vndv2,"BROWN BLUSH Matte Mini, Combo Of Different Col...",4.0,121.0,"Name : BROWN BLUSH Matte Mini, Combo Of Differ...",29103.0,13993.0,2.193779,"I have this lipstick which is 8 pieces, it is ...","[shade, lipstick, product]","[calorie, delivery, size]",0.942097,1
8,529ktd,LIPSTIK PAKEOF 4 MATT,3.8,95.0,Name : LIPSTIK PAKEOF 4 MATT|Product Name : LI...,6250.0,1522.0,0.788468,the lipsticks are very pigmented good quality ...,"[colour, lipstick, product]","[size, mini, face]",-0.839483,0
9,59mkx0,Ronzille Long Lasting Hydrating Mini Lipstick-...,3.7,130.0,Name : Ronzille Long Lasting Hydrating Mini Li...,16242.0,6352.0,1.664312,very cute very beautiful but very very small :...,"[shade, color, product]","[size, dealer, face]",0.105304,1


Store Results

In [None]:
merged_df.to_csv("./data/scored_products.csv", index=False)

## Export Summaries for Task 2.3 and Task 3

In [None]:
def summarize_product(row):
    pos = ", ".join(row["pos_3"]) if isinstance(row["pos_3"], list) else str(row["pos_3"])
    neg = ", ".join(row["neg_3"]) if isinstance(row["neg_3"], list) else str(row["neg_3"])
    summary = (
        f"{row['product_name']} | {row['product_details']} | "
        f"Top Reviews: {row['top_reviews']} | "
        f"Positive Aspects: {pos} | Negative Aspects: {neg}"
    )
    return summary

summary_df = merged_df[["product_name", "product_details", "top_reviews", "pos_3", "neg_3", "sentiment"]].copy()
summary_df["prod_summary"] = summary_df.apply(summarize_product, axis=1)

# Keep only the 2 required columns
summary_df = summary_df[["prod_summary", "sentiment"]]

# View the result
summary_df.head()

Unnamed: 0,prod_summary,sentiment
0,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1 |...",0
1,Apple lipstick 2 | Name : Apple lipstick 2|Pro...,1
2,MARS Ultra Pigmented Super Soft Ultra Matte Li...,1
3,Ronzille Long Lasting Waterproof Non Transfer ...,1
4,RED Edition Set of 12 Liquid Lipsticks Matte F...,1


In [None]:
summary_df.to_csv("./data/prod_sentiment.csv", index=False)

# Task 2.3
#### Use `./data/prod_sentiment.csv`

IMport Libraries and Instantiate Model

In [25]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# 1) Load tokenizer & model (will automatically place layers on GPU/CPU)
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model     = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-base", device_map="auto"
)

device = next(model.parameters()).device  # e.g. "cuda:0"

Read Data

In [None]:
import pandas as pd
prod_df = pd.read_csv("./data/prod_sentiment.csv")
prod_df.head()

Unnamed: 0.1,Unnamed: 0,prod_summary,sentiment,gen_summary
0,0,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1 |...",0,"4 LIPSTIC ANDEYELINER KAJAL PACK. Hipbrat, Red..."
1,1,Apple lipstick 2 | Name : Apple lipstick 2|Pro...,1,"Red Mahroom Apple lipstick, Country of Origin ..."
2,2,MARS Ultra Pigmented Super Soft Ultra Matte Li...,1,Mars Super Soft Creamy Matte Lipstick is the u...
3,3,Ronzille Long Lasting Waterproof Non Transfer ...,1,Ronzille Long Lasting Waterproof Non Transfer ...
4,4,RED Edition Set of 12 Liquid Lipsticks Matte F...,1,Red Edition Set of 12 Liquid Lipsticks Matte F...


### Review Product

In [None]:
# 2) Define a helper that crafts a prompt, runs generation, and decodes
def analyze_product(text: str) -> str:
    # You can tweak the instruction any way you like:
    prompt = (
        "You are a product expert. "
        "Given the following product summary, write a brief analysis with pros, cons, "
        "and a final recommendation:\n\n"
        f"{text}"
    )

    # Tokenize (truncate/pad to fit T5’s 512‑token limit if needed)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    # Generate — you can adjust max_new_tokens, temperature, etc.
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # Decode & strip special tokens
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

# 3) Apply it to your DataFrame
# replace `.apply` with `.progress_apply`
prod_df["model_review"] = prod_df["prod_summary"].progress_apply(analyze_product)

In [29]:
prod_df.head()

Unnamed: 0.1,Unnamed: 0,prod_summary,sentiment,gen_summary,model_review
0,0,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1 |...",0,"4 LIPSTIC ANDEYELINER KAJAL PACK. Hipbrat, Red...",The Hipbrat is a lipstick that's made of red m...
1,1,Apple lipstick 2 | Name : Apple lipstick 2|Pro...,1,"Red Mahroom Apple lipstick, Country of Origin ...",The Apple lipstick 2 is a waterproof lipstick....
2,2,MARS Ultra Pigmented Super Soft Ultra Matte Li...,1,Mars Super Soft Creamy Matte Lipstick is the u...,– This is a pretty good lipstick for a pricey ...
3,3,Ronzille Long Lasting Waterproof Non Transfer ...,1,Ronzille Long Lasting Waterproof Non Transfer ...,"Unlike many lipsticks in India, Ronzille Long ..."
4,4,RED Edition Set of 12 Liquid Lipsticks Matte F...,1,Red Edition Set of 12 Liquid Lipsticks Matte F...,This product is a great value for the money. I...


In [None]:
prod_df.to_csv("./data/prod_sentiment.csv", index=False)

In [32]:
prod_df["gen_summary"][0]

'4 LIPSTIC ANDEYELINER KAJAL PACK. Hipbrat, Red/ MatteLiquid/ LipstickPack of 1. The shade is good but I have given only 2 kajals, those too were not waterproof and the packaging was also not good so I have exchanged them best and good in less pric and easy to carry lipsticks eyeliner is also good but kajal is very bad. Negative Aspects: packaging, eyeliner, eyeliners.'

# Task 3 - Product Sentiment Clasification
#### Use `./data/prod_sentiment.csv`

In [33]:
def analyze_sentiment(text: str) -> str:
    # Short, clear instruction for classification
    prompt = (
        "Classify the sentiment of the following product review as Positive or Negative:\n\n"
        f"{text}"
    )

    # Tokenize the input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    # Generate a short, deterministic answer
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,         # one word is enough
        do_sample=False,          # deterministic output
        temperature=0.0,          # avoid randomness
    )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return result

In [34]:
from tqdm.auto import tqdm
tqdm.pandas()

# 3) Apply it to your DataFrame
# replace `.apply` with `.progress_apply`
prod_df["pred_sentiment"] = prod_df["prod_summary"].progress_apply(analyze_sentiment)

  0%|          | 0/50 [00:00<?, ?it/s]



In [None]:
prod_df.to_csv("./data/prod_sentiment.csv", index=False)

In [40]:
prod_df.head()

Unnamed: 0,prod_summary,sentiment,model_review,pred_sentiment
0,"Hipbrat, Red/ MatteLiquid/ LipstickPack of 1 |...",0,The Hipbrat is a lipstick that's made of red m...,Positive
1,Apple lipstick 2 | Name : Apple lipstick 2|Pro...,1,The Apple lipstick 2 is a waterproof lipstick....,Neg
2,MARS Ultra Pigmented Super Soft Ultra Matte Li...,1,– This is a pretty good lipstick for a pricey ...,Positive
3,Ronzille Long Lasting Waterproof Non Transfer ...,1,"Unlike many lipsticks in India, Ronzille Long ...",Positive
4,RED Edition Set of 12 Liquid Lipsticks Matte F...,1,This product is a great value for the money. I...,Positive
