## Data Loading and Prepping

In [None]:
%%capture
!pip install -q transformers datasets pandas
!pip install -q transformers accelerate evaluate bert_score nltk rouge_score textstat

In [None]:
import pandas as pd
import os
import kagglehub

# Download the Kaggle dataset; `path` is the local directory returned by kagglehub.
path = kagglehub.dataset_download("piyushjain16/amazon-product-data")

# Build the CSV location with os.path.join instead of hand-concatenating separators.
df = pd.read_csv(os.path.join(path, "dataset", "train.csv"))
df.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [None]:
df.columns

Index(['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION',
       'PRODUCT_TYPE_ID', 'PRODUCT_LENGTH'],
      dtype='object')

In [None]:
# Normalize the column labels: lowercase them and turn spaces into underscores
df.columns = [label.lower().replace(' ', '_') for label in df.columns]
df.head()

Unnamed: 0,product_id,title,bullet_points,description,product_type_id,product_length
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [None]:
# Quick structural overview of the frame: dtypes, summary statistics, shape and null counts.
divider = '---' * 20

df.info()
print(divider)
print(df.describe())
print(divider)
print('Dataset shape:', df.shape)
print(divider)
print('Missing Values:')
# Last expression of the cell: per-column null counts, largest first.
df.isna().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249698 entries, 0 to 2249697
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   product_id       int64  
 1   title            object 
 2   bullet_points    object 
 3   description      object 
 4   product_type_id  int64  
 5   product_length   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 103.0+ MB
------------------------------------------------------------
         product_id  product_type_id  product_length
count  2.249698e+06     2.249698e+06    2.249698e+06
mean   1.499795e+06     4.000456e+03    4.071839e+03
std    8.661944e+05     3.966146e+03    1.351685e+06
min    1.000000e+00     0.000000e+00    1.000000e+00
25%    7.494795e+05     2.300000e+02    5.118110e+02
50%    1.499558e+06     2.916000e+03    6.630000e+02
75%    2.250664e+06     6.403000e+03    1.062992e+03
max    2.999999e+06     1.342000e+04    1.885801e+09
----------------------------------------------

Unnamed: 0,0
description,1157382
bullet_points,837366
title,13
product_id,0
product_type_id,0
product_length,0


In [None]:
# Remove every row that has a missing value in any column, reporting the shape on both sides
shape_before = df.shape
print("Shape before dropping missing values:", shape_before)
df = df.dropna()
print("Shape after dropping missing values:", df.shape)

Shape before dropping missing values: (2249698, 6)
Shape after dropping missing values: (1038458, 6)


In [None]:
# Keep only the columns used downstream, then draw a fixed-seed sample of 50 rows
selected_columns = ['title', 'description', 'bullet_points', 'product_type_id', 'product_length']
df = df[selected_columns].dropna()
df = df.sample(n=50, random_state=42).reset_index(drop=True)

In [None]:
df.shape

(50, 5)

## Baseline Model

In [None]:
# Reference descriptions: the human-written `description` column as a list of
# strings, with any missing value replaced by "".
# NOTE(review): this module-level `refs` is not referenced again in the visible
# part of the notebook — confirm whether it is still needed.
refs = df['description'].fillna("").tolist()

# 🧱 Baseline description generation
def generate_baseline_description(row):
    """Build a template-based description from one product row.

    Reads ``title``, ``bullet_points``, ``product_type_id`` and
    ``product_length`` from ``row``. Each value is converted to ``str``; a
    null value (per ``pd.notnull``) is replaced by a fixed fallback text.

    Returns:
        str: the filled-in two-sentence template.
    """
    def _text_or(value, fallback):
        # Stringify a present value; otherwise use the supplied placeholder.
        return str(value) if pd.notnull(value) else fallback

    title = _text_or(row['title'], "Unknown Product")
    bullet_points = _text_or(row['bullet_points'], "No details available")
    product_type = _text_or(row['product_type_id'], "item")
    length = _text_or(row['product_length'], "unspecified")

    first_sentence = f"This {product_type} titled '{title}' features: {bullet_points}. "
    second_sentence = f"The product length is {length}."
    return first_sentence + second_sentence

# Apply baseline generation: call generate_baseline_description once per row
# (axis=1) and store the resulting text in a new `generated_baseline` column.
df['generated_baseline'] = df.apply(generate_baseline_description, axis=1)
# Plain-list copy of the generated texts, with missing values replaced by "".
# NOTE(review): `gens_baseline` is not referenced again in the visible part of
# the notebook — confirm whether it is still needed.
gens_baseline = df['generated_baseline'].fillna("").tolist()

In [None]:
df[['title', 'description','generated_baseline']]

Unnamed: 0,title,description,generated_baseline
0,"Plane Light System, Plastic + Metal Taxi Light...",Features:&nbsp;<br> Full set of bright LED lig...,"This 1149 titled 'Plane Light System, Plastic ..."
1,"DECOR Kafe Home Decor Sunflower Wall Sticker, ...",Welcome To The Foremost Place On The Web To Fi...,This 6030 titled 'DECOR Kafe Home Decor Sunflo...
2,Vbuyz Women's Rayon Foil Print Stitched Straig...,Vbuyz women's green color rayon straight kurti...,This 2916 titled 'Vbuyz Women's Rayon Foil Pri...
3,Mitsui Shop on Suruga Street in Edo by Katsush...,<p></p><br><p>Lost Cabin Art & Decor wall deco...,This 6548 titled 'Mitsui Shop on Suruga Street...
4,"Brass Glass ( 1 pcs ), 250ml (Glass with Doubl...","Specification:-Set for 1, Material: Brass, Vol...","This 1416 titled 'Brass Glass ( 1 pcs ), 250ml..."
5,"EcoSmart PolyPaper Cutting Board, White, 12"" b...",ECOCELL16 Features: -Cutting board. -Made from...,This 1463 titled 'EcoSmart PolyPaper Cutting B...
6,Genuine Mazda Parts BBM6-56-620 Hood Latch,MAZDA BBM656620 GENUINE OEM LOCK,This 7254 titled 'Genuine Mazda Parts BBM6-56-...
7,Candere by Kalyan Jewellers BIS Hallmark 14kt ...,It is perfect for the working woman who wants ...,This 3368 titled 'Candere by Kalyan Jewellers ...
8,"Mini 360° Panoramic Tripod Ball Head, 1/4"" Scr...",<b>Specification:</b><br> Material: Aluminum A...,This 2046 titled 'Mini 360° Panoramic Tripod B...
9,Habeeb Bags Boy's and Girl's 13 Inches Leather...,Looking for a vintage messenger bag so worry n...,This 3419 titled 'Habeeb Bags Boy's and Girl's...


In [None]:
# Preview the first five products: title, original description and baseline output

preview_fields = [
    ("Product Title:\n", 'title'),
    ("\n- Original Description:\n", 'description'),
    ("\n-- Generated Desc by Baseline:\n", 'generated_baseline'),
]

for row_idx in range(5):
    for heading, column in preview_fields:
        print(f"{heading}{df.loc[row_idx, column]}")
    print('---' * 20)

Product Title:
Plane Light System, Plastic + Metal Taxi Lights Airplane LED Light, for Model Plane

- Original Description:
Features:&nbsp;<br> Full set of bright LED lights for your model plane or glider. <br> Use the power supplied by the receiver and don't require its own battery. <br> The circuitry has been specially designed so that the landing lights and taxi lights can be switched on and off directly. <br> Either use a Y-harness and link it to another channel such as the gear or assign a separate channel to it. <br> It includes red beacon light, white taxi light, white strobe light, white landing light, green navigation light, red navigation light. <br> <br>Specification:&nbsp;<br>Material: Plastic + Metal<br>Color: Shown As Pictures<br>Weight: 111g<br>Type: RC Part &amp; Accessory<br>Control Board Size: Approx. 54 * 35 * 13mm / 2.1 * 1.4 * 0.5inch<br> Cable Size: Approx. OD 0.8mm*0.8m <br> LED Quantity: 14pcs <br>Lighting System Specification:<br>2 x Red Beacon Lights (#11, #12

In [None]:
!pip install textstat
from evaluate import load
import numpy as np
from textstat import flesch_reading_ease

# Load metrics once at module level so that evaluate_model can reuse the same
# metric objects on every call instead of loading them again.
bleu = load("bleu")            # n-gram overlap score, read via the 'bleu' key
rouge = load("rouge")          # provides the 'rouge1' / 'rouge2' scores
bertscore = load("bertscore")  # provides per-example 'f1' scores

def evaluate_model(df, gen_col, ref_col="description", lang="en"):
    """Score the generated texts in ``df[gen_col]`` against ``df[ref_col]``.

    Missing values in either column are replaced with empty strings before
    scoring. The function computes BLEU, ROUGE-1, ROUGE-2, the mean BERTScore
    F1 and the mean Flesch Reading Ease of the generated texts, prints a
    summary and returns the numbers.

    Args:
        df: DataFrame holding both the generated and the reference column.
        gen_col: name of the column with generated descriptions.
        ref_col: name of the column with reference descriptions.
        lang: language code forwarded to the BERTScore metric.

    Returns:
        dict with the keys "BLEU", "ROUGE-1", "ROUGE-2", "BERTScore-F1"
        and "Flesch".
    """
    # Replace missing entries so every metric receives plain strings.
    references = df[ref_col].fillna("").tolist()
    predictions = df[gen_col].fillna("").tolist()

    scores = {}

    # BLEU takes a list of reference lists, hence the one-element wrapping.
    wrapped_references = [[reference] for reference in references]
    scores["BLEU"] = bleu.compute(predictions=predictions, references=wrapped_references)['bleu']

    # Only the unigram and bigram ROUGE variants are kept.
    rouge_output = rouge.compute(predictions=predictions, references=references)
    scores["ROUGE-1"] = rouge_output['rouge1']
    scores["ROUGE-2"] = rouge_output['rouge2']

    # BERTScore returns one F1 per example; report their mean.
    bert_output = bertscore.compute(predictions=predictions, references=references, lang=lang)
    scores["BERTScore-F1"] = np.mean(bert_output['f1'])

    # Readability is measured on the generated texts only, then averaged.
    readability = [flesch_reading_ease(generated) for generated in predictions]
    scores["Flesch"] = np.mean(readability)

    # Human-readable summary.
    print(f"\n🧠 Evaluation for Model: {gen_col}")
    print(f"BLEU Score         : {scores['BLEU']:.4f}")
    print(f"ROUGE-1 Score      : {scores['ROUGE-1']:.4f}")
    print(f"ROUGE-2 Score      : {scores['ROUGE-2']:.4f}")
    print(f"BERTScore-F1       : {scores['BERTScore-F1']:.4f}")
    print(f"Flesch Reading Ease: {scores['Flesch']:.2f}")

    return scores




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
results_baseline = evaluate_model(df, gen_col='generated_baseline')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)



🧠 Evaluation for Model: generated_baseline
BLEU Score         : 0.1059
ROUGE-1 Score      : 0.2883
ROUGE-2 Score      : 0.1237
BERTScore-F1       : 0.8294
Flesch Reading Ease: 48.32


## Generate Descriptions with a Transformer Model

In [None]:
def build_zero_shot_prompt(row):
    """Assemble the zero-shot prompt text for one product row.

    Reads ``title``, ``bullet_points``, ``product_type_id`` and
    ``product_length`` from ``row`` and returns a single string: an
    instruction line, a blank line, one line per metadata field, a blank
    line, and a trailing ``Description:`` cue.
    """
    instruction = "Generate a high-quality Amazon product description using the following metadata:"
    metadata_lines = [
        f"Title: {row['title']}",
        f"Bullet Points: {row['bullet_points']}",
        f"Product Type ID: {row['product_type_id']}",
        f"Product Length: {row['product_length']}",
    ]
    return instruction + "\n\n" + "\n".join(metadata_lines) + "\n\n" + "Description:"


In [None]:
from transformers import pipeline

# Load FLAN-T5 base (lightweight and Colab-friendly)
# The pipeline is built once at module level and reused by
# generate_description_flan for every row.
flan_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    # Upper bound on the number of tokens generated per description.
    max_new_tokens=150,
    device_map="auto"  # Automatically uses GPU if available
)

def generate_description_flan(row):
    """Generate a description for one product row with the FLAN-T5 pipeline.

    Builds a prompt from the row's ``title``, ``bullet_points``,
    ``product_type_id`` and ``product_length`` and passes it to the
    module-level ``flan_generator``.

    Returns:
        The generated text with surrounding whitespace stripped, or ``None``
        if generation raised an exception (the error is printed, not
        re-raised, so that one failing row does not abort the whole run).
    """
    prompt_lines = [
        "Generate an Amazon product description using the following metadata:",
        f"Title: {row['title']}",
        f"Bullet Points: {row['bullet_points']}",
        f"Product Type ID: {row['product_type_id']}",
        f"Product Length: {row['product_length']}",
    ]
    prompt = "\n".join(prompt_lines)

    try:
        outputs = flan_generator(prompt)
        generated_text = outputs[0]['generated_text']
        return generated_text.strip()
    except Exception as err:
        print("FLAN error:", err)
        return None

# Save output in the correct column for evaluation
# generate_description_flan is called once per row (axis=1); rows where
# generation raised an exception receive None in `generated_flan`.
df['generated_flan'] = df.apply(generate_description_flan, axis=1)


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
df[['title', 'description','generated_flan']]

Unnamed: 0,title,description,generated_flan
0,"Plane Light System, Plastic + Metal Taxi Light...",Features:&nbsp;<br> Full set of bright LED lig...,"Plane Light System, Plastic + Metal Taxi Light..."
1,"DECOR Kafe Home Decor Sunflower Wall Sticker, ...",Welcome To The Foremost Place On The Web To Fi...,"DECOR Kafe Home Decor Sunflower Wall Sticker, ..."
2,Vbuyz Women's Rayon Foil Print Stitched Straig...,Vbuyz women's green color rayon straight kurti...,Vbuyz Women's Rayon Foil Print Stitched Straig...
3,Mitsui Shop on Suruga Street in Edo by Katsush...,<p></p><br><p>Lost Cabin Art & Decor wall deco...,Framed Canvas Wall Art Decor | Fine Artwork Pa...
4,"Brass Glass ( 1 pcs ), 250ml (Glass with Doubl...","Specification:-Set for 1, Material: Brass, Vol...","[Specification:-Set fo 1, Material: Brass, Vol..."
5,"EcoSmart PolyPaper Cutting Board, White, 12"" b...",ECOCELL16 Features: -Cutting board. -Made from...,"EcoSmart PolyPaper Cutting Board, White, 12"" b..."
6,Genuine Mazda Parts BBM6-56-620 Hood Latch,MAZDA BBM656620 GENUINE OEM LOCK,Genuine Mazda Parts BBM6-56-620 Hood Latch Bul...
7,Candere by Kalyan Jewellers BIS Hallmark 14kt ...,It is perfect for the working woman who wants ...,14kt Yellow Gold Ring for Women Bullet Points:...
8,"Mini 360° Panoramic Tripod Ball Head, 1/4"" Scr...",<b>Specification:</b><br> Material: Aluminum A...,"Mini 360° Panoramic Tripod Ball Head, 1/4"" Scr..."
9,Habeeb Bags Boy's and Girl's 13 Inches Leather...,Looking for a vintage messenger bag so worry n...,"[Stylish and light weight , durable canvas lin..."


In [None]:
# Preview the first five products: title, original description and FLAN-T5 output

preview_fields = [
    ("Product Title:\n", 'title'),
    ("\n- Original Description:\n", 'description'),
    ("\n-- Generated Desc by FLAN T5:\n", 'generated_flan'),
]

for row_idx in range(5):
    for heading, column in preview_fields:
        print(f"{heading}{df.loc[row_idx, column]}")
    print('---' * 20)

Product Title:
Plane Light System, Plastic + Metal Taxi Lights Airplane LED Light, for Model Plane

- Original Description:
Features:&nbsp;<br> Full set of bright LED lights for your model plane or glider. <br> Use the power supplied by the receiver and don't require its own battery. <br> The circuitry has been specially designed so that the landing lights and taxi lights can be switched on and off directly. <br> Either use a Y-harness and link it to another channel such as the gear or assign a separate channel to it. <br> It includes red beacon light, white taxi light, white strobe light, white landing light, green navigation light, red navigation light. <br> <br>Specification:&nbsp;<br>Material: Plastic + Metal<br>Color: Shown As Pictures<br>Weight: 111g<br>Type: RC Part &amp; Accessory<br>Control Board Size: Approx. 54 * 35 * 13mm / 2.1 * 1.4 * 0.5inch<br> Cable Size: Approx. OD 0.8mm*0.8m <br> LED Quantity: 14pcs <br>Lighting System Specification:<br>2 x Red Beacon Lights (#11, #12

In [None]:
results_flan = evaluate_model(df, gen_col='generated_flan')

  return forward_call(*args, **kwargs)



🧠 Evaluation for Model: generated_flan
BLEU Score         : 0.0632
ROUGE-1 Score      : 0.2481
ROUGE-2 Score      : 0.1019
BERTScore-F1       : 0.8225
Flesch Reading Ease: 33.93
