In [2]:
# Mount Google Drive into the Colab filesystem so files under it can be read from disk.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


Step 5.1: Load Data

In [3]:
# STEP 0 — Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the sales CSV on the mounted Drive.
# NOTE: the folder name ends with a space ("... project /"); keep the path verbatim.
SALES_CSV_PATH = "/content/gdrive/MyDrive/Machine Learning Dataset and project /ecommerce_products_sales.csv"
df = pd.read_csv(filepath_or_buffer=SALES_CSV_PATH)
df.head()


Unnamed: 0,product_id,title,description,category,price,quantity,order_date,region
0,P0000000,Within finish Republican.,This sports product is made of Cotton and offe...,Sports,246.57,2,2024-02-07,Bahia
1,P0000001,Remember leave family bed doctor agreement.,This electronics product is made of Aluminum a...,Electronics,268.05,3,2023-10-11,São Paulo
2,P0000002,Think article well behavior natural.,This electronics product is made of PU Leather...,Electronics,289.75,2,2021-01-29,São Paulo
3,P0000003,Worker writer person various question election...,This phone accessories product is made of Stee...,Phone Accessories,343.24,5,2023-05-06,Minas Gerais
4,P0000004,Stock learn lawyer quite next.,This books product is made of Cotton and offer...,Books,13.08,5,2024-07-16,Rio Grande do Sul


Step 5.2 — Identify LLM-Eligible Columns

In [4]:
# Free-text columns that are later fed into the prompt.
text_columns = ["title", "description"]
# Fraction of missing values in each text column.
missing_share = df[text_columns].isna().mean()
missing_share


Unnamed: 0,0
title,0.0
description,0.0


Step 5.3 — Define LLM Prompt (CRITICAL)

This is what separates professionals from amateurs.
  Prompt Design (Reusable & Controlled)

In [5]:
# System-level instruction passed as `system_prompt` on every LLM call.
SYSTEM_PROMPT = (
    "\n"
    "You are a data assistant for an e-commerce analytics platform.\n"
    "Your task is to extract structured attributes from product text.\n"
    "Return concise, consistent outputs.\n"
)


In [6]:
# Per-product prompt. `{title}` and `{description}` are filled in with
# str.format, so any literal brace added to this text must be doubled.
# The expected keys are spelled out so that every reply can be parsed into the
# same columns (normalized_category, product_tags, product_summary,
# is_premium_product).
USER_PROMPT_TEMPLATE = """
Product Title: {title}
Product Description: {description}

Tasks:
1. Assign a clean normalized category (max 2 words)
2. Generate 3 short product tags
3. Write a 1-line product summary
4. Indicate if product is premium (yes/no)

Return JSON only: a single object with exactly these keys:
- "normalized_category": string
- "product_tags": list of 3 strings
- "product_summary": string
- "is_premium_product": "yes" or "no"
"""


In [17]:
class MockLLM:
    """Stand-in LLM client that always answers with the same JSON text."""

    def generate(self, system_prompt, user_prompt):
        """Return a fixed JSON-formatted string; both prompts are ignored."""
        # Hard-coded reply, so results are deterministic and easy to explain.
        canned_reply = """
        {
          "normalized_category": "electronics",
          "product_tags": ["portable", "durable", "daily-use"],
          "product_summary": "Reliable electronic product suitable for everyday use.",
          "is_premium_product": "no"
        }
        """
        return canned_reply


In [21]:
# Shared client instance; extract_genai_features calls llm.generate on it.
llm = MockLLM()


🔹 Step 5.4 — LLM Call Function (Pseudo-Production)

In [22]:
def extract_genai_features(title, description):
    """Build the product prompt and return the LLM's raw reply.

    Args:
        title: Value substituted for ``{title}`` in USER_PROMPT_TEMPLATE.
        description: Value substituted for ``{description}`` in the template.

    Returns:
        Whatever ``llm.generate`` returns, unparsed.
    """
    filled_prompt = USER_PROMPT_TEMPLATE.format(title=title, description=description)
    # The call goes through the module-level `llm` object, so any client that
    # exposes generate(system_prompt=..., user_prompt=...) can be swapped in.
    return llm.generate(system_prompt=SYSTEM_PROMPT, user_prompt=filled_prompt)


Step 5.5 — Apply on Sample

In [23]:
# Draw 20 rows with a fixed seed so the sample is reproducible.
sample_df = df.sample(n=20, random_state=42)


In [24]:
def _features_for_row(row):
    """Call extract_genai_features with one row's title and description."""
    return extract_genai_features(row["title"], row["description"])


# One LLM reply per sampled row, computed row by row (axis=1).
genai_outputs = sample_df.apply(_features_for_row, axis=1)


In [27]:
# `sample_df` and `genai_outputs` are already built by the two preceding cells
# (same 20-row sample, random_state=42), so they are not recomputed here;
# repeating the apply would issue every LLM call a second time.
genai_outputs.head()


Unnamed: 0,0
59770,"\n {\n ""normalized_category"": ..."
21362,"\n {\n ""normalized_category"": ..."
127324,"\n {\n ""normalized_category"": ..."
140509,"\n {\n ""normalized_category"": ..."
144297,"\n {\n ""normalized_category"": ..."


Step 5.6 — Parse Output into Columns

In [28]:
# Illustrative example of the reply structure; this dict is only displayed as
# the cell's output and is not assigned to any variable.
{
  "normalized_category": "electronics",
  "product_tags": ["wireless", "bluetooth", "portable"],
  "product_summary": "Portable wireless speaker with deep bass.",
  "is_premium_product": "yes"
}


{'normalized_category': 'electronics',
 'product_tags': ['wireless', 'bluetooth', 'portable'],
 'product_summary': 'Portable wireless speaker with deep bass.',
 'is_premium_product': 'yes'}

In [29]:
import json

# Each reply is a JSON string; decode it into a dict.
parsed = genai_outputs.apply(json.loads)

# Hand json_normalize a plain list of dicts (its documented input) instead of
# the Series. `parsed` carries the sampled row labels as its index, and how a
# Series index is treated may differ between pandas versions; a list always
# produces a 0..n-1 RangeIndex, which is what the positional join in the merge
# step (after sample_df.reset_index(drop=True)) relies on.
genai_df = pd.json_normalize(parsed.tolist())
genai_df.head()


Unnamed: 0,normalized_category,product_tags,product_summary,is_premium_product
0,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
1,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
2,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
3,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
4,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no


Step 5.7 — Merge Back to Dataset

In [30]:
# Discard the sampled row labels so the rows are numbered 0..n-1, then attach
# the generated columns by index.
df_enriched = (
    sample_df
    .reset_index(drop=True)
    .join(genai_df, how="left")
)
df_enriched.head()


Unnamed: 0,product_id,title,description,category,price,quantity,order_date,region,normalized_category,product_tags,product_summary,is_premium_product
0,P0059770,Water through man whatever glass stop actually...,This books product is made of Cotton and offer...,Books,300.54,4,2024-07-16,Rio de Janeiro,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
1,P0021362,Action model other hour little value.,This books product is made of PU Leather and o...,Books,446.46,3,2022-01-07,São Paulo,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
2,P0127324,Through Democrat know enjoy create outside wit...,This books product is made of Aluminum and off...,Books,453.12,1,2022-03-07,São Paulo,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
3,P0140509,Firm body make especially if tree.,This electronics product is made of Aluminum a...,Electronics,74.77,5,2021-09-09,Bahia,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no
4,P0144297,Often decide front high Democrat.,This electronics product is made of PU Leather...,Electronics,120.15,1,2021-08-01,Rio de Janeiro,electronics,"[portable, durable, daily-use]",Reliable electronic product suitable for every...,no


In [None]:
# Print each column name wrapped in single quotes, one per line.
for column_name in df.columns:
    print("'{}'".format(column_name))


'product_id'
'title'
'description'
'category'
'price'
'quantity'
'order_date'
'region'
