In [269]:
import pandas as pd
import json
import time
import random


In [270]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access environment variables
openai_key = os.getenv('OPENAI_API_KEY')


In [271]:
from openai import OpenAI

client = OpenAI()

In [272]:
    index = 0
    # Open the file containing the JSON data to index.
    with open('../data/flipkart_fashion_products_dataset.json', "r") as json_file:
        json_data = json.load(json_file)
        documents = []
        for doc in json_data:
            new_doc ={}
            new_doc['id'] = index
            new_doc["title"] = doc['title']
            new_doc["description"] = doc['description']
            new_doc["category"] = doc['category']
            new_doc["sub_category"] = doc['sub_category']
            new_doc["brand"] = doc['brand']
            new_doc["product_details"] = doc['product_details']
            # new_doc["product_details"] = {}
            # new_doc['product_details'] = doc['product_details']    
            # new_doc['selling_price'] = doc['selling_price']    
            # new_doc['actual_price'] = doc['actual_price'] 
            new_doc['average_rating'] = doc['average_rating'] 
            # new_doc['discount'] = doc['discount'] 
            new_doc['url'] = doc['url'] 
            new_doc['out_of_stock'] = doc['out_of_stock']
            title_and_description = doc['title'] + doc['description']

            documents.append(new_doc)
            index = index + 1

            # if index == 20:
            #     break

In [146]:
product_list = []
documents_filename = '../data/flipkart_fashion_products_dataset.json'
with open(documents_filename, "r") as json_file:
    json_data = json.load(json_file)
    for product in json_data:
        product_list.append(product)

unique_products = {}
for product in product_list:
    product_id = product['pid']
    if product_id not in unique_products:
        unique_products[product_id] = product

In [231]:
def sample_documents(unique_products, sample_size):  
    documents = []
    index = 0
    for key, doc in unique_products.items():
        new_doc ={}
        new_doc["title"] = doc['title']
        new_doc["description"] = doc['description']
        new_doc["category"] = doc['category']
        new_doc["sub_category"] = doc['sub_category']
        new_doc["brand"] = doc['brand']
        new_doc['doc_id'] = index
        new_doc['product_details'] = doc['product_details'][0]    
        new_doc['average_rating'] = doc['average_rating'] 
        new_doc['out_of_stock'] = doc['out_of_stock']
        title_and_description = doc['title'] + doc['description']
        index += 1
        
        documents.append(new_doc)

    # Sample  items from the list
    sampled_products = random.sample(documents, sample_size)
    return sampled_products


sample_products = sample_documents(unique_products, 3)
    

In [232]:
sample_products

[{'title': 'Men Regular Fit Solid Casual Shirt',
  'description': '',
  'category': 'Clothing and Accessories',
  'sub_category': 'Topwear',
  'brand': 'Mett',
  'doc_id': 20668,
  'product_details': {'Pack of': '1'},
  'average_rating': '',
  'out_of_stock': True},
 {'title': 'Pure Silk Solid Gold Men Dupatta',
  'description': '',
  'category': 'Clothing and Accessories',
  'sub_category': 'Clothing Accessories',
  'brand': 'Paridhanlok Onli',
  'doc_id': 19588,
  'product_details': {'Style Code': 'ST010'},
  'average_rating': '4.1',
  'out_of_stock': False},
 {'title': 'Men Ankle Length\xa0\xa0(Pack of 3)',
  'description': 'Welwear multicolor ankle cotton all time  socks',
  'category': 'Clothing and Accessories',
  'sub_category': 'Clothing Accessories',
  'brand': 'Welwe',
  'doc_id': 16502,
  'product_details': {'Color': 'Multicolor'},
  'average_rating': '3',
  'out_of_stock': False}]

In [233]:
keys = ['title', 'description', 'category', 'sub_category', 'brand', 'average_rating', 'url', 'out_of_stock']
for product in sample_products:
    for key in keys:
        if key not in product:
            product[key] = ''

In [234]:
prompt_template = """
You emulate the user of an ecommerce product search assistant.
Formulate 5 questions this user might ask based on the provided product.
Make the questions specific to the details of this product. Each product can be uniquely identified by the combination of title and brand. 
The question should clearly mention the product it is talking about.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

title: {title}
description: {description}
category: {category}
sub_category: {sub_category}
brand: {brand}
average rating: {average_rating}
url: {url}
out_of_stock: {out_of_stock}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2"]}}
""".strip()

In [235]:
prompt = prompt_template.format(**sample_products[0])


In [236]:
print(prompt)


You emulate the user of an ecommerce product search assistant.
Formulate 5 questions this user might ask based on the provided product.
Make the questions specific to the details of this product. Each product can be uniquely identified by the combination of title and brand. 
The question should clearly mention the product it is talking about.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

title: Men Regular Fit Solid Casual Shirt
description: 
category: Clothing and Accessories
sub_category: Topwear
brand: Mett
average rating: 
url: 
out_of_stock: True

Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2"]}


In [237]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [238]:
questions = llm(prompt)


In [239]:
json.loads(questions)


{'questions': ['Is the Men Regular Fit Solid Casual Shirt by Mett currently available for purchase?',
  'What is the fit style of the Men Regular Fit Solid Casual Shirt from Mett?',
  'In which category does the Men Regular Fit Solid Casual Shirt by Mett fall?',
  'Can you tell me more about the color options for the Men Regular Fit Solid Casual Shirt by Mett?',
  'What type of occasions is the Men Regular Fit Solid Casual Shirt by Mett suitable for?']}

In [257]:
def generate_questions(doc):
    print('Generating question')
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [241]:
from tqdm.auto import tqdm


In [267]:

sample_products

[{'title': 'Men Regular Fit Solid Casual Shirt',
  'description': '',
  'category': 'Clothing and Accessories',
  'sub_category': 'Topwear',
  'brand': 'Mett',
  'doc_id': 20668,
  'product_details': {'Pack of': '1'},
  'average_rating': '',
  'out_of_stock': True,
  'url': ''},
 {'title': 'Pure Silk Solid Gold Men Dupatta',
  'description': '',
  'category': 'Clothing and Accessories',
  'sub_category': 'Clothing Accessories',
  'brand': 'Paridhanlok Onli',
  'doc_id': 19588,
  'product_details': {'Style Code': 'ST010'},
  'average_rating': '4.1',
  'out_of_stock': False,
  'url': ''},
 {'title': 'Men Ankle Length\xa0\xa0(Pack of 3)',
  'description': 'Welwear multicolor ankle cotton all time  socks',
  'category': 'Clothing and Accessories',
  'sub_category': 'Clothing Accessories',
  'brand': 'Welwe',
  'doc_id': 16502,
  'product_details': {'Color': 'Multicolor'},
  'average_rating': '3',
  'out_of_stock': False,
  'url': ''}]

In [268]:
def generate_questions(products):
    results = {}
    for doc in tqdm(products): 
        doc_id = doc['doc_id']
        print(f'processing doc_id = {doc_id}')
        # if doc_id in results:
        #     continue
    
        questions_raw = generate_questions(doc)
        # print(f'Generated questions')
        questions = json.loads(questions_raw)
        print(questions)
        print('******************')
        # time.sleep(100)
        # results[doc_id] = {}
        # results[doc_id]['questions'] = questions['questions']
        # results[doc_id]['category'] = doc['category']
        # results[doc_id]['brand'] = doc['brand']
        # print(f'Finished processing doc id = {doc_id}')
    return results

results = generate_questions(sample_products)
        

  0%|          | 0/3 [00:00<?, ?it/s]

processing doc_id = 20668


  0%|          | 0/10 [00:00<?, ?it/s]

TypeError: string indices must be integers

In [260]:
results

{}

In [132]:
final_results = []

for doc in results.items():
    doc_id = doc[0]
    question = doc[1]['questions']
    category = doc[1]['category']
    brand = doc[1]['brand']
    final_results.append([doc_id, question, category, brand])

In [135]:
final_results[19]


[19,
 ['What materials are used in the York Solid Men Multicolor Track Pants for comfort and skin-friendliness?',
  'Is the York Solid Men Multicolor Track Pants available for purchase right now?'],
 'Clothing and Accessories',
 'York']

In [136]:
# Create a DataFrame
df = pd.DataFrame(final_results, columns=['doc_id', 'questions', 'category', 'brand'])

# Use explode() to transform the 'questions' list into separate rows
df_exploded = df.explode('questions').reset_index(drop=True)



In [137]:
df_exploded.shape


(40, 4)

In [138]:
df_exploded

Unnamed: 0,doc_id,questions,category,brand
0,0,What are the material and comfort features of ...,Clothing and Accessories,York
1,0,Is the Solid Men Multicolor Track Pants by Yor...,Clothing and Accessories,York
2,1,What materials are used to make the Solid Men ...,Clothing and Accessories,York
3,1,Is the waistband of the Yorker track pants des...,Clothing and Accessories,York
4,2,What material is used in the York Solid Men Mu...,Clothing and Accessories,York
5,2,Is the waistband of the York trackpants itch-f...,Clothing and Accessories,York
6,3,What materials are used in the Yorker Solid Me...,Clothing and Accessories,York
7,3,Are the Yorker track pants comfortable for all...,Clothing and Accessories,York
8,4,What materials are used in the York Solid Men ...,Clothing and Accessories,York
9,4,Can you tell me about the sizing options avail...,Clothing and Accessories,York


In [139]:
df_exploded.to_csv('../data/ground-truth-retrieval.csv', index=False)


In [140]:
!head ../data/ground-truth-retrieval.csv


doc_id,questions,category,brand
0,What are the material and comfort features of the Solid Men Multicolor Track Pants by York?,Clothing and Accessories,York
0,Is the Solid Men Multicolor Track Pants by York suitable for year-round use?,Clothing and Accessories,York
1,What materials are used to make the Solid Men Blue Track Pants by York?,Clothing and Accessories,York
1,Is the waistband of the Yorker track pants designed to be comfortable for all-day wear?,Clothing and Accessories,York
2,What material is used in the York Solid Men Multicolor Track Pants and how does it enhance comfort?,Clothing and Accessories,York
2,"Is the waistband of the York trackpants itch-free, and are they suitable for all-year-round use?",Clothing and Accessories,York
3,What materials are used in the Yorker Solid Men Multicolor Track Pants?,Clothing and Accessories,York
3,Are the Yorker track pants comfortable for all-year round use?,Clothing and Accessories,York
4,"What materials are used in the York Solid Men 