In [2]:
import csv
import gzip
import json
import os
import shutil
import time

## Extracting the image metadata CSV

In [3]:
# Decompress the gzipped image-metadata CSV next to its archive; the question
# generation step later reads the plain CSV from this location.
images_csv_gz = 'abo-images-small/images/metadata/images.csv.gz'
images_csv = 'abo-images-small/images/metadata/images.csv'

with gzip.open(images_csv_gz, 'rb') as compressed, open(images_csv, 'wb') as plain:
    # Stream the bytes across instead of loading the whole file in memory.
    shutil.copyfileobj(compressed, plain)

print("Extraction complete.")

Extraction complete.


## Extracting the product listing metadata

In [None]:
# This script extracts .json.gz files from the input directory and saves the decompressed files 
# to the output directory without the .gz extension.

import os
import gzip
import shutil

input_dir = 'abo-listings/listings/metadata/'
output_dir = 'abo-listings/listings/extracted_metadata/'

# The destination must exist before any decompressed file is written.
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    # Anything that is not a gzipped JSON listing is ignored.
    if not filename.endswith('.json.gz'):
        continue

    # Dropping the trailing ".gz" gives the name of the decompressed file.
    output_filename = filename[:-len('.gz')]
    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, output_filename)

    # Stream the decompressed bytes straight to disk.
    with gzip.open(input_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    print(f"Extracted {filename} → {output_filename}")

print("All extractions complete.")


Extracted listings_b.json.gz → listings_b.json
Extracted listings_2.json.gz → listings_2.json
Extracted listings_c.json.gz → listings_c.json
Extracted listings_3.json.gz → listings_3.json
Extracted listings_a.json.gz → listings_a.json
Extracted listings_1.json.gz → listings_1.json
Extracted listings_8.json.gz → listings_8.json
Extracted listings_9.json.gz → listings_9.json
Extracted listings_0.json.gz → listings_0.json
Extracted listings_5.json.gz → listings_5.json
Extracted listings_e.json.gz → listings_e.json
Extracted listings_4.json.gz → listings_4.json
Extracted listings_d.json.gz → listings_d.json
Extracted listings_6.json.gz → listings_6.json
Extracted listings_f.json.gz → listings_f.json
Extracted listings_7.json.gz → listings_7.json
All extractions complete.


In [None]:
# Sanity check on every extracted listing file: report whether it holds one
# JSON array or one JSON object per line.

json_dir = 'abo-listings/listings/extracted_metadata/'

for filename in os.listdir(json_dir):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(json_dir, filename)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        if content.startswith('['):
            # A leading '[' means the whole file is parsed as a single document.
            data = json.loads(content)
            print(f"{filename} loaded as JSON array. Length: {len(data)}")
        else:
            # Otherwise every non-blank line is parsed as its own JSON document.
            stripped_lines = (raw_line.strip() for raw_line in content.splitlines())
            items = [json.loads(text) for text in stripped_lines if text]
            print(f"{filename} loaded as {len(items)} separate JSON objects (line by line).")

    except json.JSONDecodeError as e:
        print(f"{filename} failed to load: {e}")
    except Exception as e:
        print(f"{filename} encountered an error: {e}")

print("JSON structure check complete.")

listings_f.json loaded as 9222 separate JSON objects (line by line).
listings_1.json loaded as 9232 separate JSON objects (line by line).
listings_0.json loaded as 9232 separate JSON objects (line by line).
listings_7.json loaded as 9232 separate JSON objects (line by line).
listings_a.json loaded as 9232 separate JSON objects (line by line).
listings_6.json loaded as 9232 separate JSON objects (line by line).
listings_5.json loaded as 9232 separate JSON objects (line by line).
listings_9.json loaded as 9232 separate JSON objects (line by line).
listings_b.json loaded as 9232 separate JSON objects (line by line).
listings_c.json loaded as 9232 separate JSON objects (line by line).
listings_8.json loaded as 9232 separate JSON objects (line by line).
listings_4.json loaded as 9232 separate JSON objects (line by line).
listings_d.json loaded as 9232 separate JSON objects (line by line).
listings_3.json loaded as 9232 separate JSON objects (line by line).
listings_2.json loaded as 9232 sep

In [None]:
# Exploratory cell: look at a single listing file to understand the structure
# of the records stored in it.

file_path = 'abo-listings/listings/extracted_metadata/listings_1.json'

print(f"Inspecting {file_path}")

with open(file_path, 'r', encoding='utf-8') as f:
    raw = f.read()

print("\n--- First 500 characters ---")
print(raw[:500])
print("\n---------------------------")

try:
    data = json.loads(raw)
except json.JSONDecodeError as e:
    # Not one JSON document: fall back to treating each line as a document.
    print(f"Failed to load as single JSON: {e}")

    print("\nTrying to parse line by line...")
    items = []
    for candidate in map(str.strip, raw.splitlines()):
        if not candidate:
            continue
        try:
            items.append(json.loads(candidate))
        except Exception as sub_e:
            # Stop at the first bad line; one example of the failure is enough here.
            print(f"Failed to parse line: {sub_e}")
            break

    print(f"Parsed {len(items)} JSON objects (line by line)")
    if items:
        first_item = items[0]
        print(f"First item type: {type(first_item)}")
        if isinstance(first_item, dict):
            print(f"First item keys: {list(first_item.keys())}")
else:
    # The file parsed as one document: describe its top-level shape.
    print(" Loaded as single JSON object")
    print(f"Type: {type(data)}")
    if isinstance(data, dict):
        print(f"Top-level keys: {list(data.keys())}")
    elif isinstance(data, list):
        print(f"List length: {len(data)}")
        print(f"First item type: {type(data[0])}")
        if isinstance(data[0], dict):
            print(f"First item keys: {list(data[0].keys())}")


Inspecting abo-listings/listings/extracted_metadata/listings_1.json

--- First 500 characters ---
{"brand": [{"language_tag": "en_IN", "value": "Amazon Brand - Solimo"}], "bullet_point": [{"language_tag": "en_IN", "value": "3D Printed Hard Back Case Mobile Cover for Coolpad Cool1 dual"}, {"language_tag": "en_IN", "value": "Easy to put & take off with perfect cutouts for volume buttons, audio & charging ports."}, {"language_tag": "en_IN", "value": "Stylish design and appearance, express your unique personality."}, {"language_tag": "en_IN", "value": "Extreme precision design allows easy access

---------------------------
Failed to load as single JSON: Extra data: line 2 column 1 (char 7886)

Trying to parse line by line...
Parsed 9232 JSON objects (line by line)
First item type: <class 'dict'>
First item keys: ['brand', 'bullet_point', 'color', 'item_id', 'item_name', 'item_weight', 'model_name', 'model_number', 'product_type', 'main_image_id', 'other_image_id', 'item_keywords', 'countr

## Script for Extracting and Filtering Metadata from JSON Files

### This script processes the extracted JSON listing files, keeps only records that contain all required fields and whose country is IN or US, and extracts the English (en_IN/en_US) descriptions used for generating prompts. The filtered data is saved as one CSV file per input JSON file for further use.

In [None]:

## here, we extract the relevant fields from each record from the json to be included in the final csv which is used for generating the prompts

# Where the decompressed listing files are read from, and where the per-file
# filtered CSVs are written.
input_dir = 'abo-listings/listings/extracted_metadata'
output_dir = 'abo-listings/listings/filtered_metadata'
os.makedirs(output_dir, exist_ok=True)

# Column order of every filtered CSV; the question-generation step reads
# these columns by name.
header = [
    'main_image_id',
    'overall_description',
    'colour_description',
    'other_description',
    'material_description',
]

# Function to filter 'value' fields by language_tag (or accept if no language_tag)
def get_filtered_values(entries):
    """Collect the ``value`` of every entry that is English or untagged.

    An entry contributes its ``value`` when that value is truthy and its
    ``language_tag`` is either absent/``None`` or one of ``en_IN`` / ``en_US``.
    The input order is preserved.

    Args:
        entries: iterable of dict-like objects exposing ``.get``.

    Returns:
        list of the accepted values.
    """
    accepted_tags = ('en_IN', 'en_US')
    return [
        entry.get('value')
        for entry in entries
        if entry.get('value')
        and (entry.get('language_tag') is None or entry.get('language_tag') in accepted_tags)
    ]

# Function to filter 'standardized_values' by language_tag (or accept if no language_tag)
def get_filtered_standardized_values(color_entries):
    """Gather ``standardized_values`` from the English or untagged entries.

    For every entry whose ``language_tag`` is absent/``None`` or one of
    ``en_IN`` / ``en_US``, the items of its ``standardized_values`` (if the
    key is present) are appended to the result, in input order.

    Args:
        color_entries: iterable of dict-like objects exposing ``.get``.

    Returns:
        flat list of all collected standardized values.
    """
    collected = []
    for entry in color_entries:
        tag = entry.get('language_tag')
        if tag is not None and tag not in ('en_IN', 'en_US'):
            continue  # tagged with a language we do not keep
        collected += entry.get('standardized_values', [])
    return collected

# Process each JSON file in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.json'):
        continue

    input_file = os.path.join(input_dir, filename)
    # Swap only the final extension; str.replace('.json', '.csv') would also
    # rewrite a '.json' occurring earlier in the file name.
    output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + '.csv')

    print(f"Processing {input_file} → {output_file}")

    # Load line-delimited JSON (one record per line).
    records = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not records; skip them rather than
                # reporting them as decode failures.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Failed to load a line in {filename}: {e}")

    # A record is kept only if every one of these keys is present ...
    required_keys = ['brand', 'bullet_point', 'color', 'model_name', 'item_name',
                     'product_type', 'main_image_id', 'item_keywords', 'country']

    # ... and it is a JSON object whose country is IN or US.
    filtered_records = [
        record for record in records
        if isinstance(record, dict)
        and all(key in record for key in required_keys)
        and record.get('country') in ['IN', 'US']
        # and 'item_dimensions' not in record
    ]

    print(f" → Total matching records: {len(filtered_records)}")

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for record in filtered_records:
            overall_description = get_filtered_values(record.get('bullet_point', []))

            # Standardized colour names first, then the free-text colour values.
            colour_description = []
            colour_description.extend(get_filtered_standardized_values(record.get('color', [])))
            colour_description.extend(get_filtered_values(record.get('color', [])))

            other_description = []
            for field in ['product_type', 'item_keywords']:
                other_description.extend(get_filtered_values(record.get(field, [])))

            # 'material' is optional; a missing key simply yields an empty column.
            material_description = get_filtered_values(record.get('material', []))

            writer.writerow([
                record.get('main_image_id'),
                '; '.join(overall_description),
                '; '.join(colour_description),
                '; '.join(other_description),
                '; '.join(material_description),
            ])

    print(f" → Saved {len(filtered_records)} records to {output_file}\n")


Processing abo-listings/listings/extracted_metadata/listings_f.json → abo-listings/listings/filtered_metadata/listings_f.csv
 → Total matching records: 4255
 → Saved 4255 records to abo-listings/listings/filtered_metadata/listings_f.csv

Processing abo-listings/listings/extracted_metadata/listings_1.json → abo-listings/listings/filtered_metadata/listings_1.csv
 → Total matching records: 4208
 → Saved 4208 records to abo-listings/listings/filtered_metadata/listings_1.csv

Processing abo-listings/listings/extracted_metadata/listings_0.json → abo-listings/listings/filtered_metadata/listings_0.csv
 → Total matching records: 4258
 → Saved 4258 records to abo-listings/listings/filtered_metadata/listings_0.csv

Processing abo-listings/listings/extracted_metadata/listings_7.json → abo-listings/listings/filtered_metadata/listings_7.csv
 → Total matching records: 4166
 → Saved 4166 records to abo-listings/listings/filtered_metadata/listings_7.csv

Processing abo-listings/listings/extracted_metad

In [None]:
# !pip install --upgrade pip
# !pip install google-genai



In [None]:
# from google import genai
# from google.genai import Client, types
# import time

# print(dir(genai))
# print(dir(Client))
# print(dir(types))

['Client', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_api_client', '_api_module', '_base_url', '_common', '_extra_utils', '_live_converters', '_replay_api_client', '_transformers', 'batches', 'caches', 'chats', 'client', 'errors', 'files', 'live', 'models', 'operations', 'pagers', 'tunings', 'types', 'version']
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_get_api_client', 'aio', 'batches', 'caches', 'chats', 'files', 'models', 'operations', 'tunings', 'vertexai']
['ActivityEnd', 'ActivityEndDict', 'ActivityEndOrDict', 'ActivityHandling', 'ActivityStart', 'ActivityStartDict', 'ActivityStar

### Script for Generating Image-Based Questions using Gemini API





#### This script processes listings from a CSV file, retrieves associated images, and queries the Gemini API to generate image-based questions. The generated questions and answers are then saved into a CSV file, with progress tracked to resume from the last processed record if interrupted. Rate limits and daily request constraints are respected to avoid hitting API limits.


In [None]:

# `genai` and `types` are used by the generation code below; importing them
# here keeps this cell runnable on a fresh kernel.
from google import genai
from google.genai import types

# Read the key from the GEMINI_API_KEY environment variable so it is not
# committed with the notebook; the placeholder is only a fallback.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "put_the_api_key"))

# Set daily limits
MAX_DAILY_REQUESTS = 1500      # requests allowed per run before stopping
DELAY_BETWEEN_REQUESTS = 60    # seconds slept between consecutive requests

# Running count of requests sent by process_records() in this kernel session.
requests_made = 0

def load_progress(progress_file):
    """Return the index stored in ``progress_file``.

    Args:
        progress_file: path of the text file written by ``save_progress``.

    Returns:
        The stored integer, or 0 when the file does not exist or is empty
        (for example after an interrupted write).

    Raises:
        ValueError: if the file contains something that is not an integer.
    """
    if not os.path.exists(progress_file):
        return 0

    with open(progress_file, 'r') as f:
        saved = f.read().strip()

    if not saved:
        # An empty file carries no information; treat it like a missing one
        # instead of failing on int('').
        print(f"Progress file {progress_file} is empty; starting from 0.")
        return 0

    try:
        return int(saved)
    except ValueError:
        raise ValueError(
            f"Progress file {progress_file!r} does not contain an integer index: {saved!r}"
        ) from None

def save_progress(progress_file, index):
    """Persist ``index`` as text in ``progress_file``.

    The value is written to a sibling temporary file first and then moved
    into place with ``os.replace``, so an interruption during the write
    cannot leave ``progress_file`` truncated or half-written.

    Args:
        progress_file: destination path of the progress file.
        index: value to store; it is written as ``str(index)``.
    """
    tmp_path = f"{progress_file}.tmp"
    with open(tmp_path, 'w') as f:
        f.write(str(index))
    os.replace(tmp_path, progress_file)

def query_gemini_api(image_bytes, combined_description, model='gemini-2.0-flash', mime_type='image/jpeg'):
    """Ask the Gemini model for five question/answer pairs about one image.

    Args:
        image_bytes: raw bytes of the image file.
        combined_description: product description text embedded in the prompt
            as context.
        model: model name passed to ``generate_content``; defaults to the
            model this notebook was written against.
        mime_type: MIME type declared for ``image_bytes``; defaults to JPEG.

    Returns:
        ``response.text`` from the API call, or ``None`` if the call raised
        any exception (the error is printed, not re-raised).
    """
    # this prompt was used for generating the 5 question-single answer pairs for each image
    prompt_text = (
        "You are given an image and a brief product description.\n"
        f"Use the product description context: {combined_description}\n"
        "Generate exactly 5 diverse, visually clear, and progressively challenging questions.\n"
        "Each question must be answerable by only looking at the image — do NOT rely on external or assumed knowledge.\n"
        "Ensure variation in the *type* of visual cues used: color, shape, count, spatial relationship, relative size, and visible text (if any).\n"
        "Ensure variation in *difficulty level*:\n"
        "- At least 2 simple questions (e.g., color, count)\n"
        "- At least 2 moderately difficult questions (e.g., spatial relations, comparisons)\n"
        "- 1 challenging question requiring closer inspection or subtle visual reasoning (e.g., most prominent item, inferred use from shape)\n"
        "Do NOT ask about materials or properties that are not visually obvious (e.g., plastic, flexible, metal).\n"
        "Answers must be a *single word* — not 'yes' or 'no' unless absolutely necessary.\n"
        "Strictly use this format without extra text:\n"
        "Question 1: <question>\n"
        "Answer 1: <answer>\n"
        "Question 2: <question>\n"
        "Answer 2: <answer>\n"
        "Question 3: <question>\n"
        "Answer 3: <answer>\n"
        "Question 4: <question>\n"
        "Answer 4: <answer>\n"
        "Question 5: <question>\n"
        "Answer 5: <answer>"
    )

    try:
        # The image part goes first, followed by the text instructions.
        response = client.models.generate_content(
            model=model,
            contents=[
                types.Part.from_bytes(
                    data=image_bytes,
                    mime_type=mime_type
                ),
                prompt_text
            ]
        )
        return response.text
    except Exception as e:
        # Best effort: the caller treats None as "no questions for this image".
        print(f"Error querying Gemini API: {e}")
        return None

def _parse_qa_pairs(generated_text, expected_pairs=5):
    """Split a model reply into ``(question, answer)`` tuples.

    Returns ``None`` unless the reply has exactly ``expected_pairs`` lines
    starting with "question" and as many starting with "answer"
    (case-insensitive), each containing a ':' separator.
    """
    lines = [ln.strip() for ln in generated_text.strip().split('\n') if ln.strip()]
    question_lines = [ln for ln in lines if ln.lower().startswith('question')]
    answer_lines = [ln for ln in lines if ln.lower().startswith('answer')]

    if len(question_lines) != expected_pairs or len(answer_lines) != expected_pairs:
        return None

    pairs = []
    for q_line, a_line in zip(question_lines, answer_lines):
        _, q_sep, question = q_line.partition(':')
        _, a_sep, answer = a_line.partition(':')
        if not q_sep or not a_sep:
            # A line without ':' has no payload to extract; reject the reply
            # instead of failing with an IndexError.
            return None
        pairs.append((question.strip(), answer.strip()))
    return pairs


def process_records(listings_csv_path, images_csv_path, images_base_path, output_file, progress_file):
    """Generate questions for each listing row and append them to a CSV.

    Rows of ``listings_csv_path`` before the index stored in
    ``progress_file`` are skipped. For every remaining row whose image can be
    located and read, the image and its description are sent to
    ``query_gemini_api``; a well-formed reply is written to ``output_file``
    as one row per question. Progress is saved after every request, and the
    loop stops once ``MAX_DAILY_REQUESTS`` requests have been made.

    Args:
        listings_csv_path: filtered listings CSV (columns ``main_image_id``,
            ``overall_description``, ``colour_description``,
            ``material_description``, ``other_description``).
        images_csv_path: image metadata CSV with ``image_id`` and ``path`` columns.
        images_base_path: directory that the ``path`` values are relative to.
        output_file: CSV the questions are appended to (header written when empty).
        progress_file: file holding the index of the next row to process.

    Side effects:
        Increments the module-level ``requests_made`` counter and sleeps
        ``DELAY_BETWEEN_REQUESTS`` seconds between requests.
    """
    global requests_made

    # image_id -> relative image path
    image_path_map = {}
    with open(images_csv_path, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            image_path_map[row['image_id']] = row['path']

    print("Loaded image metadata successfully.")

    # os.path.dirname() is '' for a bare file name, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load last processed index
    start_index = load_progress(progress_file)

    with open(output_file, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        if os.stat(output_file).st_size == 0:
            writer.writerow(['image_id', 'full_image_path', 'question', 'answer'])

        with open(listings_csv_path, 'r', encoding='utf-8') as f_in:
            for current_index, row in enumerate(csv.DictReader(f_in)):
                if current_index < start_index:
                    continue  # skip already processed

                if requests_made >= MAX_DAILY_REQUESTS:
                    print("Reached daily request limit. Stopping.")
                    break

                image_id = row['main_image_id']
                image_filename = image_path_map.get(image_id)

                if not image_filename:
                    print(f"Image path not found for image_id: {image_id}")
                    continue

                full_image_path = os.path.join(images_base_path, image_filename)

                if not os.path.exists(full_image_path):
                    print(f"Image file does not exist: {full_image_path}")
                    continue

                try:
                    with open(full_image_path, "rb") as img_file:
                        image_bytes = img_file.read()
                except Exception as e:
                    print(f"Failed to read image {full_image_path}: {e}")
                    continue

                # Every field is terminated by "; " so that adjacent fields
                # cannot run together in the prompt.
                combined_description = (
                    f"Overall: {row['overall_description']}; "
                    f"Color: {row['colour_description']}; "
                    f"Material: {row['material_description']}; "
                    f"Other: {row['other_description']}; "
                )

                print(f"Sending request for image_id: {image_id}")

                generated_text = query_gemini_api(image_bytes, combined_description)

                if generated_text:
                    qa_pairs = _parse_qa_pairs(generated_text)
                    if qa_pairs is not None:
                        for question, answer in qa_pairs:
                            writer.writerow([image_id, full_image_path, question, answer])
                        # Push the rows to disk before progress is recorded.
                        f_out.flush()
                        print(f"Processed image_id: {image_id}")
                    else:
                        print(f"Unexpected format or count in response for image_id: {image_id}")
                else:
                    print(f"Failed to generate questions for image_id: {image_id}")

                requests_made += 1
                # Store the index of the NEXT row so a restart resumes after this one.
                save_progress(progress_file, current_index + 1)

                if requests_made < MAX_DAILY_REQUESTS:
                    print(f"Sleeping {DELAY_BETWEEN_REQUESTS} seconds to respect rate limits...")
                    time.sleep(DELAY_BETWEEN_REQUESTS)

import os

# CONFIGURATION
# Which filtered listing file to work through, and a label for this batch of
# generated questions (the label becomes part of the output file name).
current_working_filename = 'listings_a'
question_set_number = 'set_2'

# Inputs
listings_csv_path = f'abo-listings/listings/filtered_metadata/{current_working_filename}.csv'
images_csv_path = 'abo-images-small/images/metadata/images.csv'
images_base_path = 'abo-images-small/images/small'

# Output and progress locations
output_dir = 'generated_questions'
progress_dir = 'progress'
for directory in (output_dir, progress_dir):
    os.makedirs(directory, exist_ok=True)

output_file = os.path.join(output_dir, f'questions_{current_working_filename}_{question_set_number}.csv')
# NOTE(review): the progress file is keyed by listing file only, not by
# question set — confirm that sharing it across sets is intended.
progress_file = os.path.join(progress_dir, f'progress_{current_working_filename}.txt')

process_records(
    listings_csv_path,
    images_csv_path,
    images_base_path,
    output_file,
    progress_file,
)

Loaded image metadata successfully.
Sending request for image_id: 613sLyZDo8L
Processed image_id: 613sLyZDo8L
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 612Ze0tLBSL
Processed image_id: 612Ze0tLBSL
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 71Tj6P5afdL
Processed image_id: 71Tj6P5afdL
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 61cI1UAN+8L
Processed image_id: 61cI1UAN+8L
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 715OrnyoT0L
Processed image_id: 715OrnyoT0L
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 711cFmiF4QL
Processed image_id: 711cFmiF4QL
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 715r81595cL
Processed image_id: 715r81595cL
Sleeping 60 seconds to respect rate limits...
Sending request for image_id: 71v+zZWohsL
Processed image_id: 71v+zZWohsL
Sleeping 60 seconds to respect rate limits...
Send

KeyboardInterrupt: 