In [1]:
from datasets import load_dataset
import os
import json
import requests
from PIL import Image
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from huggingface_hub import HfApi, login

# Read the access token from the environment rather than hard-coding it in the
# notebook, so the secret never ends up in saved outputs or version control.
# When HF_TOKEN is unset this is None, and login() then asks for the token
# interactively instead of failing.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(HF_TOKEN)

In [3]:
import zipfile
import os

def zip_img_folder(
    folder_to_zip: str = "data/spatialthinker_vqa_10k_data",
    zip_filename: str = "data/spatialthinker_vqa_10k_data.zip",
):
    """Compress every file found under ``folder_to_zip`` into ``zip_filename``.

    Files are stored under their path relative to ``folder_to_zip``, so the
    folder name itself is not part of the archive member names. The archive is
    opened in ``"w"`` mode, which replaces any existing file at
    ``zip_filename``.

    Args:
        folder_to_zip: Directory that is walked recursively.
        zip_filename: Path of the zip archive to create.

    Returns:
        None. A confirmation line is printed once the archive is closed.
    """
    # Resolve the archive location once so it can be recognised during the walk.
    archive_path = os.path.normcase(os.path.realpath(zip_filename))

    with zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_to_zip):
            for file in files:
                full_path = os.path.join(root, file)
                # The archive already exists on disk by the time the walk runs;
                # if it lives inside the folder being zipped it must not be
                # added to itself.
                if os.path.normcase(os.path.realpath(full_path)) == archive_path:
                    continue
                arcname = os.path.relpath(full_path, folder_to_zip)
                zipf.write(full_path, arcname)

    print(f"✅ Zipped {folder_to_zip} → {zip_filename}")

In [4]:
# Local folder that receives the exported JPEG images; create it up front
# (a no-op when the directory is already there).
image_dir = "data/spatialthinker_vqa_10k_data"
os.makedirs(name=image_dir, exist_ok=True)

In [5]:
# prepare json in ShareGPT format

def generate_sharegpt(
    dataset,
    image_dir = "data/spatialthinker_vqa_10k_data",
    json_dir = "data/spatialthinker_vqa_10k.json",
    save_images = True,
    problem_key = "question_with_options",
    answer_key = "answer_option_text_only",
    image_id_key = "image_id",
    image_key = "images",
    target_repo = "hunarbatra/spatialthinker_vqa_10k_filtered_sharegpt",
    upload = False,
):
    """Export ``dataset`` as a ShareGPT-style JSON file plus a folder of JPEGs.

    Each example becomes one entry with a user message (``"<image>"`` followed
    by the question text), an assistant message (the answer text) and a
    one-element ``"images"`` list. The image path stored in the JSON is
    relative: ``<last component of image_dir>/<image id>.jpg``.

    Args:
        dataset: Iterable supporting ``len()`` whose examples can be indexed
            with the key arguments below.
        image_dir: Local directory the JPEGs are written to. Created if
            missing when ``save_images`` is true.
        json_dir: Path of the JSON *file* to write (despite the name). Its
            parent directory is created if missing.
        save_images: When true, convert each example's image to RGB and save
            it as ``<image_dir>/<image id>.jpg``.
        problem_key: Example key holding the question text.
        answer_key: Example key holding the answer text.
        image_id_key: Example key holding the id used as the image file stem.
        image_key: Example key holding the image object. It must provide
            ``.convert("RGB")`` and ``.save(path)`` -- presumably a PIL image;
            confirm against the dataset schema.
        target_repo: Hub dataset repo id used when ``upload`` is true.
        upload: When true, create ``target_repo`` as a private dataset repo if
            needed, upload the JSON file, zip ``image_dir`` and upload the zip.

    Returns:
        None.
    """
    # Drop a trailing slash so the last path component is never empty and the
    # zip is created next to the image folder rather than inside it.
    image_root = image_dir.rstrip("/") or image_dir
    img_dir_tail = os.path.basename(image_root)

    # Do not rely on another notebook cell having created the output folders.
    if save_images:
        os.makedirs(image_root, exist_ok=True)
    os.makedirs(os.path.dirname(json_dir) or ".", exist_ok=True)

    sharegpt_data = []

    for example in tqdm(dataset, total=len(dataset)):
        curr_image_id = example[image_id_key]
        local_image_path = f"{image_root}/{curr_image_id}.jpg"
        data_image_path = f"{img_dir_tail}/{curr_image_id}.jpg"

        # convert image to RGB before saving as JPEG
        if save_images:
            image = example[image_key].convert("RGB")
            image.save(local_image_path)

        # add to ShareGPT format data
        entry = {
            "messages": [
                {
                    "content": "<image>" + example[problem_key],
                    "role": "user"
                },
                {
                    "content": example[answer_key],
                    "role": "assistant"
                }
            ],
            "images": [
                data_image_path
            ]
        }
        sharegpt_data.append(entry)

    with open(json_dir, "w") as f:
        json.dump(sharegpt_data, f, indent=2)

    if upload:
        # One client is enough for all Hub calls below.
        api = HfApi()

        # create empty huggingface repo for target_repo if it does not exist
        api.create_repo(
            repo_id=target_repo,
            repo_type="dataset",
            private=True,
            exist_ok=True
        )

        api.upload_file(
            path_or_fileobj=json_dir,                    # local file path
            path_in_repo=os.path.basename(json_dir),     # file name at the repo root
            repo_id=target_repo,
            repo_type="dataset"
        )
        print(f'Uploaded {json_dir} to {target_repo}')

        zip_path = f"{image_root}.zip"
        zip_img_folder(
            folder_to_zip=image_root,
            zip_filename=zip_path
        )

        api.upload_file(
            path_or_fileobj=zip_path,
            path_in_repo=f"{img_dir_tail}.zip",
            repo_id=target_repo,
            repo_type="dataset"
        )
        print(f'Uploaded {zip_path} to {target_repo}')
        
        

In [6]:
# Fetch the training split of the filtered dataset from the Hub.
dataset = load_dataset(
    "hunarbatra/spatialthinker_vqa_10k_filtered",
    split="train",
)

# Write the ShareGPT JSON and the JPEG folder locally, then publish both.
generate_sharegpt(
    dataset,
    # which example fields to read
    image_key="images",
    image_id_key="image_id",
    problem_key="question_with_options",
    answer_key="answer_option_text_only",
    # local outputs
    json_dir="data/spatialthinker_vqa_10k.json",
    image_dir="data/spatialthinker_vqa_10k_data",
    save_images=True,
    # Hub destination
    target_repo="hunarbatra/spatialthinker_vqa_10k_filtered_sharegpt",
    upload=True,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 6895/6895 [00:00<00:00, 11551.73 examples/s]
Generating val split: 100%|██████████| 692/692 [00:00<00:00, 12986.04 examples/s]
100%|██████████| 6895/6895 [00:18<00:00, 382.04it/s]


Uploaded data/spatialthinker_vqa_10k.json to hunarbatra/spatialthinker_vqa_10k_filtered_sharegpt
✅ Zipped data/spatialthinker_vqa_10k_data → data/spatialthinker_vqa_10k_data.zip


spatialthinker_vqa_10k_data.zip: 100%|██████████| 411M/411M [00:13<00:00, 31.5MB/s] 


Uploaded data/spatialthinker_vqa_10k_data.zip to hunarbatra/spatialthinker_vqa_10k_filtered_sharegpt


In [None]:
# dataset = load_dataset("hunarbatra/spatialthinker_vqa_10k", split="train")

# generate_sharegpt(
#     dataset,
#     image_dir="data/spatialthinker_vqa_10k_data",
#     json_dir="data/spatialthinker_vqa_10k.json",
#     save_images=True,
#     problem_key="question_with_options",
#     answer_key="answer_option_text_only",
#     image_id_key="image_id",
#     image_key="images",
#     target_repo="hunarbatra/spatialthinker_vqa_10k_sharegpt"
# )