In [1]:
import os
import pandas as pd
import io
from PIL import Image
import json

# Paths
# Every location hangs off a single checkout root, so relocating the project
# means editing one line instead of three.
project_root = "/home/lmoukheiber3/OpenThinkIMG"
task_dir = os.path.join(project_root, "tool_server", "tf_eval", "tasks", "chartgemma")

parquet_path = os.path.join(
    project_root, "OpenThinkIMG-Chart-Test-994", "data", "train-00000-of-00001.parquet"
)
# The empty final component keeps the trailing separator on the directory path.
image_dir = os.path.join(task_dir, "images", "")
json_out = os.path.join(task_dir, "metadata.json")

# Create images folder if it doesn't exist
os.makedirs(image_dir, exist_ok=True)

# Load the parquet
df = pd.read_parquet(parquet_path)

# Fail fast with a clear message if a column this notebook reads is absent,
# rather than hitting a bare KeyError part-way through the export.
required_columns = ("image", "question", "label")
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"{parquet_path} is missing required column(s): {missing_columns}")

# Prepare JSON list
json_list = []


def _label_to_text(label_value):
    """Return a label as plain text, joining multi-valued labels with ", ".

    List-valued parquet columns typically come back from pandas as NumPy
    arrays rather than Python lists, so a plain ``isinstance(..., list)``
    check would miss them and the array repr would end up in the label.
    ``is_list_like`` accepts lists, tuples and arrays while still treating
    strings and scalars as single values.
    """
    if pd.api.types.is_list_like(label_value):
        return ", ".join(map(str, label_value))
    return str(label_value)


for idx, row in df.iterrows():
    # Name the file after the row's index label so image and record stay paired
    img_filename = f"image_{idx:05d}.png"
    img_path = os.path.join(image_dir, img_filename)

    # Decode the embedded bytes and re-encode as PNG (format chosen by the
    # ".png" extension); the with-block closes the image once it is saved
    with Image.open(io.BytesIO(row["image"]["bytes"])) as image:
        image.save(img_path)

    # Add JSON entry
    json_list.append({
        "image_path": img_path,
        "question": row["question"],
        "label": f"<answer> {_label_to_text(row['label'])} </answer>"
    })

# Write every collected record to disk as one pretty-printed JSON array
with open(json_out, mode="w") as metadata_file:
    json.dump(json_list, metadata_file, indent=4)

# Report where the images and the metadata file ended up
summary = f"✅ Done. Images saved to {image_dir}, JSON saved to {json_out}"
print(summary)


✅ Done. Images saved to /home/lmoukheiber3/OpenThinkIMG/tool_server/tf_eval/tasks/chartgemma/images/, JSON saved to /home/lmoukheiber3/OpenThinkIMG/tool_server/tf_eval/tasks/chartgemma/metadata.json
