# 🖼️ LLaVA Multimodal Instruction Tuning Data Construction Pipeline
This Notebook integrates the complete workflow from **COCO Data Alignment** to **Automated Annotation using Multimodal Large Language Models (Qwen-VL)**. It is suitable for building training datasets for models like LLaVA and Qwen-VL.

### 1. Environment Preparation
Install necessary dependencies.

In [None]:
pip install opencv-python tqdm openai numpy

### 2. Phase 1: Precision Coordinate Alignment (COCO Format to LLaVA)
Convert standard COCO detection boxes `[x, y, w, h]` into LLaVA-formatted normalized coordinates `[ymin, xmin, ymax, xmax]` (0-1000).

In [None]:
import os, json, glob, cv2
from tqdm.notebook import tqdm

# Directory that is scanned for "*.jpg" images during alignment.
IMAGE_DIR = "../data/images/"
# Annotation JSON read by load_coco(); it must contain "categories" and "annotations".
ANNOTATION_FILE = "../data/annotations/instances_val2017.json"
# Destination JSON for the aligned conversation records; the debug visualization reads it back.
OUTPUT_ALIGN_FILE = "../data/llava_instruct_aligned.json"

def convert_bbox(bbox, width, height):
    """Map a pixel-space ``[x, y, w, h]`` box to ``[ymin, xmin, ymax, xmax]`` on a 0-1000 grid.

    Each corner coordinate is divided by the matching image dimension,
    multiplied by 1000, truncated with ``int`` and clamped to ``[0, 1000]``.

    Args:
        bbox: Four numbers ``(x, y, w, h)``: minimum x, minimum y, box width
            and box height.
        width: Image width used to normalize horizontal coordinates.
        height: Image height used to normalize vertical coordinates.

    Returns:
        A list ``[ymin, xmin, ymax, xmax]`` of ints in ``0..1000``.
    """
    def _to_grid(value, extent):
        # Truncate toward zero first, then clamp into the 0-1000 range.
        scaled = int(value / extent * 1000)
        return max(0, min(1000, scaled))

    left, top, box_w, box_h = bbox
    return [
        _to_grid(top, height),
        _to_grid(left, width),
        _to_grid(top + box_h, height),
        _to_grid(left + box_w, width),
    ]

def load_coco():
    """Load ``ANNOTATION_FILE`` and group its annotations by image id.

    Returns:
        dict mapping each ``image_id`` to a list of
        ``{"bbox": ..., "label": ...}`` dicts, in the order the annotations
        appear in the file. ``label`` is the category name, or ``"object"``
        when the annotation's category id is not listed under ``categories``.
    """
    with open(ANNOTATION_FILE, 'r') as fh:
        coco = json.load(fh)

    # Category id -> human-readable category name.
    names_by_cat = {}
    for cat in coco['categories']:
        names_by_cat[cat['id']] = cat['name']

    img_to_anns = {}
    for ann in coco['annotations']:
        bucket = img_to_anns.setdefault(ann['image_id'], [])
        bucket.append({
            "bbox": ann['bbox'],
            "label": names_by_cat.get(ann['category_id'], "object"),
        })
    return img_to_anns

# Build the grounding dataset: one "Where is the X?" conversation per annotated
# object, for at most three objects per image found in IMAGE_DIR.
if os.path.exists(ANNOTATION_FILE):
    img_to_anns = load_coco()
    dataset = []
    image_paths = glob.glob(os.path.join(IMAGE_DIR, "*.jpg"))

    for img_path in tqdm(image_paths, desc="Aligning Data"):
        fname = os.path.basename(img_path)
        # The part of the file name before the first "." is used as the
        # numeric image id; files that do not follow that pattern are skipped.
        try:
            image_id = int(fname.split('.')[0])
        except ValueError:
            continue
        anns = img_to_anns.get(image_id, [])
        if not anns:
            # Nothing would be emitted for this image, so do not decode it.
            continue
        img = cv2.imread(img_path)
        if img is None:
            # Unreadable or corrupt image file.
            continue
        h, w, _ = img.shape
        for ann in anns[:3]:  # Use at most 3 objects per image
            box = convert_bbox(ann['bbox'], w, h)
            dataset.append({
                "id": f"{image_id}_{ann['label']}",
                "image": fname,
                "conversations": [
                    {"from": "human", "value": f"Where is the {ann['label']}? <image>"},
                    {"from": "qwen", "value": f"The {ann['label']} is at {box}."}
                ]
            })
    with open(OUTPUT_ALIGN_FILE, 'w') as f:
        json.dump(dataset, f, indent=2)
    print(f"Alignment complete, saved to {OUTPUT_ALIGN_FILE}")
else:
    # Report the skip so a wrong path is noticed here, not in a later cell.
    print(f"Annotation file not found, alignment skipped: {ANNOTATION_FILE}")

### 3. Phase 2: LLM Enhanced Annotation (Cognitive Data Generation)
Leverage large models like Qwen2.5-VL to generate dialogues containing reasoning and detailed descriptions.

In [None]:
import base64
from openai import OpenAI

# Placeholder value: replace with a real key before calling the API.
API_KEY = "YOUR_API_KEY"
# Client from the openai package, pointed at the SiliconFlow endpoint via base_url.
client = OpenAI(api_key=API_KEY, base_url="https://api.siliconflow.cn/v1")

def encode_img(p):
    """Return the contents of the file at ``p`` as a Base64 text string.

    Args:
        p: Path of the file to read (opened in binary mode).

    Returns:
        The Base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(p, "rb") as fh:
        raw = fh.read()
    encoded = base64.b64encode(raw)
    return encoded.decode('utf-8')

def generate_llava_json(img_path):
    """Ask the vision-language model to produce dialogue JSON for one image.

    The image is Base64-encoded with ``encode_img`` and sent as a JPEG data
    URL, together with a fixed system instruction and user prompt, through
    the module-level ``client``.

    Args:
        img_path: Path of the image file to send.

    Returns:
        The ``content`` of the first choice's message, exactly as returned by
        the API; it is not parsed or validated here.
    """
    data_url = f"data:image/jpeg;base64,{encode_img(img_path)}"
    system_msg = {
        "role": "system",
        "content": "Return JSON directly, including two rounds of dialogue: describing the image and detailed questioning. Use the [ymin, xmin, ymax, xmax] format.",
    }
    user_msg = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Analyze this image"},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }
    completion = client.chat.completions.create(
        model="Qwen/Qwen2.5-VL-72B-Instruct",
        messages=[system_msg, user_msg],
    )
    return completion.choices[0].message.content

### 4. Phase 3: Interleaved Image Data
Construct dialogues with interleaved input of two images to train the model's comparative and logical reasoning capabilities.

In [None]:
import random
def generate_interleaved(img1, img2):
    """Return a sample two-image comparison record in interleaved format.

    No model is called yet: apart from the two file names, the returned
    record is a fixed example.

    Args:
        img1: Path of the first image.
        img2: Path of the second image.

    Returns:
        dict with a constant ``"id"``, an ``"image"`` list holding the base
        names of both paths, and a two-turn ``"conversations"`` list.
    """
    prompt = "Compare these two images."  # intended model prompt; not used yet
    # Call the model and construct LLaVA Interleaved format...
    # Example result below:
    image_names = [os.path.basename(path) for path in (img1, img2)]
    question = {"from": "human", "value": "Image 1: <image>\nImage 2: <image>\nWhat is the difference?"}
    answer = {"from": "qwen", "value": "The first image is a city, the second is a park."}
    return {
        "id": "compare_001",
        "image": image_names,
        "conversations": [question, answer],
    }

### 5. Phase 4: Debug Visualization
Read a generated JSON file (the cell below uses the Phase 1 output, `OUTPUT_ALIGN_FILE`) and project the bounding boxes back onto the images to verify the precision of the coordinates; the same check applies to LLM-generated coordinates.

In [None]:
import re
# Draw the boxes mentioned in the first few records back onto their images so
# the 0-1000 [ymin, xmin, ymax, xmax] coordinates can be checked by eye.
VIZ_OUT = "../data/viz_debug/"
os.makedirs(VIZ_OUT, exist_ok=True)

with open(OUTPUT_ALIGN_FILE, 'r') as f:
    data = json.load(f)

# Matches "[a, b, c, d]" groups of four non-negative integers.
BOX_PATTERN = re.compile(r'\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]')

# Several records can refer to the same image; keep one canvas per image so
# their boxes accumulate instead of each record overwriting the previous file.
canvases = {}
for entry in data[:5]:  # Sample the first 5 records for verification
    name = entry['image']
    if name not in canvases:
        # A failed read is cached as None so the image is reported only once.
        canvases[name] = cv2.imread(os.path.join(IMAGE_DIR, name))
        if canvases[name] is None:
            print(f"Skipping unreadable image: {name}")
    img = canvases[name]
    if img is None:
        continue
    h, w, _ = img.shape
    for turn in entry['conversations']:
        for b in BOX_PATTERN.findall(turn['value']):
            ymin, xmin, ymax, xmax = [int(x) for x in b]
            # Scale the 0-1000 grid back to pixel coordinates.
            cv2.rectangle(img, (int(xmin*w/1000), int(ymin*h/1000)),
                          (int(xmax*w/1000), int(ymax*h/1000)), (0, 255, 0), 2)

for name, img in canvases.items():
    if img is not None:
        cv2.imwrite(os.path.join(VIZ_OUT, f"viz_{name}"), img)
print(f"Verification images saved to {VIZ_OUT}")