In [None]:
import requests
from PIL import Image

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Quick-start cell: load LLaVA-1.5, ask one question about one remote image, print the decoded output.
# The checkpoint is loaded in float16 and moved to device index 0.
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True, 
).to(0)

# The processor is loaded from the same checkpoint id as the model.
processor = AutoProcessor.from_pretrained(model_id)

# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt.
# Each value in "content" has to be a list of dicts with types ("text", "image").
conversation = [
    {

      "role": "user",
      "content": [
          {"type": "text", "text": "What are these?"},
          {"type": "image"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Download the sample image over HTTP and open it straight from the response stream.
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
# Encode image + prompt as PyTorch tensors and move them to device 0 with float16 as the target dtype.
inputs = processor(images=raw_image, text=prompt, return_tensors='pt').to(0, torch.float16)

# Sampling is disabled and at most 200 new tokens are generated.
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# The first two token ids of the returned sequence are dropped before decoding.
# NOTE(review): the decoded text presumably still contains the prompt — a later cell strips it at "ASSISTANT:".
print(processor.decode(output[0][2:], skip_special_tokens=True))


## 拼貼 (Collage)

In [None]:
import os
import math
from PIL import Image
import matplotlib.pyplot as plt

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration


##########################
# 1) Initialize LLaVA
##########################
# Load the LLaVA-1.5 checkpoint in float16 and move it to the CUDA device.
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True
).cuda()

# The processor comes from the same checkpoint id; the functions below read
# `model` and `processor` as module-level globals.
processor = AutoProcessor.from_pretrained(model_id)


##########################
# 2) Collage creation function
##########################
def create_collage(image_paths, grid_cols=3):
    """
    Tile the images at ``image_paths`` into one collage, filled row by row.

    Every grid cell is as large as the widest / tallest input image. Each image
    is pasted at the top-left corner of its cell; the rest of the canvas stays white.

    Args:
        image_paths: Sequence of image file paths, in the order they should
            appear in the grid (left to right, then top to bottom).
        grid_cols: Number of columns in the grid. Must be at least 1.

    Returns:
        A new RGB ``PIL.Image.Image``, or ``None`` when ``image_paths`` is empty.

    Raises:
        ValueError: If ``grid_cols`` is smaller than 1.
    """
    if not image_paths:
        return None
    if grid_cols < 1:
        # Fail with a clear message instead of a ZeroDivisionError further down.
        raise ValueError(f"grid_cols must be at least 1, got {grid_cols!r}")

    # Image.open is lazy and keeps the file open. Copy the pixels into memory
    # inside a `with` block so each file handle is released immediately.
    images = []
    for path in image_paths:
        with Image.open(path) as opened:
            images.append(opened.copy())

    cols = grid_cols
    rows = math.ceil(len(images) / cols)

    # Uniform cell size: large enough for the biggest image in each dimension.
    cell_width = max(img.width for img in images)
    cell_height = max(img.height for img in images)

    collage = Image.new("RGB", (cols * cell_width, rows * cell_height), (255, 255, 255))

    for index, img in enumerate(images):
        row, col = divmod(index, cols)
        collage.paste(img, (col * cell_width, row * cell_height))

    return collage


##########################
# 3) Process a single object folder
##########################
def process_object_folder(obj_folder_path):
    """
    Build a collage from one object's rendered views, display it, and print
    LLaVA's description of the object's shape and structure.

    Files named ``view_*.png`` directly inside ``obj_folder_path`` are used, in
    sorted path order. If the folder has no such file the function returns
    without displaying or printing anything.

    Reads the module-level ``model`` and ``processor`` and moves the inputs to "cuda".

    Args:
        obj_folder_path: Directory holding the ``view_*.png`` renders of one object.

    Returns:
        None. The collage is shown with matplotlib and the description is printed.
    """
    # 1) Collect image paths
    image_paths = sorted(
        os.path.join(obj_folder_path, file_name)
        for file_name in os.listdir(obj_folder_path)
        if file_name.startswith("view_") and file_name.endswith(".png")
    )
    if not image_paths:
        return

    # 2) Create collage image
    collage_img = create_collage(image_paths, grid_cols=3)

    # 3) Display the image
    plt.imshow(collage_img)
    plt.axis("off")
    plt.show()

    # 4) Prepare conversation prompt
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Ignore how the image is constructed or where it comes from. "
                        "Do not mention 'the image', 'multiple views', or 'viewpoints'. "
                        "Describe only the **object itself**, including its shape, structure, and geometry. "
                        "Focus on parts, form, and layout. Do not mention colors, materials, or how the object is presented."
                    )
                },
                {
                    "type": "image"
                },
            ],
        },
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # 5) Encode inputs
    inputs = processor(
        images=collage_img,
        text=prompt,
        return_tensors='pt'
    ).to("cuda", torch.float16)

    # 6) Generate description
    output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them so the printed description does not repeat the instructions.
    prompt_length = inputs["input_ids"].shape[1]
    llava_answer = processor.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # 7) Print result
    print("\n🔎 LLaVA shape/structure description:\n")
    print(llava_answer)
    print("\n✅ Done.")


##########################
# 4) Main function
##########################
def main(root_dir=None):
    """
    Run the collage + description demo on the first object folder found.

    The dataset is expected to be laid out as
    ``<root_dir>/<category>/<train|test>/<object>/``. Categories and objects are
    visited in sorted order so the same object is picked on every run; the
    function returns right after handing the first object directory to
    ``process_object_folder``.

    Args:
        root_dir: Dataset root. When ``None`` the original hard-coded path is used.

    Returns:
        None.
    """
    if root_dir is None:
        # Replace this with your own dataset root path
        root_dir = "/home/klooom/cheng/3d_retrival/MHSAN/modelnet40-princeton-3d-object-dataset/rendered_views_12"

    for category_name in sorted(os.listdir(root_dir)):
        category_path = os.path.join(root_dir, category_name)
        if not os.path.isdir(category_path):
            continue

        for split in ["train", "test"]:
            split_path = os.path.join(category_path, split)
            if not os.path.isdir(split_path):
                continue

            for obj_folder in sorted(os.listdir(split_path)):
                obj_folder_path = os.path.join(split_path, obj_folder)
                if not os.path.isdir(obj_folder_path):
                    continue

                # Demo mode: one object is enough, so stop after the first directory.
                process_object_folder(obj_folder_path)
                return


# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


In [None]:
import os
import math
import json
from PIL import Image
import matplotlib.pyplot as plt

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration

# 1. Initialize the LLaVA model and processor
# The checkpoint is loaded in float16 and moved to the CUDA device; the functions
# below read `model` and `processor` as module-level globals.
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
).cuda()
processor = AutoProcessor.from_pretrained(model_id)


# 2. Collage-building helper
def create_collage(image_paths, grid_cols=3):
    """
    Tile the images at ``image_paths`` into a single collage, row by row.

    All grid cells share one size: the maximum width and maximum height found
    among the inputs. Each image is pasted at the top-left of its cell on a
    white RGB canvas.

    Args:
        image_paths: Sequence of image file paths, in grid order
            (left to right, then top to bottom).
        grid_cols: Number of columns in the grid. Must be at least 1.

    Returns:
        A new RGB ``PIL.Image.Image``, or ``None`` when ``image_paths`` is empty.

    Raises:
        ValueError: If ``grid_cols`` is smaller than 1.
    """
    if not image_paths:
        # Nothing to tile; without this guard max() below fails on an empty sequence.
        return None
    if grid_cols < 1:
        raise ValueError(f"grid_cols must be at least 1, got {grid_cols!r}")

    # Image.open keeps the underlying file open until the image is closed.
    # Copy the pixels inside a `with` block so handles are not leaked when
    # this runs over thousands of object folders.
    images = []
    for path in image_paths:
        with Image.open(path) as opened:
            images.append(opened.copy())

    cols = grid_cols
    rows = math.ceil(len(images) / cols)

    max_width = max(img.width for img in images)
    max_height = max(img.height for img in images)

    collage = Image.new("RGB", (cols * max_width, rows * max_height), (255, 255, 255))

    for index, img in enumerate(images):
        row, col = divmod(index, cols)
        collage.paste(img, (col * max_width, row * max_height))

    return collage


# 3. Process a single object
def process_object_folder(obj_folder_path, category, split, results_file):
    """
    Describe one object with LLaVA and persist the description.

    The ``view_*.png`` files directly inside ``obj_folder_path`` (sorted by file
    name) are tiled into a 4-column collage and sent to the model together with
    a two-message prompt. The generated text is then:

    * written to ``description.txt`` inside ``obj_folder_path`` (overwriting), and
    * appended as one JSON line to ``results_file`` with the keys
      ``category``, ``split``, ``object_id`` and ``description``.

    If the folder has no matching image the function returns without writing anything.

    Reads the module-level ``model`` and ``processor`` and moves the inputs to "cuda".

    Args:
        obj_folder_path: Directory holding the renders of one object.
        category: Category name; interpolated into the prompt and stored in the record.
        split: Split name stored in the record.
        results_file: Path of the JSONL file to append to.

    Returns:
        None.
    """
    image_paths = [
        os.path.join(obj_folder_path, f)
        for f in sorted(os.listdir(obj_folder_path))
        if f.startswith("view_") and f.endswith(".png")
    ]
    if not image_paths:
        return

    collage_img = create_collage(image_paths, grid_cols=4)

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        f"You will be shown an image. It contains multiple views of the same single object "
                        f"from the **{category}** category. Please remember that it is only one object seen from different angles."
                    )
                },
                {
                    "type": "image"
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Now, based on what you saw, describe the object’s **shape and structure**. "
                        "Focus on its geometry, parts, and how they are arranged. "
                        "Do not mention anything about the image, how many views there are, colors, or materials."
                    )
                }
            ]
        }
    ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(
        images=collage_img,
        text=prompt,
        return_tensors='pt'
    ).to("cuda", torch.float16)

    output = model.generate(**inputs, max_new_tokens=200, do_sample=False)

    # The returned sequence begins with the prompt tokens. Slicing them off is
    # more reliable than splitting the decoded text on "ASSISTANT:", which cut
    # the description short whenever the model itself emitted that marker.
    prompt_length = inputs["input_ids"].shape[1]
    description = processor.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    object_id = os.path.basename(obj_folder_path)

    # 1) Write description.txt inside the object's folder
    desc_path = os.path.join(obj_folder_path, "description.txt")
    with open(desc_path, "w", encoding="utf-8") as f:
        f.write(description)

    # 2) Append one record to results.jsonl
    result = {
        "category": category,
        "split": split,
        "object_id": object_id,
        "description": description
    }
    with open(results_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

    print(f" {category}/{split}/{object_id}")


# 4. Main flow: walk every object folder
def main(root_dir=None, results_file="results.jsonl"):
    """
    Describe every object under the dataset root and collect the results.

    The dataset is expected to be laid out as
    ``<root_dir>/<category>/<train|test>/<object>/``. Categories and objects are
    visited in sorted order. An existing ``results_file`` is deleted first, so
    each run starts from an empty file. A failure on one object is printed and
    the walk continues with the next object.

    Args:
        root_dir: Dataset root. When ``None`` the original hard-coded path is used.
        results_file: JSONL file that ``process_object_folder`` appends to.

    Returns:
        None.
    """
    if root_dir is None:
        root_dir = "/home/klooom/cheng/3d_retrival/MHSAN/modelnet40-princeton-3d-object-dataset/rendered_views_12"

    # Remove any previous results file so records are not appended to stale output
    if os.path.exists(results_file):
        os.remove(results_file)

    for category in sorted(os.listdir(root_dir)):
        category_path = os.path.join(root_dir, category)
        if not os.path.isdir(category_path):
            continue

        for split in ["train", "test"]:
            split_path = os.path.join(category_path, split)
            if not os.path.isdir(split_path):
                continue

            for obj_folder in sorted(os.listdir(split_path)):
                obj_folder_path = os.path.join(split_path, obj_folder)
                if not os.path.isdir(obj_folder_path):
                    continue

                # Best effort: report the failure and keep going so one bad
                # object does not abort the whole run.
                try:
                    process_object_folder(obj_folder_path, category, split, results_file)
                except Exception as e:
                    print(f" Failed: {category}/{split}/{obj_folder} — {e}")


# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


In [None]:
import json
import re
from collections import Counter

def build_vocab_from_jsonl(results_file, min_freq=2, max_size=10000):
    """
    Build a word-level vocabulary from the ``description`` field of a JSONL file.

    Each non-blank line is parsed as a JSON object. Its description is
    lower-cased and split into ``\\w+`` runs, and the runs are counted over the
    whole file. Tokens are ranked by descending count (ties keep first-seen
    order), rare tokens are dropped, and the rest are numbered after the two
    special tokens ``<pad>`` (id 0) and ``<unk>`` (id 1).

    Args:
        results_file: Path to the JSONL file.
        min_freq: Minimum count a token needs to be kept.
        max_size: Upper bound on the vocabulary size, special tokens included.
            The two special tokens are always present, so values below 2 yield
            a vocabulary of exactly those two entries.

    Returns:
        dict mapping token -> integer id.
    """
    word_counter = Counter()

    with open(results_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            description = json.loads(line).get("description")
            if not isinstance(description, str):
                # Missing or null description: nothing to count for this record.
                continue
            # Simple tokenization: lower-case, then take runs of word characters.
            word_counter.update(re.findall(r"\w+", description.lower()))

    # Reserve two slots for the special tokens. Clamp at zero because a negative
    # slice bound would drop tokens from the END of the list instead of keeping none.
    regular_slots = max(max_size - 2, 0)
    frequent_tokens = [
        token for token, count in word_counter.most_common() if count >= min_freq
    ][:regular_slots]

    # Assign ids in rank order, after the special tokens.
    vocab = {"<pad>": 0, "<unk>": 1}
    for token in frequent_tokens:
        vocab[token] = len(vocab)

    print(f"Vocabulary size = {len(vocab)} (including special tokens)")
    return vocab

# Usage example
results_file = "results.jsonl"  # path to your results file
vocab = build_vocab_from_jsonl(results_file, min_freq=2, max_size=10000)

# vocab is a dict, for example:
# {"<pad>":0, "<unk>":1, "the":2, "structure":3, ...}


In [1]:
import json

def gather_descriptions(results_file, output_txt):
    """
    Extract every non-empty description from a JSONL file into a text file.

    Blank input lines are skipped. For each remaining line the ``description``
    value (empty string when the key is absent) is stripped of surrounding
    whitespace and, if anything is left, written to ``output_txt`` followed by
    a newline. ``output_txt`` is overwritten.

    Args:
        results_file: Path to the JSONL input.
        output_txt: Path of the text file to create.

    Returns:
        None.
    """
    with open(results_file, "r", encoding="utf-8") as source:
        with open(output_txt, "w", encoding="utf-8") as sink:
            for raw_line in source:
                stripped = raw_line.strip()
                if stripped:
                    record = json.loads(stripped)
                    text = record.get("description", "").strip()
                    if text:
                        sink.write(text + "\n")

# Dump all descriptions from results.jsonl into a plain-text corpus file.
# `text_corpus` is read again by the tokenizer-training call further down.
results_file = "results.jsonl"
text_corpus = "descriptions.txt"
gather_descriptions(results_file, text_corpus)

In [3]:
from tokenizers import BertWordPieceTokenizer
import os


def train_subword_tokenizer(
    text_file,
    vocab_size=3000,
    output_dir="tokenizer_model",
    min_frequency=2
):
    """
    Train a BERT-style WordPiece tokenizer on one text file and save it.

    Args:
        text_file: Path to the training corpus.
        vocab_size: Vocabulary size passed to the trainer.
        output_dir: Directory the model files are saved to; created if missing.
        min_frequency: Minimum frequency passed to the trainer.

    Returns:
        None. The model files (vocab.txt etc.) are written to ``output_dir``
        and a confirmation line is printed.
    """
    normalizer_options = {
        "clean_text": True,
        "handle_chinese_chars": True,  # toggle as needed if the corpus contains Chinese
        "strip_accents": True,
        "lowercase": True,
    }
    wordpiece = BertWordPieceTokenizer(**normalizer_options)

    os.makedirs(output_dir, exist_ok=True)

    # Train
    reserved_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    wordpiece.train(
        files=[text_file],
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        limit_alphabet=1000,
        special_tokens=reserved_tokens,
    )

    # Save the model files (vocab.txt etc.)
    wordpiece.save_model(output_dir)
    print(f"Tokenizer saved to {output_dir}/")


# Train on the corpus written by gather_descriptions; `text_corpus` is assigned
# in the preceding cell, so that cell has to run first.
train_subword_tokenizer(
    text_file=text_corpus,
    vocab_size=3000,
    output_dir="tokenizer_model"
)




Tokenizer saved to tokenizer_model/
