# Probing exp

* logit mean, max, min, prob


In [1]:
import sys 
sys.path.append('/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu')
import argparse
import os
import random
from typing import List, Union, Optional, Dict, Tuple
import gc
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from typing import Dict, Tuple, List

# import wandb

from transformers import AutoProcessor, LlavaForConditionalGeneration, set_seed  # noqa: F401

from src.model_zoo import get_model
from src.dataset_zoo import get_dataset
from src.misc import seed_all, _default_collate, save_scores
from src.old.probing_utils_copy import load_llava

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment configuration shared by the cells below.
DATASET = "VizWiz"  # becomes part of the output directory name: f"{model_type}_{DATASET}"
# Preprocessed CSVs read by the Vizwiz dataset class
# (columns used: image_path, question, gold_answer, label).
TRAIN_PATH = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/preprocess/llava-1.5-7b-hf-vizwiz_train-llava_answers.csv"
VAL_PATH = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/data/preprocess/llava-1.5-7b-hf-vizwiz_val-llava_answers.csv"
SPLIT = "train" # or val

NUM_WORKERS = 16  # forwarded to DataLoader(num_workers=...) in load_dataset
 
SEED = 1
# Project helper from src.misc; presumably seeds the numpy RNG that
# Vizwiz uses for subset sampling — TODO confirm which RNGs it covers.
seed_all(SEED)

In [57]:
from torch.utils.data import Dataset
# image path, question, 

class Vizwiz(Dataset):
    """VizWiz question/answer rows with hallucination labels, read from a CSV.

    The CSV must contain the columns ``image_path``, ``question``,
    ``gold_answer`` and ``label`` (0 = correct answer, i.e. no hallucination;
    1 = wrong answer, i.e. hallucination).

    Args:
        data_path: Path of the preprocessed CSV file.
        subset_size: Number of rows sampled at random (without replacement,
            via ``np.random.choice``) when ``start_idx`` is 0.  A value
            ``<= 0`` keeps every row.  A value larger than the CSV is clamped
            to the CSV length; previously this raised ``ValueError`` inside
            ``np.random.choice``.
        start_idx: When positive, rows ``start_idx:`` are kept in file order
            and ``subset_size`` is ignored.

    Note:
        The index returned by ``__getitem__`` is the position inside the
        *selected* rows, not the row number in the original CSV.
    """

    def __init__(
        self,
        data_path,
        subset_size=500,
        start_idx=0,
        ):
        data_cv = pd.read_csv(data_path)

        if start_idx > 0:
            # Resume mode: keep the tail of the file, no sampling.
            final_data = data_cv.iloc[start_idx:]
        elif subset_size > 0:
            # Clamp so that a small CSV yields "all rows" instead of an error.
            sample_count = min(subset_size, len(data_cv))
            subset_indices = np.random.choice(len(data_cv), sample_count, replace=False)
            final_data = data_cv.iloc[subset_indices].reset_index(drop=True)
        else:
            final_data = data_cv

        self.image_paths = final_data["image_path"].tolist()
        self.questions = final_data["question"].tolist()
        self.gold_answers = final_data["gold_answer"].tolist()
        self.hallu_labels = final_data["label"].tolist()

    def __len__(self):
        """Return the number of selected rows."""
        return len(self.questions)

    def __getitem__(self, idx: int) :
        """Return ``(idx, image_path, question, gold_answer, hallu_label)`` for row *idx*."""
        return (
            idx,
            self.image_paths[idx],
            self.questions[idx],
            self.gold_answers[idx],
            self.hallu_labels[idx],
        )
    
    
    
def viz_collate_fn(batch, image_size=336):
    """Collate ``Vizwiz`` samples into parallel lists, loading every image from disk.

    Args:
        batch: Iterable of ``(idx, image_path, question, gold_answer,
            hallu_label)`` tuples as produced by ``Vizwiz.__getitem__``.
        image_size: Side length of the black RGB placeholder substituted for
            an image that cannot be opened.  The previous code referenced an
            undefined global here, so an unreadable image raised ``NameError``
            instead of falling back.  The default of 336 corresponds to the
            576 image tokens this notebook reports for LLaVA-1.5, assuming
            14-pixel patches.

    Returns:
        ``(idxs, images, questions, gold_answers, labels, image_paths)`` where
        ``images`` holds PIL RGB images and ``labels`` holds ints.
    """
    import warnings

    idxs, images, questions, gold_answers, labels, image_paths = [], [], [], [], [], []

    for idx, image_path, question, gold_answer, hallu_label in batch:
        try:
            img = Image.open(image_path).convert("RGB")
        except Exception as err:
            # Deliberate best-effort: keep the batch aligned with a blank
            # image, but say so rather than silently feeding black pixels.
            warnings.warn(f"Could not load image {image_path!r} ({err}); using a blank placeholder.")
            img = Image.new("RGB", (image_size, image_size), (0, 0, 0))

        images.append(img)
        questions.append(question)
        gold_answers.append(gold_answer)
        labels.append(int(hallu_label))
        image_paths.append(image_path)
        idxs.append(idx)

    return (idxs, images, questions, gold_answers, labels, image_paths)

In [4]:
from transformers import LlavaForConditionalGeneration, Qwen2VLForConditionalGeneration, InstructBlipProcessor, InstructBlipForConditionalGeneration

def load_model(model_name):
    """Load a vision-language model with its processor and tokenizer.

    The family is chosen by substring match on ``model_name``, checked in this
    order: ``"llava"`` (llava-hf/llava-1.5-7b-hf), ``"qwen"``
    (Qwen/Qwen2-VL-7B-Instruct), ``"instructblip"``
    (Salesforce/instructblip-vicuna-7b).

    With CUDA available the weights are loaded in bfloat16 (compute
    capability >= 8 on GPU 0) or float16, otherwise in float32 on CPU.
    The tokenizer is always switched to left padding.

    Args:
        model_name: Free-form model name containing one of the substrings above.

    Returns:
        ``(model, processor, tokenizer)``.

    Raises:
        ValueError: If ``model_name`` matches no supported family.  The
            function used to print a message and return ``None``, which made
            callers that unpack three values fail with an unrelated TypeError.
    """
    cache_dir = '/data3/hg_weight/hg_weight'

    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        cap_major = torch.cuda.get_device_capability(0)[0]  # compute capability of gpu 0
        dtype = torch.bfloat16 if cap_major >= 8 else torch.float16
        device_map = "auto"
    else:
        dtype = torch.float32
        device_map = None

    is_qwen = False
    if "llava" in model_name:
        model_id = "llava-hf/llava-1.5-7b-hf"
        processor_cls = AutoProcessor
        model_cls = LlavaForConditionalGeneration
    elif 'qwen' in model_name:
        model_id = "Qwen/Qwen2-VL-7B-Instruct"
        processor_cls = AutoProcessor
        model_cls = Qwen2VLForConditionalGeneration
        is_qwen = True
        if cuda_ok:
            # Only pin to "cuda" when a GPU exists; forcing it on a CPU-only
            # host would make loading fail.
            device_map = "cuda"
    elif 'instructblip' in model_name:
        model_id = 'Salesforce/instructblip-vicuna-7b'
        processor_cls = InstructBlipProcessor
        model_cls = InstructBlipForConditionalGeneration
    else:
        raise ValueError(
            "The model should be one of the following: llava1.5-7b, qwen2-vl-7b, instructblip"
            f" (got {model_name!r})"
        )

    processor = processor_cls.from_pretrained(
        model_id,
        trust_remote_code=False,
        cache_dir=cache_dir,
        use_fast=False,
    )
    model = model_cls.from_pretrained(
        model_id,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        device_map=device_map,
        cache_dir=cache_dir,
    )

    tok = processor.tokenizer
    tok.padding_side = "left"

    if is_qwen:
        # Give the tokenizer a pad token if it lacks one and make generation
        # greedy by default.
        if tok.pad_token_id is None and tok.eos_token_id is not None:
            tok.pad_token = tok.eos_token
        model.generation_config.pad_token_id = tok.pad_token_id
        model.generation_config.do_sample = False

    return model, processor, tok

In [None]:
def get_num_layers_from_config(model) -> int:
    """Return the layer count declared in ``model.config``.

    ``config.text_config.num_hidden_layers`` takes precedence; otherwise
    ``config.num_hidden_layers`` is used.

    Raises:
        ValueError: If neither attribute exists (including when the model has
            no ``config`` at all).
    """
    absent = object()
    cfg = getattr(model, "config", None)
    # Probe the nested text config first, then the top-level config.
    for holder in (getattr(cfg, "text_config", absent), cfg):
        layer_count = getattr(holder, "num_hidden_layers", absent)
        if layer_count is not absent:
            return int(layer_count)
    raise ValueError("Cannot find num_hidden_layers in config.")




In [5]:
def load_dataset(split, batch_size, subset_size = 500, start_idx=0):
    """Build a non-shuffling DataLoader over the VizWiz CSV for *split*.

    ``split == 'train'`` selects ``TRAIN_PATH``; any other value selects
    ``VAL_PATH``.  ``subset_size`` and ``start_idx`` are forwarded to
    ``Vizwiz`` unchanged; batches are assembled by ``viz_collate_fn``.
    """
    csv_path = VAL_PATH if split != 'train' else TRAIN_PATH
    return DataLoader(
        Vizwiz(csv_path, subset_size=subset_size, start_idx=start_idx),
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        collate_fn=viz_collate_fn,
    )

In [6]:
# load prompt
def build_prompt(tokenizer, question, model_type) -> str:
    """Return the text prompt for *question* in the format *model_type* expects.

    * ``"llava"`` / ``"qwen"`` in ``model_type``: a single user turn (image
      placeholder followed by the question with ``'Answer in one word.'``
      appended) is rendered through ``tokenizer.apply_chat_template``.  If the
      tokenizer has no such method, or the call raises, the plain prompt
      ``"<image>\\n<stripped question>\\n"`` is returned instead; note that this
      fallback does not carry the one-word instruction.
    * ``"instructblip"`` in ``model_type``: the question with
      ``"Answer in one word."`` appended.
    * anything else: ``None``.
    """
    chat_style = ("llava" in model_type) or ("qwen" in model_type)

    if chat_style:
        user_content = [{"type": "image"}, {"type": "text", "text": question+'Answer in one word.'}]
        if hasattr(tokenizer, "apply_chat_template"):
            try:
                return tokenizer.apply_chat_template(
                    [{"role": "user", "content": user_content}],
                    add_generation_prompt=True,
                    tokenize=False
                )
            except Exception:
                # Best effort: fall through to the plain prompt below.
                pass
        return "<image>\n" + question.strip() + "\n"

    if "instructblip" in model_type:
        return question + "Answer in one word."

    return None

In [7]:
import math
from PIL import Image

def fix_tiny_image(img, base=28, round_to_multiple=True):
    """Return *img* as an RGB PIL image that is large enough for the processor.

    Args:
        img: A PIL image, or anything ``Image.fromarray`` accepts.
        base: Minimum side length; also the grid size used for rounding.
        round_to_multiple: When true, both sides are rounded *up* to the next
            multiple of ``base``.

    If the shorter side is below ``base`` the image is first upscaled by the
    integer factor ``ceil(base / shorter_side)``.  All resizing is bicubic.
    """
    rgb = (img if isinstance(img, Image.Image) else Image.fromarray(img)).convert("RGB")

    width, height = rgb.size
    shorter = min(width, height)
    if shorter < base:
        factor = math.ceil(base / shorter)
        rgb = rgb.resize((width * factor, height * factor), Image.BICUBIC)

    if not round_to_multiple:
        return rgb

    snapped = (
        int(math.ceil(rgb.width / base) * base),
        int(math.ceil(rgb.height / base) * base),
    )
    if snapped != rgb.size:
        rgb = rgb.resize(snapped, Image.BICUBIC)
    return rgb


def ensure_images_ok(images):
    """Return a new list with every entry of *images* normalised by ``fix_tiny_image``.

    Entries given as ``str`` or ``bytes`` are treated as paths and opened as
    RGB images first; all other entries are passed through as they are.
    """
    def _load(entry):
        # A str/bytes entry is a path on disk, not pixel data.
        if isinstance(entry, (str, bytes)):
            return Image.open(entry).convert("RGB")
        return entry

    return [fix_tiny_image(_load(entry), base=28, round_to_multiple=True) for entry in images]

In [48]:
import json
import os
from tqdm import tqdm
from PIL import Image
from datetime import datetime
import torch
import torch.nn.functional as F

# When True, the extraction loops in this notebook stop after the first processed batch.
IS_TEST = False
# Root directory under which f"{model_type}_{DATASET}" result folders are created.
OUTPUT_ROOT = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output"

def attention_entropy(attn_weights):
    """Mean Shannon entropy (natural log) of each head's attention rows.

    Args:
        attn_weights: Attention probabilities, documented by the caller as
            ``(B, num_heads, tgt_len, src_len)``.

    Returns:
        Tensor of shape ``attn_weights.shape[:-2]`` (``(B, num_heads)`` for the
        layout above): ``-sum(p * log p)`` over the last axis, averaged over
        the second-to-last axis.  Probabilities are clamped to ``1e-9`` inside
        the logarithm only, so exact zeros contribute 0.
    """
    log_p = torch.log(attn_weights.clamp(min=1e-9))
    row_entropy = -torch.sum(attn_weights * log_p, dim=-1)  # one value per query position
    return torch.mean(row_entropy, dim=-1)

def run_generation_w_logits(model, proc, tok, joint_loader, model_type, split):
    """Generate short answers and dump first-token logit statistics to JSON.

    For every batch of ``joint_loader`` the function

    1. runs one forward pass with ``output_attentions=True`` and computes the
       per-head attention entropy of the last layer (``attention_entropy``),
    2. generates up to 5 new tokens and takes the scores of the first
       generated token,
    3. appends one record per sample and rewrites the whole JSON file, so a
       crash loses at most the current batch.

    Each record holds: ``idx``, ``question``, ``image_path``, ``gold_answer``,
    ``model_answer``, ``hallu_label``, ``logit_mean`` / ``logit_min`` /
    ``logit_max`` (over the vocabulary, first generated token), ``logprob``
    (log-probability of the token actually generated first) and
    ``attn_entropy`` (one value per head).

    The output file is
    ``OUTPUT_ROOT/{model_type}_{DATASET}/logits_{split}_{MMDD_HHMM}.json``.
    When the global ``IS_TEST`` is true only the first batch is processed.

    Returns:
        None.
    """
    device = model.device

    cur_time = datetime.now().strftime("%m%d_%H%M")
    save_dir  = os.path.join(OUTPUT_ROOT, f"{model_type}_{DATASET}")
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"logits_{split}_{cur_time}.json")

    all_results = []
    batch_cnt = 0

    with torch.no_grad():
        for batch in tqdm(joint_loader):
            idxs, images, questions, gold_answers, labels, image_paths = batch
            prompts = [build_prompt(tok, q, model_type) for q in questions]
            images = ensure_images_ok(images)
            inputs = proc(
                images=images,
                text=prompts,
                padding=True,
                return_tensors="pt"
            ).to(device)

            # 1) Forward pass, used only for the attention maps.
            outputs = model(
                **inputs,
                output_attentions=True,
                return_dict=True
            )
            # Per-head entropy of the last layer: (B, H).
            head_entropies = attention_entropy(outputs.attentions[-1]).float().tolist()
            # Drop the attention maps of all layers before generating.
            del outputs

            # 2) Text generation with per-step scores.
            gen_out = model.generate(
                **inputs,
                use_cache=True,
                max_new_tokens=5,
                return_dict_in_generate=True,
                output_scores=True
            )

            sequences = gen_out.sequences
            scores = gen_out.scores
            logits_0 = scores[0]                            # (B, V), first generated token
            logprobs_0 = F.log_softmax(logits_0, dim=-1)    # (B, V)

            batch_size = sequences.size(0)
            # `scores` has one entry per generated step, so this is where the
            # newly generated tokens start inside `sequences`.
            start_pos = sequences.size(1) - len(scores)
            first_token_ids = sequences[:, start_pos]

            # One host transfer per statistic instead of one `.item()` per sample.
            logit_means = logits_0.mean(dim=-1).float().tolist()
            logit_mins = logits_0.min(dim=-1).values.float().tolist()
            logit_maxs = logits_0.max(dim=-1).values.float().tolist()
            first_logprobs = (
                torch.gather(logprobs_0, 1, first_token_ids.unsqueeze(1)).squeeze(1).float().tolist()
            )

            for i in range(batch_size):
                model_answer = tok.decode(sequences[i, start_pos:], skip_special_tokens=True)
                all_results.append({
                    "idx":             int(idxs[i]),
                    "question":        questions[i],
                    "image_path":      image_paths[i],
                    "gold_answer":     gold_answers[i],
                    "model_answer":    model_answer,
                    "hallu_label":     labels[i],
                    "logit_mean":      logit_means[i],
                    "logit_min":       logit_mins[i],
                    "logit_max":       logit_maxs[i],
                    "logprob":         first_logprobs[i],
                    # attention entropy per head
                    "attn_entropy":    head_entropies[i],
                })

            # Checkpoint: rewrite the full result list after every batch.
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump(all_results, f, ensure_ascii=False, indent=4)

            batch_cnt += 1
            if IS_TEST and batch_cnt == 1:
                break

        print(f"Saved {len(all_results)} records to {save_path}")


In [8]:
import torch

def calculate_all_features_for_batch(attentions: tuple, num_image_tokens: int) -> torch.Tensor:
    """Compute two per-head features from the last query row of every layer.

    Args:
        attentions: One attention tensor per layer, each indexed as
            ``(batch, heads, query, key)``.
        num_image_tokens: The first ``num_image_tokens`` key positions are
            counted as image tokens, the remaining ones as text tokens.

    Returns:
        Tensor of shape ``(batch, num_layers, heads, 2)`` where
        ``[..., 0]`` is the base-2 entropy of the last query's attention
        (with ``1e-9`` added inside the logarithm) and ``[..., 1]`` is
        ``text attention sum - image attention sum``.
    """
    def _features_of(layer_attention):
        # Attention distribution of the final query position: (batch, heads, key).
        final_row = layer_attention[:, :, -1, :]

        entropy = -torch.sum(final_row * torch.log2(final_row + 1e-9), dim=-1)

        on_image = final_row[:, :, :num_image_tokens].sum(dim=-1)
        on_text = final_row[:, :, num_image_tokens:].sum(dim=-1)

        return torch.stack([entropy, on_text - on_image], dim=-1)  # (batch, heads, 2)

    # Layers are stacked on a new axis 1.
    return torch.stack([_features_of(layer) for layer in attentions], dim=1)

In [9]:
import json
import os
from tqdm import tqdm
from PIL import Image
from datetime import datetime
import torch
import torch.nn.functional as F

# When True, the extraction loops in this notebook stop after the first processed batch.
IS_TEST = False
# Root directory under which f"{model_type}_{DATASET}" result folders are created.
OUTPUT_ROOT = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output"


def generate_with_attention(model, proc, tok, joint_loader, model_type, split, subset_size):
    """Extract per-layer/per-head attention features and save them with the labels.

    One forward pass with ``output_attentions=True`` is run per batch and
    ``calculate_all_features_for_batch`` turns the attention maps into two
    features per head (entropy, text-minus-image attention).  The number of
    image tokens is taken as ``(image_size // patch_size) ** 2`` from
    ``model.config.vision_config``.

    The result is saved with ``torch.save`` as
    ``{'features': (N, num_layers, num_heads, 2) float32, 'labels': (N,) float32}``
    where ``N`` is the number of samples actually processed.  Earlier the
    buffers were preallocated to ``subset_size`` rows, which stored all-zero
    rows labelled 0 when fewer samples were processed (for example with
    ``IS_TEST``) and failed when the loader yielded more.

    Args:
        model, proc, tok: As returned by ``load_model``.
        joint_loader: DataLoader yielding ``viz_collate_fn`` batches.
        model_type: Used for the prompt format and the output directory name.
        split: Not used by the output file name (see the note in the body).
        subset_size: Only used in the output file name.

    Returns:
        None.
    """
    config = model.config
    image_size = config.vision_config.image_size
    patch_size = config.vision_config.patch_size
    num_image_tokens = (image_size // patch_size) ** 2

    num_layers = config.text_config.num_hidden_layers
    num_heads = config.text_config.num_attention_heads
    num_features = 2    # entropy, subtraction

    device = model.device

    save_dir  = os.path.join(OUTPUT_ROOT, f"{model_type}_{DATASET}")
    os.makedirs(save_dir, exist_ok=True)
    # NOTE(review): the "train_" prefix is hard-coded, so a validation run is
    # also written to train_features_<subset_size>.pt.  Kept as is because
    # downstream cells load files by this name.
    features_path = os.path.join(save_dir, f'train_features_{subset_size}.pt')

    feature_chunks = []
    label_chunks = []
    batch_cnt = 0
    with torch.no_grad():
        for batch in tqdm(joint_loader):
            idxs, images, questions, gold_answers, labels, image_paths = batch

            prompts = [build_prompt(tok, q, model_type) for q in questions]
            images = ensure_images_ok(images)
            inputs = proc(
                images=images,
                text=prompts,
                padding=True,
                return_tensors="pt"
            ).to(device)

            outputs = model(
                **inputs,
                output_attentions=True
            )
            features_for_batch = calculate_all_features_for_batch(outputs.attentions, num_image_tokens)

            # Keep only the small feature tensor, as float32 on the CPU.
            feature_chunks.append(features_for_batch.to(device="cpu", dtype=torch.float32))
            label_chunks.append(torch.tensor(labels, dtype=torch.float))

            batch_cnt += 1
            if IS_TEST and batch_cnt == 1:
                break

    if feature_chunks:
        all_features_tensor = torch.cat(feature_chunks, dim=0)
        all_labels_tensor = torch.cat(label_chunks, dim=0)
    else:
        # Empty loader: save correctly shaped empty tensors.
        all_features_tensor = torch.zeros((0, num_layers, num_heads, num_features))
        all_labels_tensor = torch.zeros((0,))

    torch.save({'features': all_features_tensor, 'labels': all_labels_tensor}, features_path)
    print(f"All features and labels save with {all_labels_tensor.shape} and feature shape is {all_features_tensor.shape}")
            
        

# Image-token attention

In [67]:
def _get_patch_len_from_inputs_and_model(inputs, model):
    # pixel_values: (B, 3, H, W)
    H, W = inputs["pixel_values"].shape[-2:]
    try:
        patch = getattr(model.get_vision_tower(), "vision_tower", model.get_vision_tower()).config.patch_size
    except Exception:
        patch = getattr(model.config.vision_config, "patch_size", 14)
    V = (H // patch) * (W // patch)
    return V

def _build_text_image_key_masks(tokenizer, inputs, attentions, model):
    """
    반환:
      text_mask, image_mask: (B, K)  — K는 확장 후 키 길이(attentions[0].size(-1))
    """
    input_ids = inputs['input_ids']
    print(f"inputs: {input_ids.shape}")
    print(f"attentions: {len(attentions)}")
    print(f"attention shape: {attentions[0].shape}")
    device = inputs["input_ids"].device
    B = inputs["input_ids"].size(0)
    K = attentions[0].size(-1)
    V = _get_patch_len_from_inputs_and_model(inputs, model)

    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    input_ids = inputs["input_ids"]  # (B, L0)

    image_masks = []
    text_masks  = []
    for b in range(B):
        ids = input_ids[b]
        image_pos = (ids == image_token_id).nonzero(as_tuple=False).flatten().tolist()

        key_mask_image = torch.zeros(K, dtype=torch.bool, device=device)
        shift = 0
        for p in image_pos:
            start = p + shift
            end   = min(start + V, K)
            if start < K:
                key_mask_image[start:end] = True
            shift += (V - 1)
        key_mask_text = ~key_mask_image
        image_masks.append(key_mask_image)
        text_masks.append(key_mask_text)

    image_masks = torch.stack(image_masks, dim=0)  # (B, K)
    text_masks  = torch.stack(text_masks,  dim=0)  # (B, K)
    return text_masks, image_masks

In [37]:

import torch

def calculate_attn_sum(
    attentions: tuple,
    num_image_tokens: int,
    last_token_idx_or_vec,
    text_key_mask: torch.Tensor = None,   # (B, K) optional
    image_key_mask: torch.Tensor = None   # (B, K) optional
) -> torch.Tensor:
    """Sum the attention one query position pays to image keys and to text keys.

    Args:
        attentions: Per-layer tensors of shape ``(B, H, Q, K)``.
        num_image_tokens: Fallback only; the first ``num_image_tokens`` keys
            are treated as image tokens when no masks are given (this assumes
            the image tokens form a prefix, which is not recommended).
        last_token_idx_or_vec: Query index to read.  A tensor with one index
            per batch element is used as given; anything else is converted
            with ``int()`` and applied to the whole batch.
        text_key_mask, image_key_mask: Optional ``(B, K)`` masks.  They are
            used only when both are provided.

    Returns:
        Tensor of shape ``(B, num_layers, H, 2)`` with ``[..., 0]`` the image
        sum and ``[..., 1]`` the text sum.
    """
    if isinstance(last_token_idx_or_vec, torch.Tensor):
        query_rows = last_token_idx_or_vec
    else:
        # Scalar index: replicate it for every batch element.
        first = attentions[0]
        query_rows = torch.full(
            (first.size(0),), int(last_token_idx_or_vec), dtype=torch.long, device=first.device
        )

    masks_given = not (text_key_mask is None or image_key_mask is None)

    def _sums_for(layer_attention):
        B, H, Q, K = layer_attention.shape
        # Select one query row per batch element: (B, H, 1, K) -> (B, H, K).
        gather_index = query_rows.to(layer_attention.device).view(B, 1, 1, 1).expand(B, H, 1, K)
        picked = layer_attention.gather(2, gather_index).squeeze(2)

        if masks_given:
            # (B, H, K) * (B, 1, K) broadcasts the key mask over heads.
            image_sum = (picked * image_key_mask.unsqueeze(1)).sum(dim=-1)
            text_sum = (picked * text_key_mask.unsqueeze(1)).sum(dim=-1)
        else:
            image_sum = picked[:, :, :num_image_tokens].sum(dim=-1)
            text_sum = picked[:, :, num_image_tokens:].sum(dim=-1)

        return torch.stack([image_sum, text_sum], dim=-1)  # (B, H, 2)

    return torch.stack([_sums_for(layer) for layer in attentions], dim=1)  # (B, L, H, 2)


In [None]:
import json
import os
from tqdm import tqdm
from PIL import Image
from datetime import datetime
import torch
import torch.nn.functional as F

# When True, the extraction loops in this notebook stop after the first processed batch.
# NOTE: this rebinds the module-level flag, so functions defined in earlier
# cells read the new value too once this cell has run.
IS_TEST = True
# Root directory under which f"{model_type}_{DATASET}" result folders are created.
OUTPUT_ROOT = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output"

# 모든 head별 token, image attention 합 구하기
def generate_with_image_text(model, proc, tok, joint_loader, model_type, split, target_label):
    """Write per-head image/text attention sums of the last prompt token as JSON Lines.

    Only samples whose label equals ``target_label`` are written.  For each of
    them one line per (layer, head) is emitted with the keys ``idx``,
    ``question``, ``layer``, ``head``, ``image_attention_sum``,
    ``text_attention_sum`` and ``text_sub_image`` (text minus image).  Lines
    are ordered by sample, then layer, then head.

    The output file is
    ``OUTPUT_ROOT/{model_type}_{DATASET}/head_w_txt_img_{target_label}_{split}_{MMDD_HHMM}.json``.
    When the global ``IS_TEST`` is true, processing stops after the first
    batch that contained a matching sample.

    Changes compared with the first version of this function:

    * the label filter is applied per sample; before, only ``labels[0]``
      decided whether the *whole* batch was written;
    * the query position is the last attended position of each row, which is
      correct for left- and right-padded batches alike
      (``attention_mask.sum(1) - 1`` is only right without left padding);
    * the result tensor is moved to the CPU once instead of calling
      ``.item()`` for every layer/head.

    Returns:
        None.
    """
    config = model.config
    image_size = config.vision_config.image_size
    patch_size = config.vision_config.patch_size
    num_image_tokens = (image_size // patch_size) ** 2
    print(f"Num image tokens: {num_image_tokens}")

    device = model.device

    cur_time = datetime.now().strftime("%m%d_%H%M")
    save_dir  = os.path.join(OUTPUT_ROOT, f"{model_type}_{DATASET}")
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"head_w_txt_img_{target_label}_{split}_{cur_time}.json")

    batch_cnt = 0

    with open(save_path, 'w', encoding='utf-8') as f:
        with torch.no_grad():
            for batch in tqdm(joint_loader):
                idxs, images, questions, gold_answers, labels, image_paths = batch

                wanted = [b for b, label in enumerate(labels) if label == target_label]
                if not wanted:
                    continue

                prompts = [build_prompt(tok, q, model_type) for q in questions]
                images = ensure_images_ok(images)
                inputs = proc(
                    images=images,
                    text=prompts,
                    padding=True,
                    return_tensors="pt"
                ).to(device)

                attention_mask = inputs['attention_mask']
                print(f"atten_shape: {attention_mask.shape}")
                # Index of the last non-padding position in every row: (B,).
                positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
                last_token_idx_vec = (attention_mask * positions).argmax(dim=1)

                outputs = model(
                    **inputs,
                    output_attentions=True
                )
                # One (B, H, Q, K) tensor per language-model layer holding the
                # softmax-normalised attention of every query position.
                attentions = outputs.attentions
                text_key_mask, image_key_mask = _build_text_image_key_masks(tok, inputs, attentions, model)  # (B,K)

                sums_tensor_batch = calculate_attn_sum(
                    attentions,
                    num_image_tokens=num_image_tokens,          # only used by the fallback path
                    last_token_idx_or_vec=last_token_idx_vec,   # (B,)
                    text_key_mask=text_key_mask,
                    image_key_mask=image_key_mask
                )  # (B, L, H, 2)

                # Single device-to-host copy; nested list indexed [b][layer][head].
                sums = sums_tensor_batch.float().cpu().tolist()

                for b in wanted:
                    for layer_idx, per_head in enumerate(sums[b]):
                        for head_idx, (image_sum, text_sum) in enumerate(per_head):
                            json_line = {
                                "idx": idxs[b],
                                "question": questions[b],
                                "layer": layer_idx,
                                "head": head_idx,
                                "image_attention_sum": image_sum,
                                "text_attention_sum": text_sum,
                                "text_sub_image" : float(text_sum-image_sum)
                            }
                            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')

                batch_cnt += 1
                if IS_TEST and batch_cnt == 1:
                    break

    print(f"\n✅ Analysis finished. Results saved to: {save_path}")
      
        

In [70]:
# Driver: dump image/text attention sums for hallucinated (label 1) train samples.
model_type = "llava1.5"
cur_batch = 1
cur_split = "train"
# subset_size = 100
# A positive start_idx makes Vizwiz keep rows start_idx: in file order and
# ignore subset_size.
start_idx = 100

model, proc, tok = load_model(model_type)
joint_loader = load_dataset(cur_split, cur_batch, start_idx=start_idx)

generate_with_image_text(model, proc, tok, joint_loader, model_type, cur_split, target_label=1)

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]


Num image tokens: 576


  0%|          | 1/14891 [00:05<22:38:11,  5.47s/it]

atten_shape: torch.Size([1, 600])
inputs: torch.Size([1, 600])
attentions: 32
attention shape: torch.Size([1, 32, 600, 600])


  0%|          | 1/14891 [00:06<26:00:04,  6.29s/it]


✅ Analysis finished. Results saved to: /data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/head_w_txt_img_1_train_0924_2015.json





In [None]:
# Compare the mean (text - image) attention difference per sample between
# hallucinated and non-hallucinated examples.  Both inputs are JSON Lines
# files written by generate_with_image_text.
hall_path = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/head_w_txt_img_1_train_0924_1907.json"
non_hall_path = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/head_w_txt_img_0_train_0924_1905.json"

# Records per sample: one per (layer, head); 1024 matches 32 layers x 32 heads.
head_cnt = 1024
# Number of samples to summarise from each file.
item_cnt = 5

hall_data = []
non_hall_data = []
with open(hall_path, "r") as f:
    for line in f:
        hall_data.append(json.loads(line))

print(f"hallu len: {len(hall_data)}")       

with open(non_hall_path, "r") as f:
    for line in f:
        non_hall_data.append(json.loads(line))
print(f"non-hallu len: {len(non_hall_data)}")       


# One list of per-head differences for each of the first item_cnt samples.
diffs_label_0 = [[] for _ in range(item_cnt)]
diffs_label_1 = [[] for _ in range(item_cnt)]

# The file lists all heads of one sample consecutively, so every head_cnt
# records the loop moves on to the next sample.
cur_item = 0
for item in hall_data:
    if cur_item >= item_cnt:
        break
    
    difference = item['text_attention_sum'] - item['image_attention_sum']
    diffs_label_1[cur_item].append(difference)
    
    if len(diffs_label_1[cur_item]) == head_cnt:
        cur_item += 1
    
# Same grouping for the non-hallucinated file.
cur_item = 0
for item in non_hall_data:
    if cur_item >= item_cnt:
        break
    
    difference = item['text_attention_sum'] - item['image_attention_sum']
    diffs_label_0[cur_item].append(difference)
    
    if len(diffs_label_0[cur_item]) == head_cnt:
        cur_item += 1

# Note: the header line is printed once per item, not once per group.
for i in range(item_cnt): 
    print("Avg diff on hallucination samples:")
    print(f"[hall] item {i} avg diff:", np.mean(diffs_label_1[i]))
    
for i in range(item_cnt): 
    print("Avg diff on non-hallucination samples:")
    print(f"[non-hall] item {i} avg diff:", np.mean(diffs_label_0[i]))



    

hallu len: 5120
non-hallu len: 5120
Avg diff on hallucination samples:
[hall] item 0 avg diff: 0.17899491311982274
Avg diff on hallucination samples:
[hall] item 1 avg diff: 0.2964126104488969
Avg diff on hallucination samples:
[hall] item 2 avg diff: 0.37307063676416874
Avg diff on hallucination samples:
[hall] item 3 avg diff: 0.36741162091493607
Avg diff on hallucination samples:
[hall] item 4 avg diff: 0.3108724243938923
Avg diff on non-hallucination samples:
[non-hall] item 0 avg diff: 0.22838062653318048
Avg diff on non-hallucination samples:
[non-hall] item 1 avg diff: 0.3602502578869462
Avg diff on non-hallucination samples:
[non-hall] item 2 avg diff: 0.26232808269560337
Avg diff on non-hallucination samples:
[non-hall] item 3 avg diff: 0.23542529810220003
Avg diff on non-hallucination samples:
[non-hall] item 4 avg diff: 0.29833515733480453


In [None]:
# Compare the distribution of per-head (text - image) attention differences
# between one hallucinated and one non-hallucinated run, and count the heads
# whose difference reaches THRESHOLD.  Both inputs are JSON Lines files
# written by generate_with_image_text.
hall_path = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/head_w_txt_img_1_train_0924_1949.json"
non_hall_path = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/head_w_txt_img_0_train_0924_1948.json"

THRESHOLD = 0.45


def _load_jsonl(path):
    """Return the list of JSON objects stored one per line in *path*."""
    # The writer uses utf-8 with ensure_ascii=False, so read it back the same way.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


hall_data = _load_jsonl(hall_path)
print(f"hallu len: {len(hall_data)}")

non_hall_data = _load_jsonl(non_hall_path)
print(f"non-hallu len: {len(non_hall_data)}")

# Each file is processed on its own.  The earlier loop zipped both files and
# appended `difference` / `item`, which were leftovers from a previous cell
# rather than values read from the current hallucination record.
diffs_label_1 = [item['text_sub_image'] for item in hall_data]
diffs_label_1_w_th = [diff for diff in diffs_label_1 if diff >= THRESHOLD]

diffs_label_0 = [item['text_sub_image'] for item in non_hall_data]
diffs_label_0_w_th = [diff for diff in diffs_label_0 if diff >= THRESHOLD]


print("========== Hallucination samples: ======================")
print(f"Max diff: {np.max(diffs_label_1)}")
print(f"Min diff: {np.min(diffs_label_1)}")
print(f"Avg diff: {np.mean(diffs_label_1)}")

print(f"Number of heads beyond threshod: {len(diffs_label_1_w_th)}")


print("========== Non-hallucination samples: ======================")
print(f"Max diff: {np.max(diffs_label_0)}")
print(f"Min diff: {np.min(diffs_label_0)}")
print(f"Avg diff: {np.mean(diffs_label_0)}")

print(f"Number of heads beyond threshod: {len(diffs_label_0_w_th)}")

    

hallu len: 1024
non-hallu len: 1024
Max diff: 0.97613525390625
Min diff: -0.9998898506164551
Avg diff: 0.28105138381943107
Number of heads beyond threshod: 547
Max diff: 0.97576904296875
Min diff: -0.9998712539672852
Avg diff: 0.23986634891480207
Number of heads beyond threshod: 510


In [None]:
# Driver: extract per-head attention features for 5,000 randomly sampled train rows.
model_type = "llava1.5"
cur_batch = 5
cur_split = "train"

model, proc, tok = load_model(model_type)
joint_loader = load_dataset(cur_split, cur_batch, subset_size=5000)

# subset_size is repeated here because it is embedded in the name of the
# saved file (train_features_<subset_size>.pt).
generate_with_attention(model, proc, tok, joint_loader, model_type, cur_split, subset_size=5000)

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.83s/it]
100%|██████████| 1000/1000 [13:21<00:00,  1.25it/s]

All features and labels save with torch.Size([5000]) and feature shape is torch.Size([5000, 32, 32, 2])





In [13]:
# Driver: extract per-head attention features for 1,000 randomly sampled val rows.
model_type = "llava1.5"
cur_batch = 5
cur_split = "val"

model, proc, tok = load_model(model_type)
joint_loader = load_dataset(cur_split, cur_batch, subset_size=1000)

# NOTE(review): generate_with_attention hard-codes the "train_" file prefix,
# so this validation run is saved as train_features_1000.pt.
generate_with_attention(model, proc, tok, joint_loader, model_type, cur_split, subset_size=1000)

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.91s/it]
100%|██████████| 200/200 [02:28<00:00,  1.35it/s]

All features and labels save with torch.Size([1000]) and feature shape is torch.Size([1000, 32, 32, 2])





In [25]:
# Sanity check: reload a feature file saved by generate_with_attention and
# print its shapes plus one example value.
import torch
feature_path = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/train_features_500.pt"

# The file holds a dict with the keys 'features' and 'labels'.
loaded_data = torch.load(feature_path)
features_tensor = loaded_data['features']
labels_tensor = loaded_data['labels']


print("불러온 피처 텐서의 모양:", features_tensor.shape)
print("불러온 라벨 텐서의 모양:", labels_tensor.shape)
# Sample 0, layer 0, head 0, feature 1 (text-minus-image attention).
print(features_tensor[0][0][0][1])
print(labels_tensor[0])

불러온 피처 텐서의 모양: torch.Size([500, 32, 32, 2])
불러온 라벨 텐서의 모양: torch.Size([500])
tensor(0.4160)
tensor(1.)


In [9]:
# run generation llava - val
# model_type = "llava1.5"
# cur_batch = 5
# cur_split = "val"

# model, proc, tok = load_model(model_type)
# joint_loader = load_dataset(cur_split, cur_batch)

# run_generation_w_logits(model, proc, tok, joint_loader, model_type, cur_split)

In [10]:
# # run generation instructblip - train
# model_type = "instructblip"
# cur_batch = 15
# cur_split = "train"

# model, proc, tok = load_model(model_type)
# joint_loader = load_dataset(cur_split, cur_batch)

# run_generation_w_logits(model, proc, tok, joint_loader, model_type, cur_split)

In [None]:
# Resume the LLaVA logit / attention-entropy extraction on the train split,
# starting at CSV row `start_idx`.
model_type = "llava"
cur_batch = 5
cur_split = "train"
start_idx = 11270

model, proc, tok = load_model(model_type)


# start_idx must be passed by keyword: the third positional parameter of
# load_dataset is subset_size, so a positional call would draw a random
# subset of 11270 rows instead of resuming from row 11270.
joint_loader = load_dataset(cur_split, cur_batch, start_idx=start_idx)

test=run_generation_w_logits(model, proc, tok, joint_loader, model_type, cur_split)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/745 [00:03<?, ?it/s]


TypeError: 'NoneType' object is not subscriptable

In [30]:
# NOTE(review): the captured output lists model-output keys, so `test` was
# presumably bound to a model output in an earlier kernel state; the
# run_generation_w_logits shown in this notebook returns None.
test.keys()

odict_keys(['logits', 'past_key_values', 'image_hidden_states'])

In [None]:
# generation and scoring
# Prepare train/eval frames and integer labels for a linear probe on the
# first-token logit statistics.
from sklearn.linear_model import SGDClassifier

SGD_TRAIN_PATH = "/data3/KJE/code/WIL_DeepLearningProject_2/VLM_Hallu/output/llava1.5_VizWiz/logits_0916_2251.json"

with open(SGD_TRAIN_PATH, "r", encoding="utf-8") as f:
    train_data = json.load(f)
# NOTE(review): SGD_VAL_PATH is not defined in any cell visible here — confirm
# where it is supposed to come from.
val_data = pd.read_csv(SGD_VAL_PATH)

df_train = pd.DataFrame(train_data)
# NOTE(review): `eval_data` is not defined in the visible cells; `val_data`
# loaded above looks like the intended source — confirm.
df_eval  = pd.DataFrame(eval_data)

# Names of the single features to use
FEATURES = ["logit_mean", "logit_min", "logit_max", "logprob"]
TARGET   = "hallu_label"

# Numeric labels are cast to int directly; anything else goes through a
# LabelEncoder fitted on the training labels.
y_train_raw = df_train[TARGET].values
if np.issubdtype(df_train[TARGET].dtype, np.number):
    le = None
    y_train = y_train_raw.astype(int)
else:
    # NOTE(review): LabelEncoder is not imported in the visible cells
    # (sklearn.preprocessing.LabelEncoder is presumably meant).
    le = LabelEncoder()
    y_train = le.fit_transform(y_train_raw)

# Evaluation labels (the same encoder is applied)
if le is None:
    y_eval = df_eval[TARGET].astype(int).values
else:
    # Careful: this raises if the evaluation labels contain a class that was
    # not present in the training set.
    y_eval = le.transform(df_eval[TARGET].values)