In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
import os

# Steering vectors by block

In [52]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict

# --- Configuration ---
# For every diffusion step and every UNet block group, this cell averages the
# stored `ttype` tensors per label and saves the difference
# (label-1 mean minus label-0 mean) as a steering vector.
excel_path = "path/to/your/excel"
npy_base_dir = "path/to/your/npy"
output_base_dir = "path/to/your/output"
label_col = "label"
prompt_col = "prompt"
seed_col = "seed"
ttype = "query"
steps = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Load the spreadsheet
df = pd.read_excel(excel_path)
os.makedirs(output_base_dir, exist_ok=True)

# Parse every row once; the (prompt, seed, label) triples do not depend on the
# step, so there is no need to re-walk the DataFrame inside the step loop.
samples = [
    (raw_prompt.replace(" ", "_"), int(raw_seed), int(raw_label))
    for raw_prompt, raw_seed, raw_label in zip(df[prompt_col], df[seed_col], df[label_col])
]

for step in steps:
    print(f"\n🚀 [STEP {step}]")
    dims_by_label = {0: defaultdict(list), 1: defaultdict(list)}
    n_missing = 0  # rows whose .npy file does not exist for this step

    for prompt, seed, label in samples:
        if label not in (0, 1):
            print(f"잘못된 라벨: {label} (prompt={prompt}, seed={seed})")
            continue

        npy_path = os.path.join(npy_base_dir, f"step{step}", f"{prompt}_seed{seed}.npy")
        if not os.path.exists(npy_path):
            n_missing += 1
            continue

        # NOTE(review): allow_pickle=True unpickles arbitrary Python objects;
        # only load files produced by a trusted pipeline.
        data = np.load(npy_path, allow_pickle=True).item()
        for block, tensor in data.get(ttype, {}).items():
            if tensor.ndim != 3:
                continue

            # Collapse the full module path to its block group:
            # "down_blocks.<i>" / "up_blocks.<i>" / "mid_block".
            parts = block.split(".")
            if parts[0] in {"down_blocks", "up_blocks"}:
                block_name = ".".join(parts[:2])
            elif parts[0] == "mid_block":
                block_name = "mid_block"
            else:
                continue

            vec = tensor.mean(axis=(0, 1))  # mean vector (dim,)
            dims_by_label[label][block_name].append(vec)

    if n_missing:
        # Previously these rows were dropped without any trace.
        print(f"⚠️ [STEP {step}] skipped {n_missing} row(s): .npy file not found")

    # --- Save steering vectors ---
    # Iterate the union of block names: a defaultdict key only exists once a
    # vector was appended, so iterating the intersection made the
    # "not enough data" warning below unreachable and silently hid blocks that
    # were seen under one label only.
    for block_name in sorted(set(dims_by_label[0]) | set(dims_by_label[1])):
        vecs0 = dims_by_label[0].get(block_name, [])
        vecs1 = dims_by_label[1].get(block_name, [])

        if not vecs0 or not vecs1:
            print(f"⚠️ [STEP {step}] {block_name} 데이터 부족")
            continue

        vec0_mean = np.mean(np.stack(vecs0), axis=0).astype(np.float32)
        vec1_mean = np.mean(np.stack(vecs1), axis=0).astype(np.float32)
        steering_vector = (vec1_mean - vec0_mean).astype(np.float32)

        safe_name = block_name.replace(".", "_")
        save_path = os.path.join(output_base_dir, f"step{step}_{safe_name}.npy")
        np.save(save_path, steering_vector)

        print(f"✅ [STEP {step}] Saved {save_path} | norm: {np.linalg.norm(steering_vector):.4f}")


🚀 [STEP 1]
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_down_blocks_0.npy | norm: 0.0338
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_down_blocks_1.npy | norm: 0.1445
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_down_blocks_2.npy | norm: 0.5526
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_mid_block.npy | norm: 1.3633
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_up_blocks_1.npy | norm: 0.8611
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_up_blocks_2.npy | norm: 0.2751
✅ [STEP 1] Saved ../steering_vectors_per_block/step1_up_blocks_3.npy | norm: 0.0173

🚀 [STEP 2]
✅ [STEP 2] Saved ../steering_vectors_per_block/step2_down_blocks_0.npy | norm: 0.0408
✅ [STEP 2] Saved ../steering_vectors_per_block/step2_down_blocks_1.npy | norm: 0.1713
✅ [STEP 2] Saved ../steering_vectors_per_block/step2_down_blocks_2.npy | norm: 0.6243
✅ [STEP 2] Saved ../steering_vectors_per_block/step2_mid_block.npy | norm: 1.5614
✅ [STEP 2] Saved ../steering_vectors_per_blo

# block 별 label 0, 1 분포의 mean 구하기 (per-block mean vectors for label 0 and label 1)

In [56]:
import os
import numpy as np
import pandas as pd
import torch
from collections import defaultdict

# --- Configuration ---
# For every diffusion step, this cell averages the stored `ttype` tensors per
# block group separately for label 0 and label 1 and saves each mean vector.
base_path = "path/to/your/base/path"
excel_path = "path/to/your/excel"
npy_base_dir = "path/to/your/npy" 
output_base_dir = "output/folder/name"
ttype = "query"  # or 'key', 'value'
label_col = "label"
prompt_col = "prompt"
seed_col = "seed"
steps = [1, 2, 3, 4, 5]

# --- Load the Excel file ---
df = pd.read_excel(os.path.join(base_path, excel_path))

# Parse every row once; the (prompt, seed, label) triples do not depend on the
# step, so there is no need to re-walk the DataFrame inside the step loop.
samples = [
    (raw_prompt.replace(" ", "_"), int(raw_seed), int(raw_label))
    for raw_prompt, raw_seed, raw_label in zip(df[prompt_col], df[seed_col], df[label_col])
]

# --- Process each step ---
for step in steps:
    dims_by_label = {0: defaultdict(list), 1: defaultdict(list)}
    step_dir = os.path.join(base_path, npy_base_dir, f"step{step}")

    for prompt, seed, label in samples:
        if label not in dims_by_label:
            # Any label other than 0/1 used to raise KeyError further down;
            # report the row and skip it instead.
            print(f"⚠️ [step={step}] Invalid label: {label} (prompt={prompt}, seed={seed})")
            continue

        filename = f"{prompt}_seed{seed}.npy"
        full_path = os.path.join(step_dir, filename)

        if not os.path.exists(full_path):
            print(f"❌ [step={step}] Missing file: {full_path}")
            continue

        # NOTE(review): allow_pickle=True unpickles arbitrary Python objects;
        # only load files produced by a trusted pipeline.
        data = np.load(full_path, allow_pickle=True).item()

        # .get() so a file without this tensor type is skipped rather than
        # aborting the whole run with KeyError.
        for block, tensor in data.get(ttype, {}).items():
            if tensor.ndim != 3:
                continue  # use only 3-D tensors, noted by the author as (num_heads, seq_len, dim)

            # Collapse the full module path to its block group:
            # "down_blocks.<i>" / "up_blocks.<i>" / "mid_block".
            parts = block.split(".")
            if parts[0] in {"down_blocks", "up_blocks"}:
                block_name = ".".join(parts[:2])
            elif parts[0] == "mid_block":
                block_name = "mid_block"
            else:
                continue

            vec = tensor.mean(axis=(0, 1))  # (dim,)
            dims_by_label[label][block_name].append(vec)

    # --- Save the mean vectors ---
    mean_output_dir = os.path.join(base_path, output_base_dir, f"step{step}")
    os.makedirs(mean_output_dir, exist_ok=True)

    for label in [0, 1]:
        for block_name, vec_list in dims_by_label[label].items():
            if not vec_list:
                continue
            mean_vec = np.mean(np.stack(vec_list), axis=0).astype(np.float32)
            safe_name = block_name.replace(".", "_").replace("/", "_")
            save_path = os.path.join(mean_output_dir, f"{safe_name}_mean{label}.npy")
            np.save(save_path, mean_vec)
            print(f"📁 [step={step}] Saved label={label} → {block_name} → {save_path} | shape: {mean_vec.shape}")

📁 [step=1] Saved label=0 → down_blocks.0 → /home/michelle/TTI_steering/final/steering_vectors_per_block_per_mean/step1/down_blocks_0_mean0.npy | shape: (320,)
📁 [step=1] Saved label=0 → down_blocks.1 → /home/michelle/TTI_steering/final/steering_vectors_per_block_per_mean/step1/down_blocks_1_mean0.npy | shape: (640,)
📁 [step=1] Saved label=0 → down_blocks.2 → /home/michelle/TTI_steering/final/steering_vectors_per_block_per_mean/step1/down_blocks_2_mean0.npy | shape: (1280,)
📁 [step=1] Saved label=0 → up_blocks.1 → /home/michelle/TTI_steering/final/steering_vectors_per_block_per_mean/step1/up_blocks_1_mean0.npy | shape: (1280,)
📁 [step=1] Saved label=0 → up_blocks.2 → /home/michelle/TTI_steering/final/steering_vectors_per_block_per_mean/step1/up_blocks_2_mean0.npy | shape: (640,)
📁 [step=1] Saved label=0 → up_blocks.3 → /home/michelle/TTI_steering/final/steering_vectors_per_block_per_mean/step1/up_blocks_3_mean0.npy | shape: (320,)
📁 [step=1] Saved label=0 → mid_block → /home/michelle/TT