In [None]:
import os
# Expose only one GPU to this process. This cell is the first in the notebook,
# ahead of the cell that imports torch.
# NOTE(review): presumably this has to run before CUDA is initialised — confirm.
CUDA_DEVICE_INDEX = "7"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_DEVICE_INDEX

In [None]:
import contextlib
import requests
import torch
import transformers
from PIL import Image
from pathlib import Path

In [None]:
# Checkpoint identifier shared by the model and its processor.
model_name = "llava-hf/llava-1.5-7b-hf"

# Request 8-bit weight loading (load_in_8bit=True).
quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True)

# device_map='auto' leaves device placement to the library.
model = transformers.LlavaForConditionalGeneration.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=quantization_config,
)

# The processor is loaded from the same checkpoint as the model.
processor = transformers.AutoProcessor.from_pretrained(model_name)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

import psutil
import time
from tqdm import tqdm

# Load tensors
def load_tensors(folder, max_memory_percent=90):
    """Load every ``.pt`` file in ``folder`` as a ``(label, tensor)`` pair.

    The label is taken from the file name as
    ``file.split('[')[1].split(']')[0]``; e.g. ``run[cat]_3.pt`` yields the
    label ``"cat"``. Loading is best-effort: files whose name carries no
    ``[`` or whose contents cannot be deserialised are reported and skipped
    instead of aborting the whole scan.

    Args:
        folder: Directory to scan (not recursive).
        max_memory_percent: Stop loading as soon as
            ``psutil.virtual_memory().percent`` exceeds this value.

    Returns:
        A list of ``(label, tensor)`` tuples with tensors mapped to the CPU.
        The list is partial if the memory limit was hit.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # load order (and therefore the returned list) reproducible.
    for file in tqdm(sorted(os.listdir(folder))):
        memory_usage = psutil.virtual_memory().percent
        if memory_usage > max_memory_percent:
            print(f"Memory usage exceeded {max_memory_percent}%. Breaking the loop.")
            break
        if not file.endswith(".pt"):  # Assuming tensors are saved as .pt files
            continue
        if '[' not in file:
            # Without a '[' the label expression below would raise IndexError.
            print(f"Skipping {file}: no '[label]' in file name.")
            continue
        label = file.split('[')[1].split(']')[0]
        try:
            # SECURITY: torch.load unpickles the file; only point this at
            # directories whose contents you trust.
            tensor = torch.load(os.path.join(folder, file), map_location='cpu')
        except Exception as exc:
            # Still best-effort, but say what was dropped and let
            # KeyboardInterrupt/SystemExit propagate.
            print(f"Skipping {file}: could not load ({exc!r}).")
            continue
        data.append((label, tensor))
    return data

def reduce_dimension(data, method="PCA", n_components=2, random_state=42):
    """Mean-pool each tensor and project the pooled vectors to a low dimension.

    Args:
        data: Iterable of ``(label, tensor)`` pairs. Each tensor is averaged
            over ``dim=0``; the pooled vectors must all have the same shape
            so they can be stacked.
            NOTE(review): assumes dim 0 is the token axis — confirm.
        method: ``"PCA"``, ``"TSNE"`` or ``"UMAP"``.
        n_components: Output dimensionality passed to the reducer.
        random_state: Seed passed to the reducer so repeated runs give the
            same embedding.

    Returns:
        ``(reduced_data, labels)``: the array returned by the reducer's
        ``fit_transform`` and the list of labels in the same row order.

    Raises:
        ValueError: If ``method`` is not supported or ``data`` is empty.
    """
    # Validate the method before doing any pooling work.
    if method == "PCA":
        reducer = PCA(n_components=n_components, random_state=random_state)
    elif method == "TSNE":
        reducer = TSNE(n_components=n_components, random_state=random_state)
    elif method == "UMAP":
        reducer = umap.UMAP(n_components=n_components, random_state=random_state)
    else:
        raise ValueError("Unsupported reduction method")

    # `labels` must be a fresh local list: the original never initialised it,
    # so it either raised NameError or appended to a leftover global.
    flattened_data = []
    labels = []
    for label, tensor in data:
        mean_activation = tensor.mean(dim=0)  # Mean pooling over dim 0
        flattened_data.append(mean_activation.detach().cpu().numpy())
        labels.append(label)

    if not labels:
        raise ValueError("No data to reduce")

    flattened_data = np.array(flattened_data)
    reduced_data = reducer.fit_transform(flattened_data)
    return reduced_data, labels
    
# Plot
def plot_2d(data, labels, title="Hidden States Visualization"):
    """Scatter-plot 2-D points, one colour and legend entry per label.

    Args:
        data: Array indexable as ``data[row_indices, column]``; column 0 is
            plotted on the x axis and column 1 on the y axis.
        labels: Sequence of hashable labels, one per row of ``data``.
        title: Figure title.
    """
    # Group row indices by label in a single pass. A dict keeps
    # first-appearance order, so colours and legend order are the same on
    # every run (iterating a set of strings is not).
    indices_by_label = {}
    for row, label in enumerate(labels):
        indices_by_label.setdefault(label, []).append(row)

    # plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
    # pyplot.get_cmap accepts the same (name, lut) arguments.
    colors = plt.get_cmap("tab10", len(indices_by_label))
    plt.figure(figsize=(10, 8))

    for i, (label, indices) in enumerate(indices_by_label.items()):
        plt.scatter(data[indices, 0], data[indices, 1], label=label, color=colors(i))

    plt.title(title)
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.legend()
    plt.show()

In [None]:
# Directory holding the saved hidden-state files read by the cells below.
# The trailing slash is harmless for os.path.join and required by any plain
# string concatenation of a file name onto this value.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
folder_path = "/raid/lawrence/hidden_states/"

In [None]:
# Read the 'hidden_states' array out of every file in folder_path and keep it
# as a torch tensor.
# NOTE(review): assumes every entry in the folder is an .npz archive with a
# 'hidden_states' key — confirm.
tensors = []
# os.listdir() order is arbitrary; sort so `tensors` has a reproducible order.
for file in tqdm(sorted(os.listdir(folder_path))):
    # For an .npz, np.load returns an NpzFile that keeps the file open; the
    # with-block closes it once the array has been read into memory.
    # NOTE(review): `data` stays bound to the last archive after the loop, and
    # a later cell also reads a global named `data`.
    with np.load(os.path.join(folder_path, file)) as data:
        tensor = torch.from_numpy(data['hidden_states'])
    tensors.append(tensor)

In [None]:
from collections import defaultdict
def filter(data, min_count=10):
    """Keep only the pairs whose label occurs more than ``min_count`` times.

    Prints the number of pairs before and after filtering.

    Note: this function shadows the built-in ``filter`` for the rest of the
    notebook; the name is kept so existing calls keep working.

    Args:
        data: Sized iterable of ``(label, value)`` pairs with hashable labels.
        min_count: A label is kept only if it appears strictly more than
            this many times.

    Returns:
        A list of the ``(label, value)`` pairs that survive, in their
        original order.
    """
    print(len(data))

    # Step 1: count occurrences per label (only the counts are needed, so
    # the values themselves are not stored).
    label_counts = defaultdict(int)
    for label, _ in data:
        label_counts[label] += 1

    # Step 2: keep the pairs whose label has more than `min_count` entries.
    filtered_data = [
        (label, value) for label, value in data if label_counts[label] > min_count
    ]
    print(len(filtered_data))
    return filtered_data

# First view: drop labels with too few samples (see filter), project the
# remaining samples with UMAP and plot them.
# NOTE(review): `data` must be a sized collection of (label, tensor) pairs here,
# because filter() unpacks each item as `label, value`. No visible cell builds
# such a list under this name (load_tensors returns that shape) — confirm where
# `data` is supposed to come from before running on a fresh kernel.
filtered_data = filter(data)
reduced_data, labels = reduce_dimension(filtered_data, method="UMAP")  # or method="TSNE"
plot_2d(reduced_data, labels)

# Second view: only the samples whose label is one of the strings "55" .. "64",
# projected with PCA. This rebinds reduced_data and labels from the run above.
chosen = [str(i) for i in range(55,65)]
filtered_special = [(label, value) for label, value in data if label in chosen]
reduced_data, labels = reduce_dimension(filtered_special, method="PCA")  # or method="TSNE"
plot_2d(reduced_data, labels)