In [7]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from sklearn.cluster import MiniBatchKMeans,KMeans,DBSCAN
from sklearn.manifold import TSNE
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import cosine_distances
from collections import Counter, defaultdict
import ipdb

In [8]:
def load_mapping(mapping_file):
    """Parse a tab-separated ``wnid<TAB>index`` file into a dictionary.

    Args:
        mapping_file: Path to a text file holding one ``wnid<TAB>index``
            pair per line.

    Returns:
        dict: Maps each wnid string to its index converted to ``int``.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or if the second field is not an integer.
    """
    wnid_to_index = {}
    with open(mapping_file, 'r') as f:
        for line in f:
            line = line.strip()
            # A blank line carries no mapping; skip it instead of failing on
            # the two-value unpack below.
            if not line:
                continue
            wnid, index = line.split('\t')
            wnid_to_index[wnid] = int(index)
    return wnid_to_index

In [9]:
def get_sorted_space(x, y, z):
    """Cut ``z`` into consecutive chunks and append each chunk to ``y``.

    Args:
        x: Iterable of chunk lengths, consumed in order.
        y: Object with an ``append`` method; it receives one slice of ``z``
            per entry of ``x`` (mutated in place).
        z: Sliceable sequence that is partitioned from its start.

    Returns:
        None. The result is delivered through ``y``.
    """
    offset = 0
    for width in x:
        end = offset + width
        y.append(z[offset:end])
        # The next chunk starts where this one stopped.
        offset = end

# Run KMeans++ on each class with `n_clusters = 1` and keep the 50 latents farthest from the centroid

In [10]:
# Selection settings for this section: a single cluster per class, 50 latents kept.
n_clusters, n_key = 1, 50

# input_dir is the directory the selection loop scans for per-class sub-folders.
# NOTE(review): save_dir is not referenced by any cell visible in this notebook -- confirm it is still needed.
input_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/train_vit_image_feature_CLS'
save_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/key50_far_latents_train_vit_image_feature_CLS'

# wnid -> integer index lookup parsed from the tab-separated mapping file.
wnid_to_index = load_mapping(os.path.join('ds_inf/tiny-imagenet-200', 'tiny-imagenet-mapping.txt'))

In [11]:
# Start the section from empty accumulators (three independent lists).
latents, labels, file_paths = [], [], []
# Filled by the selection loop: class folder name -> list of selected-latent records.
result = dict()

In [12]:
# For every class folder: load its latents, fit KMeans on the L2-normalised
# vectors, and record the n_key latents that lie farthest from the centroid of
# the cluster they were assigned to.
# sorted() fixes the class order (os.listdir order is arbitrary), so the key
# order of the JSON file is reproducible between runs.
for class_folder in sorted(os.listdir(input_dir)):
    class_dir = os.path.join(input_dir, class_folder)
    if not os.path.isdir(class_dir):
        continue

    file_paths = []
    latents = []

    # Collect latents and file paths; the sorted listing keeps "index" stable.
    for file_name in sorted(os.listdir(class_dir)):
        if file_name.endswith('.pt'):
            file_path = os.path.join(class_dir, file_name)
            file_paths.append(file_path)

            latent = torch.load(file_path, weights_only=True)
            latents.append(latent.cpu().numpy())

    # Folders without any .pt file contribute nothing.
    if not latents:
        continue

    # assumes each latent is a 1-D vector of equal length, giving an
    # (n_samples, dim) matrix -- TODO confirm
    class_latents = np.stack(latents).astype(np.float32)
    class_latents_normalized = normalize(class_latents, norm='l2', axis=1)

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    kmeans.fit(class_latents_normalized)
    centroids = kmeans.cluster_centers_

    # Euclidean distance of every latent to the centroid of its own cluster
    distances = np.linalg.norm(class_latents_normalized - centroids[kmeans.labels_], axis=1)

    # Descending order by distance; the slice yields at most n_key indices.
    sorted_indices = np.argsort(distances)[::-1]
    top_n_indices = sorted_indices[:n_key]

    # The original `range(n_key)` loop raised IndexError for classes with fewer
    # than n_key latents; keep what is available and say so.
    if len(top_n_indices) < n_key:
        print(f"Warning: class {class_folder} has only {len(top_n_indices)} latents (< n_key={n_key}); keeping all of them")

    class_entries = result.setdefault(class_folder, [])
    for idx in top_n_indices:
        # Swap only the trailing '.pt' (guaranteed by the endswith filter above).
        image_name = os.path.basename(file_paths[idx])[:-len('.pt')] + '.JPEG'
        class_entries.append({
            "index": int(idx),
            "file_name": image_name,
            "distance": float(distances[idx])
        })

# Save result as a JSON file
output_json_path = 'ds_inf/vit_Kmeans++_1_cluster_per_class.json'
with open(output_json_path, 'w') as f:
    json.dump(result, f, indent=4)

print(f"Result was saved in {output_json_path}")

Result was saved in ds_inf/vit_Kmeans++_1_cluster_per_class.json


In [14]:
# Load the per-class selection written by the previous cell.
with open('ds_inf/vit_Kmeans++_1_cluster_per_class.json', 'r') as f:
    data = json.load(f)

# Class to inspect; replace with the class name you want to query.
class_name = 'n01443537'
if class_name not in data:
    print(f"类别 '{class_name}' 不在数据中")
else:
    # Gather every stored distance of the class, then count the distinct values.
    distances = [record["distance"] for record in data[class_name]]
    unique_distances = set(distances)
    print(f"{class_name} 类别中 unique 的 distance 数量：{len(unique_distances)}")

n01443537 类别中 unique 的 distance 数量：50


# Run KMeans++ on each class with `n_clusters = 50` and keep the farthest latent of each cluster

In [18]:
# Selection settings for this section: 50 clusters per class, 1 latent kept per cluster.
n_clusters, n_key = 50, 1

# input_dir is the directory the selection loop scans for per-class sub-folders.
# NOTE(review): save_dir is not referenced by any cell visible in this notebook -- confirm it is still needed.
input_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/train_vit_image_feature_CLS'
save_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/key50_far_latents_train_vit_image_feature_CLS'

# wnid -> integer index lookup parsed from the tab-separated mapping file.
wnid_to_index = load_mapping(os.path.join('ds_inf/tiny-imagenet-200', 'tiny-imagenet-mapping.txt'))

In [19]:
# Reset the accumulators so nothing from the previous section leaks into this one.
latents, labels, file_paths = [], [], []
# Filled by the selection loop: class folder name -> list of selected-latent records.
result = dict()

In [20]:
# For every class folder: load its latents, fit KMeans with n_clusters clusters
# on the L2-normalised vectors, and record for each cluster the n_key members
# that lie farthest from that cluster's centroid.
# sorted() fixes the class order (os.listdir order is arbitrary), so the key
# order of the JSON file is reproducible between runs.
for class_folder in sorted(os.listdir(input_dir)):
    class_dir = os.path.join(input_dir, class_folder)
    if not os.path.isdir(class_dir):
        continue

    file_paths = []
    latents = []

    # Collect latents and file paths; the sorted listing keeps "index" stable.
    for file_name in sorted(os.listdir(class_dir)):
        if file_name.endswith('.pt'):
            file_path = os.path.join(class_dir, file_name)
            file_paths.append(file_path)

            latent = torch.load(file_path, weights_only=True)
            latents.append(latent.cpu().numpy())

    # Folders without any .pt file contribute nothing.
    if not latents:
        continue

    # assumes each latent is a 1-D vector of equal length, giving an
    # (n_samples, dim) matrix -- TODO confirm
    class_latents = np.stack(latents).astype(np.float32)
    class_latents_normalized = normalize(class_latents, norm='l2', axis=1)

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    kmeans.fit(class_latents_normalized)
    centroids = kmeans.cluster_centers_

    # Euclidean distance of every latent to the centroid of its own cluster
    distances = np.linalg.norm(class_latents_normalized - centroids[kmeans.labels_], axis=1)

    class_entries = result.setdefault(class_folder, [])

    for cluster_idx in range(n_clusters):
        # Positions (into file_paths / distances) of this cluster's members
        cluster_indices = np.where(kmeans.labels_ == cluster_idx)[0]
        cluster_distances = distances[cluster_indices]

        # Descending order by distance; the slice yields at most n_key
        # positions *within the cluster*.
        sorted_positions = np.argsort(cluster_distances)[::-1]
        top_n_positions = sorted_positions[:n_key]

        # The original `range(n_key)` loop raised IndexError for clusters with
        # fewer than n_key members; keep what is available and say so.
        if len(top_n_positions) < n_key:
            print(f"Warning: class {class_folder}, cluster {cluster_idx} has only {len(top_n_positions)} latents (< n_key={n_key}); keeping all of them")

        for pos in top_n_positions:
            # Map the within-cluster position back to the class-level index.
            idx = cluster_indices[pos]
            # Look the file up through the same index that is stored, so
            # "file_name", "index" and "distance" describe one latent. The
            # previous code indexed the cluster's file list by rank and so
            # returned the name of an unrelated cluster member.
            image_name = os.path.basename(file_paths[idx])[:-len('.pt')] + '.JPEG'
            class_entries.append({
                "index": int(idx),
                "file_name": image_name,
                "distance": float(cluster_distances[pos])
            })

# Save the result as a JSON file
output_json_path = 'ds_inf/vit_Kmeans++_50_cluster_per_class.json'
with open(output_json_path, 'w') as f:
    json.dump(result, f, indent=4)

print(f"Result was saved in {output_json_path}")

Result was saved in ds_inf/vit_Kmeans++_50_cluster_per_class.json


# Closest 50 to the centroid (最近的50个), `n_clusters = 1`


In [21]:
# Selection settings for this section: a single cluster per class, 50 latents kept.
n_clusters, n_key = 1, 50

# input_dir is the directory the selection loop scans for per-class sub-folders.
# NOTE(review): save_dir is not referenced by any cell visible in this notebook, and its
# name says "far" although this section selects the closest latents -- confirm.
input_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/train_vit_image_feature_CLS'
save_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/key50_far_latents_train_vit_image_feature_CLS'

# wnid -> integer index lookup parsed from the tab-separated mapping file.
wnid_to_index = load_mapping(os.path.join('ds_inf/tiny-imagenet-200', 'tiny-imagenet-mapping.txt'))

In [22]:
# Reset the accumulators so nothing from the previous section leaks into this one.
latents, labels, file_paths = [], [], []
# Filled by the selection loop: class folder name -> list of selected-latent records.
result = dict()

In [23]:
# For every class folder: load its latents, fit KMeans with n_clusters clusters
# on the L2-normalised vectors, and record for each cluster the n_key members
# that lie closest to that cluster's centroid.
# sorted() fixes the class order (os.listdir order is arbitrary), so the key
# order of the JSON file is reproducible between runs.
for class_folder in sorted(os.listdir(input_dir)):
    class_dir = os.path.join(input_dir, class_folder)
    if not os.path.isdir(class_dir):
        continue

    file_paths = []
    latents = []

    # Collect latents and file paths; the sorted listing keeps "index" stable.
    for file_name in sorted(os.listdir(class_dir)):
        if file_name.endswith('.pt'):
            file_path = os.path.join(class_dir, file_name)
            file_paths.append(file_path)

            latent = torch.load(file_path, weights_only=True)
            latents.append(latent.cpu().numpy())

    # Folders without any .pt file contribute nothing.
    if not latents:
        continue

    # assumes each latent is a 1-D vector of equal length, giving an
    # (n_samples, dim) matrix -- TODO confirm
    class_latents = np.stack(latents).astype(np.float32)
    class_latents_normalized = normalize(class_latents, norm='l2', axis=1)

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    kmeans.fit(class_latents_normalized)
    centroids = kmeans.cluster_centers_

    # Euclidean distance of every latent to the centroid of its own cluster
    distances = np.linalg.norm(class_latents_normalized - centroids[kmeans.labels_], axis=1)

    class_entries = result.setdefault(class_folder, [])

    for cluster_idx in range(n_clusters):
        # Positions (into file_paths / distances) of this cluster's members
        cluster_indices = np.where(kmeans.labels_ == cluster_idx)[0]
        cluster_distances = distances[cluster_indices]

        # Ascending order by distance (closest first); the slice yields at
        # most n_key positions *within the cluster*.
        sorted_positions = np.argsort(cluster_distances)
        top_n_positions = sorted_positions[:n_key]

        # The original `range(n_key)` loop raised IndexError for clusters with
        # fewer than n_key members; keep what is available and say so.
        if len(top_n_positions) < n_key:
            print(f"Warning: class {class_folder}, cluster {cluster_idx} has only {len(top_n_positions)} latents (< n_key={n_key}); keeping all of them")

        for pos in top_n_positions:
            # Map the within-cluster position back to the class-level index.
            idx = cluster_indices[pos]
            # Look the file up through the same index that is stored, so
            # "file_name", "index" and "distance" describe one latent. The
            # previous code indexed the cluster's file list by rank and so
            # returned the name of an unrelated cluster member.
            image_name = os.path.basename(file_paths[idx])[:-len('.pt')] + '.JPEG'
            class_entries.append({
                "index": int(idx),
                "file_name": image_name,
                "distance": float(cluster_distances[pos])
            })

# Save the result as a JSON file
output_json_path = 'ds_inf/vit_Kmeans++_closest_1_cluster_per_class.json'
with open(output_json_path, 'w') as f:
    json.dump(result, f, indent=4)

print(f"Result was saved in {output_json_path}")

Result was saved in ds_inf/vit_Kmeans++_closest_1_cluster_per_class.json


# Closest 1 to each centroid (质心最近的1个), `n_clusters = 50`

In [24]:
# Selection settings for this section: 50 clusters per class, 1 latent kept per cluster.
n_clusters, n_key = 50, 1

# input_dir is the directory the selection loop scans for per-class sub-folders.
# NOTE(review): save_dir is not referenced by any cell visible in this notebook, and its
# name says "far" although this section selects the closest latents -- confirm.
input_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/train_vit_image_feature_CLS'
save_dir = '/scratch/zhao.lin1/dataset/tiny-imagenet-200/key50_far_latents_train_vit_image_feature_CLS'

# wnid -> integer index lookup parsed from the tab-separated mapping file.
wnid_to_index = load_mapping(os.path.join('ds_inf/tiny-imagenet-200', 'tiny-imagenet-mapping.txt'))

In [25]:
# Reset the accumulators so nothing from the previous section leaks into this one.
latents, labels, file_paths = [], [], []
# Filled by the selection loop: class folder name -> list of selected-latent records.
result = dict()

In [26]:
# For every class folder: load its latents, fit KMeans with n_clusters clusters
# on the L2-normalised vectors, and record for each cluster the n_key members
# that lie closest to that cluster's centroid.
# sorted() fixes the class order (os.listdir order is arbitrary), so the key
# order of the JSON file is reproducible between runs.
for class_folder in sorted(os.listdir(input_dir)):
    class_dir = os.path.join(input_dir, class_folder)
    if not os.path.isdir(class_dir):
        continue

    file_paths = []
    latents = []

    # Collect latents and file paths; the sorted listing keeps "index" stable.
    for file_name in sorted(os.listdir(class_dir)):
        if file_name.endswith('.pt'):
            file_path = os.path.join(class_dir, file_name)
            file_paths.append(file_path)

            latent = torch.load(file_path, weights_only=True)
            latents.append(latent.cpu().numpy())

    # Folders without any .pt file contribute nothing.
    if not latents:
        continue

    # assumes each latent is a 1-D vector of equal length, giving an
    # (n_samples, dim) matrix -- TODO confirm
    class_latents = np.stack(latents).astype(np.float32)
    class_latents_normalized = normalize(class_latents, norm='l2', axis=1)

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    kmeans.fit(class_latents_normalized)
    centroids = kmeans.cluster_centers_

    # Euclidean distance of every latent to the centroid of its own cluster
    distances = np.linalg.norm(class_latents_normalized - centroids[kmeans.labels_], axis=1)

    class_entries = result.setdefault(class_folder, [])

    for cluster_idx in range(n_clusters):
        # Positions (into file_paths / distances) of this cluster's members
        cluster_indices = np.where(kmeans.labels_ == cluster_idx)[0]
        cluster_distances = distances[cluster_indices]

        # Ascending order by distance (closest first); the slice yields at
        # most n_key positions *within the cluster*.
        sorted_positions = np.argsort(cluster_distances)
        top_n_positions = sorted_positions[:n_key]

        # The original `range(n_key)` loop raised IndexError for clusters with
        # fewer than n_key members; keep what is available and say so.
        if len(top_n_positions) < n_key:
            print(f"Warning: class {class_folder}, cluster {cluster_idx} has only {len(top_n_positions)} latents (< n_key={n_key}); keeping all of them")

        for pos in top_n_positions:
            # Map the within-cluster position back to the class-level index.
            idx = cluster_indices[pos]
            # Look the file up through the same index that is stored, so
            # "file_name", "index" and "distance" describe one latent. The
            # previous code indexed the cluster's file list by rank and so
            # returned the name of an unrelated cluster member.
            image_name = os.path.basename(file_paths[idx])[:-len('.pt')] + '.JPEG'
            class_entries.append({
                "index": int(idx),
                "file_name": image_name,
                "distance": float(cluster_distances[pos])
            })

# Save the result as a JSON file
output_json_path = 'ds_inf/vit_Kmeans++_closest_50_cluster_per_class.json'
with open(output_json_path, 'w') as f:
    json.dump(result, f, indent=4)

print(f"Result was saved in {output_json_path}")

Result was saved in ds_inf/vit_Kmeans++_closest_50_cluster_per_class.json
