In [1]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import os
import ast
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

In [2]:
def parse_contributions(file_path):
    data = {}
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("Class"):
                class_name = line.split(": ")[1].strip()
            elif line.startswith("high activating"):
                json_part = line.split("high activating:")[1].strip()
                high_activating_dict = ast.literal_eval(json_part)
                data[class_name] = high_activating_dict
    
    return data

In [3]:
def prepare_data_for_knn(data, top_n=10):
    vectorizer = TfidfVectorizer()
    high_activating_text = []
    
    for class_info in data.values():
        top_concepts = list(class_info.items())[:top_n]
        concepts_text = ' '.join([concept for concept, _ in top_concepts]) # concept1 concept2 concept3 ...
        high_activating_text.append(concepts_text)
    
    X = vectorizer.fit_transform(high_activating_text)
    X = normalize(X)
    
    return X

In [4]:
def find_similar_classes(class_name, data, X, k=5):
    class_idx = list(data.keys()).index(class_name)
    class_vector = X[class_idx]

    knn = NearestNeighbors(n_neighbors=k+1, metric='cosine')
    knn.fit(X)

    distances, indices = knn.kneighbors([class_vector])
    similar_class_names = [list(data.keys())[i] for i in indices[0] if list(data.keys())[i] != class_name]

    return similar_class_names, distances[0][1:]

In [5]:
load_dir = '/data/psh68380/repos/Video-CBM_two-stream/result/vmae/kinetics400/triple_final_ver/model/kinetics400_cbm_2024_11_09_21_06'

file_path = os.path.join(load_dir, 'contribution.txt')
knn_output = os.path.join(load_dir, 'knn.txt')

data = parse_contributions(file_path)
X = prepare_data_for_knn(data)

similar_classes, distances = find_similar_classes("abseiling", data, X, k=5)

print("Most similar classes to 'abseiling':")
for cls, dist in zip(similar_classes, distances):
    print(f"Class: {cls}, Distance: {dist}")

ValueError: Expected 2D array, got 1D array instead:
array=[<1x1390 sparse matrix of type '<class 'numpy.float64'>'
 	with 10 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.