In [2]:
import torch
import numpy as np

def load_and_print_embeddings(file_path):
    """Load a tensor written by ``torch.save`` and return it as a NumPy array.

    Despite the name, nothing is printed; the name is kept so that existing
    cells calling it keep working.

    Args:
        file_path: Path to a ``.pth`` file holding a single ``torch.Tensor``.

    Returns:
        numpy.ndarray: The tensor's values.
    """
    # map_location="cpu" lets a tensor that was saved from a GPU be loaded on
    # a machine without CUDA (otherwise torch.load raises there).
    # NOTE(review): torch.load unpickles the file; only use it on trusted data.
    tensor = torch.load(file_path, map_location="cpu")
    # detach() so a tensor saved with requires_grad=True can be converted;
    # .numpy() refuses tensors that are still attached to the autograd graph.
    return tensor.detach().numpy()

In [3]:
def euclidean_distance(test_sample, train_sample):
    """Return the Euclidean (L2) distance between two samples.

    Both arguments must support elementwise subtraction (e.g. NumPy arrays).
    """
    difference = test_sample - train_sample
    return np.sqrt(np.sum(np.square(difference)))

def Cosine_distance(test_sample, train_sample):
    """Return the cosine distance ``1 - cos(theta)`` between two vectors.

    When either vector has zero norm the cosine is undefined; the distance
    is then reported as 1.0.
    """
    inner = np.dot(test_sample, train_sample)
    test_norm = np.linalg.norm(test_sample)
    train_norm = np.linalg.norm(train_sample)

    # Only divide when both norms are non-zero.
    if test_norm != 0 and train_norm != 0:
        similarity = inner / (test_norm * train_norm)
        return 1 - similarity

    return 1.0

In [4]:
# Directory holding the assignment's .pth files; change this one line if the
# data lives somewhere else.
DATA_DIR = 'SMAI A1-20250202T180732Z-001/SMAI A1'

train_embeddings = load_and_print_embeddings(f'{DATA_DIR}/train_embeddings.pth')
test_embeddings = load_and_print_embeddings(f'{DATA_DIR}/test_embeddings.pth')
train_labels = load_and_print_embeddings(f'{DATA_DIR}/train_labels.pth')
test_labels = load_and_print_embeddings(f'{DATA_DIR}/test_labels.pth')
# Note: this file name is singular ("text_embedding"), unlike the others.
text_embeddings = load_and_print_embeddings(f'{DATA_DIR}/text_embedding.pth')


In [6]:
def knn(train_embeddings, train_labels, test_embeddings, k_values, distance_metric):
    """Classify test embeddings by majority vote among their nearest training embeddings.

    Args:
        train_embeddings: Iterable of training vectors; position ``i`` pairs
            with ``train_labels[i]``.
        train_labels: NumPy array of labels; it is indexed with a list of
            neighbour positions.
        test_embeddings: Iterable of vectors to classify.
        k_values: Neighbourhood sizes to evaluate. Only 1, 5 and 10 are
            recorded; any other value is ignored.
        distance_metric: ``"euclidean"`` or ``"cosine"``.

    Returns:
        Tuple ``(pred_k1, pred_k5, pred_k10)`` of NumPy arrays holding one
        predicted label per test sample. An array is empty when its k does
        not appear in ``k_values``.

    Raises:
        ValueError: If ``distance_metric`` is not a supported name, or if
            ``train_embeddings`` is empty while there is a sample to classify.
    """
    # Resolve the metric once, up front. Dispatching inside the inner loop
    # left `distance` unassigned for an unknown name, which surfaced as an
    # UnboundLocalError deep in the loop instead of a clear error here.
    if distance_metric == "euclidean":
        distance_fn = euclidean_distance
    elif distance_metric == "cosine":
        distance_fn = Cosine_distance
    else:
        raise ValueError(
            f"Unsupported distance_metric {distance_metric!r}; "
            "expected 'euclidean' or 'cosine'"
        )

    # One prediction list per k that the return value reports.
    predictions = {1: [], 5: [], 10: []}

    for test_sample in test_embeddings:
        ranked = [
            (distance_fn(test_sample, train_sample), idx)
            for idx, train_sample in enumerate(train_embeddings)
        ]
        if not ranked:
            raise ValueError("train_embeddings is empty; no neighbours to vote")

        # Stable sort on the distance only, so equidistant neighbours keep
        # their training-set order.
        ranked.sort(key=lambda pair: pair[0])

        for k in k_values:
            if k not in predictions:
                # Not part of the returned tuple, so there is nothing to record.
                continue

            # Slicing (instead of indexing 0..k-1) tolerates a k larger than
            # the number of training samples.
            knn_indices = [idx for _, idx in ranked[:k]]
            knn_labels = train_labels[knn_indices]

            label_counts = {}
            for label in knn_labels:
                label_counts[label] = label_counts.get(label, 0) + 1

            # max() returns the first key with the highest count, so a tie
            # goes to the label encountered first (the closer neighbour).
            predictions[k].append(max(label_counts, key=label_counts.get))

    return (
        np.array(predictions[1]),
        np.array(predictions[5]),
        np.array(predictions[10]),
    )

In [7]:
def cal_accuracy(predicted_labels, test_labels):
    """Return the fraction of predictions that equal the true label.

    Args:
        predicted_labels: Sequence of predicted labels.
        test_labels: Sequence of ground-truth labels, in the same order.

    Returns:
        float: Number of matching positions divided by ``len(test_labels)``.

    Raises:
        ValueError: If ``test_labels`` is empty (accuracy is undefined) or if
            the two sequences differ in length.
    """
    total = len(test_labels)
    if total == 0:
        raise ValueError("test_labels is empty; accuracy is undefined")
    # zip() would silently drop the surplus of the longer sequence and skew
    # the ratio, so reject mismatched inputs explicitly.
    if len(predicted_labels) != total:
        raise ValueError(
            f"Length mismatch: {len(predicted_labels)} predictions "
            f"for {total} labels"
        )

    true_count = sum(
        1 for pred, actual in zip(predicted_labels, test_labels) if pred == actual
    )
    return true_count / total

In [8]:
# Evaluate k-NN with each distance metric and report accuracy for every k.
k_values = [1, 5, 10]
metrics = ["cosine", "euclidean"]
for metric in metrics:
    print(f"The metric is {metric}")
    predicted_labels_1, predicted_labels_5, predicted_labels_10 = knn(
        train_embeddings, train_labels, test_embeddings, k_values, metric
    )
    # Score the three prediction arrays against the same ground truth.
    accuracy_1, accuracy_5, accuracy_10 = (
        cal_accuracy(predictions, test_labels)
        for predictions in (predicted_labels_1, predicted_labels_5, predicted_labels_10)
    )

    print(f"Accuracy for k=1: {accuracy_1:.4f}")
    print(f"Accuracy for k=5: {accuracy_5:.4f}")
    print(f"Accuracy for k=10: {accuracy_10:.4f}")

The metric is cosine
Accuracy for k=1: 0.9048
Accuracy for k=5: 0.9189
Accuracy for k=10: 0.9207
The metric is euclidean
Accuracy for k=1: 0.9048
Accuracy for k=5: 0.9190
Accuracy for k=10: 0.9207


In [9]:
def knn(text_embeddings, train_labels, test_embeddings, k):
    """Label each test embedding with the index of its closest text embedding.

    Closeness is measured with ``Cosine_distance``. The predicted label is
    the position of the nearest entry in ``text_embeddings``.
    NOTE(review): presumably entry ``i`` of ``text_embeddings`` is the text
    embedding of class ``i`` — confirm against how the file was produced.

    NOTE: this redefines the five-argument ``knn`` from an earlier cell.
    After this cell runs, re-running a cell that calls the earlier version
    fails until that earlier definition is executed again.

    Args:
        text_embeddings: Iterable of text vectors to compare against.
        train_labels: Unused; kept so existing calls keep working.
        test_embeddings: Iterable of vectors to classify.
        k: Unused; only the single nearest text embedding is considered.

    Returns:
        numpy.ndarray: One index into ``text_embeddings`` per test sample.

    Raises:
        ValueError: If ``text_embeddings`` is empty while there is at least
            one test sample to classify.
    """
    pred_labels = []
    for test_sample in test_embeddings:
        distances = [
            Cosine_distance(test_sample, text_sample)
            for text_sample in text_embeddings
        ]
        if not distances:
            raise ValueError("text_embeddings is empty; nothing to compare against")

        # Only the minimum is needed, so skip sorting the whole list. min()
        # returns the first position with the smallest distance, the same
        # tie order a stable sort would give.
        nearest_index = min(range(len(distances)), key=distances.__getitem__)
        pred_labels.append(nearest_index)

    return np.array(pred_labels)

In [10]:
# Classify every test embedding by its nearest text embedding, then score
# the predictions against the ground-truth test labels.
predicted_labels = knn(
    text_embeddings=text_embeddings,
    train_labels=train_labels,
    test_embeddings=test_embeddings,
    k=1,
)
final_accuracy = cal_accuracy(predicted_labels, test_labels)
print(f"Accuracy: {final_accuracy:.4f}")

Accuracy: 0.8781
