In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.neighbors import NearestNeighbors
import os

# Function to compute the average Word2Vec vector for a sentence
def compute_sentence_vector(sentence, model, vector_size):
    words = sentence.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Initialize variables to store overall accuracy results
average_accuracies = []

# Loop through n_neighbors values from 1 to 52
for n in range(1, 53):
    accuracies = []  # Store accuracy for each group

    # Loop through group numbers from 1 to 5
    for group_number in range(1, 6):
        train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'
        test_path = f'../../translation/0.result/{group_number}/test_p.csv'

        if not os.path.exists(test_path):
            print(f"Test file for Group {group_number} does not exist. Skipping...")
            continue

        # Load the train_all and test CSVs
        train_all_csv = pd.read_csv(train_all_path, low_memory=False)
        test_csv = pd.read_csv(test_path, low_memory=False)

        train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')
        test_csv['tag_description'] = test_csv['tag_description'].fillna('')

        test_csv['c_thing'] = ''
        test_csv['c_property'] = ''
        test_csv['c_score'] = ''
        test_csv['c_duplicate'] = 0

        combined_tag_descriptions = train_all_csv['tag_description'].tolist() + test_csv['tag_description'].tolist()

        # Train Word2Vec model on combined descriptions
        sentences = [desc.split() for desc in combined_tag_descriptions]
        vector_size = 200  # You can set the vector size as needed
        model = Word2Vec(sentences, vector_size=vector_size, window=3, min_count=1, workers=-1)

        # Compute Word2Vec vectors for the train and test data
        train_all_vectors = np.array([compute_sentence_vector(desc, model, vector_size) for desc in train_all_csv['tag_description']])
        test_vectors = np.array([compute_sentence_vector(desc, model, vector_size) for desc in test_csv['tag_description']])

        # KNN에서 코사인 거리를 이용
        knn = NearestNeighbors(n_neighbors=n, metric='euclidean', n_jobs=-1)
        knn.fit(train_all_vectors)

        distances, indices = knn.kneighbors(test_vectors)

        predicted_things = []
        predicted_properties = []
        predicted_scores = []

        for i in range(len(test_csv)):
            neighbor_index = indices[i][0]
            distance = distances[i][0]

            neighbor_thing = train_all_csv.iloc[neighbor_index]['thing']
            neighbor_property = train_all_csv.iloc[neighbor_index]['property']

            predicted_things.append(neighbor_thing)
            predicted_properties.append(neighbor_property)

            # 거리 기반으로 유사도 점수 계산
            predicted_score = 1 - distance
            predicted_scores.append(predicted_score)

        test_csv['c_thing'] = predicted_things
        test_csv['c_property'] = predicted_properties
        test_csv['c_score'] = predicted_scores

        test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']
        test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']
        test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']

        mdm_true_count = len(test_csv[test_csv['MDM'] == True])
        accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100
        accuracies.append(accuracy)

    # Calculate the average accuracy for the current n_neighbors value
    average_accuracy = sum(accuracies) / len(accuracies)
    average_accuracies.append(average_accuracy)
    print(f"Average Accuracy (MDM=True) across all groups with n_neighbors={n}: {average_accuracy:.2f}%")

# Print overall results for all n_neighbors values
print("\nFinal Results:")
for n, avg_accuracy in zip(range(1, 53), average_accuracies):
    print(f"n_neighbors={n}, Average Accuracy: {avg_accuracy:.2f}%")


Average Accuracy (MDM=True) across all groups with n_neighbors=1: 85.69%
Average Accuracy (MDM=True) across all groups with n_neighbors=2: 86.04%
Average Accuracy (MDM=True) across all groups with n_neighbors=3: 85.85%
Average Accuracy (MDM=True) across all groups with n_neighbors=4: 85.88%
Average Accuracy (MDM=True) across all groups with n_neighbors=5: 85.84%
Average Accuracy (MDM=True) across all groups with n_neighbors=6: 85.81%
Average Accuracy (MDM=True) across all groups with n_neighbors=7: 85.84%
Average Accuracy (MDM=True) across all groups with n_neighbors=8: 85.86%
Average Accuracy (MDM=True) across all groups with n_neighbors=9: 85.84%
Average Accuracy (MDM=True) across all groups with n_neighbors=10: 85.91%


KeyboardInterrupt: 