In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
import os
import numpy as np
from joblib import Parallel, delayed

# Accumulator for the cross-group mean accuracy of each evaluated n_neighbors
# value; the driver loop further down appends to it in evaluation order.
average_accuracies: list = []

# Worker for a single group; the driver loop runs one call per group in parallel.
def process_group(n, group_number):
    """Label one group's test rows with their nearest training row and score them.

    Each test ``tag_description`` is encoded as a binary bag of
    whitespace-separated tokens and matched against the training
    descriptions using the Jaccard metric. The ``thing`` / ``property`` of
    the single closest training row become the prediction (``c_thing`` /
    ``c_property``) and ``1 - distance`` becomes ``c_score``.

    Args:
        n: ``n_neighbors`` passed to the nearest-neighbour search. Only the
            closest neighbour is used for the prediction. The value is
            clamped to the number of training rows so an oversized ``n``
            does not make the query fail.
        group_number: Identifier interpolated into the input and output
            paths.

    Returns:
        The percentage of rows whose ``thing`` and ``property`` were both
        predicted correctly, relative to the number of ``MDM == True`` rows.
        ``None`` if the test file is missing or no row has ``MDM == True``.

    Side effects:
        When ``n == 5`` the annotated test frame is written to
        ``0.class_document/{group_number}/test_p_c.csv`` (the directory is
        created if needed).
    """
    train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'
    test_path = f'../../translation/0.result/{group_number}/test_p.csv'

    if not os.path.exists(test_path):
        print(f"Test file for Group {group_number} does not exist. Skipping...")
        return None

    # Load the train_all and test CSVs
    train_all_csv = pd.read_csv(train_all_path, low_memory=False)
    test_csv = pd.read_csv(test_path, low_memory=False)

    # Missing descriptions become empty strings so the vectorizer accepts them.
    train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')
    test_csv['tag_description'] = test_csv['tag_description'].fillna('')

    # Boolean bag-of-words: every whitespace-delimited token is a feature and
    # only presence/absence is recorded. The vocabulary comes from the
    # training descriptions only.
    vectorizer = CountVectorizer(token_pattern=r'\S+', binary=True)
    train_all_bow_matrix = (
        vectorizer.fit_transform(train_all_csv['tag_description'])
        .toarray()
        .astype(bool)
    )
    test_bow_matrix = (
        vectorizer.transform(test_csv['tag_description']).toarray().astype(bool)
    )

    # Nearest-neighbour search with the Jaccard metric, using all CPUs.
    # Clamp n_neighbors: asking for more neighbours than there are training
    # rows raises, and only the closest neighbour is consumed below anyway.
    knn = NearestNeighbors(
        n_neighbors=min(n, len(train_all_csv)),
        metric='jaccard',
        n_jobs=-1,
    )
    knn.fit(train_all_bow_matrix)
    distances, indices = knn.kneighbors(test_bow_matrix)

    # Column 0 holds the closest training row for every test row.
    nearest_index = indices[:, 0]
    test_csv['c_thing'] = train_all_csv['thing'].to_numpy()[nearest_index]
    test_csv['c_property'] = train_all_csv['property'].to_numpy()[nearest_index]
    # Jaccard similarity is 1 - Jaccard distance.
    test_csv['c_score'] = 1 - distances[:, 0]
    test_csv['c_duplicate'] = 0

    test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']
    test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']
    test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']

    if n == 5:
        output_path = f'0.class_document/{group_number}/test_p_c.csv'
        # to_csv does not create missing parent directories.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        test_csv.to_csv(output_path, index=False, encoding='utf-8-sig')

    # NOTE(review): the numerator counts correct rows over the whole frame,
    # while the denominator counts only MDM == True rows; presumably rows
    # without MDM carry no thing/property labels -- confirm.
    mdm_true_count = int((test_csv['MDM'] == True).sum())  # noqa: E712
    if mdm_true_count == 0:
        # Avoid a nan/inf accuracy that would poison the caller's average.
        print(f"Group {group_number} has no MDM=True rows. Skipping accuracy...")
        return None

    return (test_csv['ctp_correct'].sum() / mdm_true_count) * 100

# n_neighbors values to evaluate (currently only n=5).
neighbor_values = range(5, 6)

# Average accuracy keyed by the n_neighbors value that produced it, so the
# final report cannot mislabel results when the sweep range changes or when
# some value yields no usable group.
accuracy_by_n = {}

for n in neighbor_values:
    # Evaluate groups 1-5 in parallel, one process_group call per group.
    results = Parallel(n_jobs=-1)(
        delayed(process_group)(n, group_number) for group_number in range(1, 6)
    )

    # Drop groups that were skipped (process_group returned None).
    accuracies = [result for result in results if result is not None]

    if accuracies:
        average_accuracy = sum(accuracies) / len(accuracies)
        average_accuracies.append(average_accuracy)
        accuracy_by_n[n] = average_accuracy
        print(f"Average Accuracy (MDM=True) across all groups with n_neighbors={n}: {average_accuracy:.2f}%")

# Print overall results for every n_neighbors value that produced a result
print("\nFinal Results:")
for n, avg_accuracy in accuracy_by_n.items():
    print(f"n_neighbors={n}, Average Accuracy: {avg_accuracy:.2f}%")
    


Average Accuracy (MDM=True) across all groups with n_neighbors=5: 86.09%

Final Results:
n_neighbors=1, Average Accuracy: 86.09%
