In [3]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.metrics import pairwise_distances
import os
import numpy as np
from joblib import Parallel, delayed

# Function to compute the average Word2Vec vector for a sentence
def compute_sentence_vector(sentence, model, vector_size):
    words = sentence.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Initialize variables to store overall accuracy results
average_accuracies = {}

# Function to process each group (parallelized later)
def process_group(C_value, group_number):
    train_all_path = f'../../data_preprocess/dataset/{group_number}/train_all.csv'
    test_path = f'../../translation/0.result/{group_number}/test_p.csv'

    if not os.path.exists(test_path):
        print(f"Test file for Group {group_number} does not exist. Skipping...")
        return None

    # Load the train_all and test CSVs
    train_all_csv = pd.read_csv(train_all_path, low_memory=False)
    test_csv = pd.read_csv(test_path, low_memory=False)

    train_all_csv['tag_description'] = train_all_csv['tag_description'].fillna('')
    test_csv['tag_description'] = test_csv['tag_description'].fillna('')

    test_csv['c_thing'] = ''
    test_csv['c_property'] = ''
    test_csv['c_score'] = ''
    test_csv['c_duplicate'] = 0

    combined_tag_descriptions = train_all_csv['tag_description'].tolist() + test_csv['tag_description'].tolist()
    sentences = [desc.split() for desc in combined_tag_descriptions]
    
    vector_size = 200  # 벡터 크기 설정
    model = Word2Vec(sentences, vector_size=vector_size, window=3, min_count=1, workers=-1)

    # Train data vectors
    train_all_vectors = np.array([compute_sentence_vector(desc, model, vector_size) for desc in train_all_csv['tag_description']])
    # Test data vectors
    test_vectors = np.array([compute_sentence_vector(desc, model, vector_size) for desc in test_csv['tag_description']])

    # SVM 모델 학습 및 예측
    svm_model_thing = SVC(kernel='linear', probability=True, C=C_value)
    svm_model_property = SVC(kernel='linear', probability=True, C=C_value)

    # SVM을 이용하여 'thing' 및 'property' 예측 모델 학습
    svm_model_thing.fit(train_all_vectors, train_all_csv['thing'])
    svm_model_property.fit(train_all_vectors, train_all_csv['property'])

    # 'thing' 및 'property' 예측
    predicted_things = svm_model_thing.predict(test_vectors)
    predicted_properties = svm_model_property.predict(test_vectors)
    
    predicted_scores_thing = svm_model_thing.predict_proba(test_vectors)[:, 1]  # 'thing'의 예측 확률 점수
    predicted_scores_property = svm_model_property.predict_proba(test_vectors)[:, 1]  # 'property'의 예측 확률 점수

    predicted_scores = (predicted_scores_thing + predicted_scores_property) / 2  # 평균 점수로 결합

    test_csv['c_thing'] = predicted_things
    test_csv['c_property'] = predicted_properties
    test_csv['c_score'] = predicted_scores

    test_csv['cthing_correct'] = test_csv['thing'] == test_csv['c_thing']
    test_csv['cproperty_correct'] = test_csv['property'] == test_csv['c_property']
    test_csv['ctp_correct'] = test_csv['cthing_correct'] & test_csv['cproperty_correct']

    mdm_true_count = len(test_csv[test_csv['MDM'] == True])
    accuracy = (test_csv['ctp_correct'].sum() / mdm_true_count) * 100 if mdm_true_count > 0 else 0
    return accuracy

# C 값들에 대해 실험할 값 설정 (log 스케일)
C_values = [0.1, 1, 10, 100]
C_values = [1000, 10000, 100000, 1000000]
C_values = [10000000, 100000000, 1000000000, 10000000000]

# 각 C 값에 대해 실험
for C_value in C_values:
    print(f"Running SVM with C={C_value}")
    average_accuracies[C_value] = []

    # Parallel processing for groups
    results = Parallel(n_jobs=-1)(delayed(process_group)(C_value, group_number) for group_number in range(1, 6))

    # Filter out None results (in case of missing files)
    accuracies = [result for result in results if result is not None]

    if accuracies:
        average_accuracy = sum(accuracies) / len(accuracies)
        average_accuracies[C_value].append(average_accuracy)
        print(f"Average Accuracy (MDM=True) across all groups with C={C_value}: {average_accuracy:.2f}%")

# Print overall results for all C values
print("\nFinal Results for each C value:")
for C_value, accuracies in average_accuracies.items():
    avg_acc = np.mean(accuracies)
    print(f"C={C_value}, Average Accuracy: {avg_acc:.2f}%")


Running SVM with C=10000000
Average Accuracy (MDM=True) across all groups with C=10000000: 86.77%
Running SVM with C=100000000
Average Accuracy (MDM=True) across all groups with C=100000000: 86.64%
Running SVM with C=1000000000
Average Accuracy (MDM=True) across all groups with C=1000000000: 86.68%
Running SVM with C=10000000000
Average Accuracy (MDM=True) across all groups with C=10000000000: 86.90%

Final Results for each C value:
C=10000000, Average Accuracy: 86.77%
C=100000000, Average Accuracy: 86.64%
C=1000000000, Average Accuracy: 86.68%
C=10000000000, Average Accuracy: 86.90%
