In [215]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

In [216]:
def load_w2v(filename):
    """Load word embeddings from a text file into a ``{word: vector}`` dict.

    The file must start with two header lines that each hold one integer
    (presumably vocabulary size and vector dimension -- verify against the
    data file). They are parsed only to validate the header; their values are
    not used. Every following non-blank line is read as a word followed by
    its whitespace-separated float components.

    Args:
        filename: Path to the UTF-8 encoded embedding file.

    Returns:
        dict mapping each word to its vector as a list of floats. When a word
        appears on several lines, the last one wins.

    Raises:
        ValueError: If a header line is not an integer or a vector component
            cannot be parsed as a float.
    """
    w2v = {}
    # `with` guarantees the handle is closed even when parsing raises.
    with open(filename, 'r', encoding='utf-8') as w2v_file:
        # Consume the two header lines; int() keeps the header validation.
        int(w2v_file.readline())
        int(w2v_file.readline())
        for line in w2v_file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. an extra newline at EOF): nothing to parse.
                continue
            w2v[fields[0]] = [float(val) for val in fields[1:]]
    return w2v

In [217]:
# Embedding table shared by the feature helpers defined in later cells.
# The file name suggests 150-dimensional vectors -- TODO confirm.
W2V_PATH = 'W2V_150.txt'
word2vec_dict = load_w2v(W2V_PATH)

In [218]:
def get_word_vector(word: str, model: dict):
    """Look up the embedding stored for ``word``.

    Args:
        word: Key to look up.
        model: Mapping from word to vector.

    Returns:
        The stored vector object itself (not a copy) when ``word`` is a key
        of ``model``; otherwise a new empty list.
    """
    return model[word] if word in model else []

In [219]:
def load_train_data(train_data_path: str) -> list:
    """Read word pairs from a text file holding one pair per line.

    Each non-blank line is split on single spaces and its first two fields
    become one ``[first_word, second_word]`` entry. Blank lines are skipped.

    Args:
        train_data_path: Path to the UTF-8 encoded pair file.

    Returns:
        List of two-element lists, in file order.

    Raises:
        IndexError: If a non-blank line contains no space separator.
    """
    word_pair_list = []
    with open(train_data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator. Slicing with [:-1] would also
            # cut the last character of a final line without a newline.
            content = line.rstrip("\n")
            if not content:
                continue
            fields = content.split(" ")
            word_pair_list.append([fields[0], fields[1]])

    return word_pair_list

In [220]:
def get_feature(first_word, second_word):
    """Build the classifier feature for a word pair.

    Both words are looked up in the notebook-level ``word2vec_dict``.

    Returns:
        The first word's vector followed by the second word's vector as one
        new list, or ``[]`` when either lookup yields an empty vector (which
        is what a missing word produces).
    """
    embeddings = [
        get_word_vector(word, word2vec_dict)
        for word in (first_word, second_word)
    ]
    if all(embeddings):
        return embeddings[0] + embeddings[1]
    return []


In [229]:
def _labeled_features(word_pairs, label):
    """Return (features, labels) for the pairs that yield a non-empty feature."""
    candidates = [get_feature(pair[0], pair[1]) for pair in word_pairs]
    kept = [feature for feature in candidates if feature]
    return kept, [label] * len(kept)


def generate_train_data(antonym_data, synonym_data, train_fraction=0.5):
    """Build training features and labels from antonym and synonym pairs.

    Only the leading ``train_fraction`` of each list is used. Pairs for which
    ``get_feature`` returns an empty feature are dropped. Antonym samples
    come first in the output, followed by synonym samples.

    Args:
        antonym_data: Sequence of ``[word, word]`` pairs, labelled 0.
        synonym_data: Sequence of ``[word, word]`` pairs, labelled 1.
        train_fraction: Share of each list to use, between 0 and 1. The
            cut-off index is ``round(len(data) * train_fraction)``; note that
            Python's ``round`` rounds exact halves to the nearest even number.

    Returns:
        Tuple ``(x_train, y_train)`` of equal-length lists.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    x_train = []
    y_train = []
    # Label convention: antonym -> 0, synonym -> 1.
    for label, pairs in ((0, antonym_data), (1, synonym_data)):
        cutoff = round(len(pairs) * train_fraction)
        features, labels = _labeled_features(pairs[0:cutoff], label)
        x_train.extend(features)
        y_train.extend(labels)
    return x_train, y_train

In [230]:
def generate_test_data(test_path: str):
    """Build evaluation features and labels from a tab-separated pair file.

    Each non-blank line must hold at least three tab-separated fields: first
    word, second word, relation. A relation equal to ``"SYN"`` is labelled 1;
    any other value is labelled 0. Blank lines are skipped, as are pairs for
    which ``get_feature`` returns an empty feature.

    Args:
        test_path: Path to the UTF-8 encoded test file.

    Returns:
        Tuple ``(x_test, y_test)`` of equal-length lists, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    x_test = []
    y_test = []

    with open(test_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator. Slicing with [:-1] would turn
            # a final "SYN" into "SY" when the file has no trailing newline,
            # and that row would be labelled 0 by mistake.
            content = line.rstrip("\n")
            if not content:
                continue
            fields = content.split("\t")
            first_word, second_word, relation = fields[0], fields[1], fields[2]

            feature = get_feature(first_word, second_word)
            if not feature:
                continue
            y_test.append(1 if relation == "SYN" else 0)
            x_test.append(feature)
    return x_test, y_test

In [231]:
# Input files, named once so they are easy to change.
ANTONYM_PATH = "./Antonym_vietnamese.txt"
SYNONYM_PATH = "./Synonym_vietnamese.txt"
TEST_PATH = "./datasets/ViCon-400/400_verb_pairs.txt"

# Raw word pairs for the two training classes.
antonym_data = load_train_data(ANTONYM_PATH)
synonym_data = load_train_data(SYNONYM_PATH)

# Feature matrices and labels for fitting and for evaluation.
x_train, y_train = generate_train_data(antonym_data, synonym_data)
x_test, y_test = generate_test_data(TEST_PATH)


In [234]:
# Fixed random_state so repeated runs train the same network.
mlp = MLPClassifier(max_iter=400, random_state=1)
# fit() returns the estimator itself, so `clf` is the trained `mlp`.
clf = mlp.fit(x_train, y_train)

In [235]:
y_pred = clf.predict(x_test)

# Spot-check one hand-picked pair. get_feature returns [] when either word
# has no embedding, and predicting on that empty row would fail inside
# scikit-learn, so the prediction only runs when a feature was produced.
# In that case y_pred_word stays None and None is printed.
y_pred_word = None
sample_feature = get_feature('đồng_ý', 'từ_chối')
if sample_feature:
    y_pred_word = clf.predict(np.array([sample_feature]))
print(y_pred_word)

# Training labels antonyms 0 and synonyms 1, so these scores are for the
# synonym class, assuming scikit-learn's default positive label of 1.
print("Precision score: ", precision_score(y_test, y_pred))
print("Recall score: ", recall_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

[0]
Precision score:  0.6622807017543859
Recall score:  0.9869281045751634
F1 score:  0.7926509186351706
