In [1]:
# Print the versions of the Jupyter components installed in this environment.
!jupyter --version


Selected Jupyter core packages...
IPython          : 8.28.0
ipykernel        : 6.29.5
ipywidgets       : not installed
jupyter_client   : 8.6.3
jupyter_core     : 5.7.2
jupyter_server   : not installed
jupyterlab       : not installed
nbclient         : not installed
nbconvert        : not installed
nbformat         : not installed
notebook         : not installed
qtconsole        : not installed
traitlets        : 5.14.3


In [2]:
def load_train_data(path: str) -> list:
    """Read space-separated word pairs from a UTF-8 text file.

    Every non-blank line is split on single spaces and its first two fields
    are kept as one ``[word1, word2]`` pair; any further fields are ignored.
    Blank (or whitespace-only) lines are skipped.

    Args:
        path: Location of the word-pair file.

    Returns:
        A list of ``[word1, word2]`` lists, in file order.

    Raises:
        IndexError: If a non-blank line holds fewer than two fields.
    """
    word_pair_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator. Dropping the last character
            # unconditionally would truncate the final word whenever the
            # file does not end with a newline.
            content = line.rstrip("\n")
            if not content.strip():
                # A blank line carries no pair; indexing its fields would fail.
                continue
            word_pair = content.split(" ")
            word_pair_list.append([word_pair[0], word_pair[1]])
    return word_pair_list

In [3]:
# Load the antonym and synonym word pairs; generate_train_data reads these
# two module-level lists to build the training set.
antonyms_list = load_train_data("antonym-synonym set/Antonym_vietnamese.txt")
synonyms_list = load_train_data("antonym-synonym set/Synonym_vietnamese.txt")

In [4]:
from word2vec import vectorizer
def get_feature_of_pair_word(word1, word2):
    """Combine the vectors of two words into a single feature.

    Both words are looked up with ``vectorizer.get_vec_from_word``. When both
    lookups succeed the feature is ``vector1 + vector2``; when either lookup
    yields ``None`` the function returns ``None``.

    NOTE(review): whether ``+`` adds element-wise or concatenates depends on
    the vector type the vectorizer returns, which is not visible here.
    """
    first_vec = vectorizer.get_vec_from_word(word1)
    second_vec = vectorizer.get_vec_from_word(word2)

    # Without both vectors there is nothing to combine.
    if first_vec is None or second_vec is None:
        return None
    return first_vec + second_vec

In [5]:
def generate_train_data():
    """Build training features and labels from the loaded word pairs.

    Pairs from the module-level ``synonyms_list`` are labelled 1 and pairs
    from ``antonyms_list`` are labelled 0, with all synonym pairs placed
    first. A pair is dropped when ``get_feature_of_pair_word`` returns
    ``None`` for it.

    Returns:
        ``(x_train, y_train)``: parallel lists of features and 0/1 labels.
    """
    x_train, y_train = [], []

    # The procedure is the same for both sources; only the label differs.
    for label, pairs in ((1, synonyms_list), (0, antonyms_list)):
        for pair in pairs:
            feature = get_feature_of_pair_word(pair[0], pair[1])
            if feature is None:
                continue
            y_train.append(label)
            x_train.append(feature)

    return x_train, y_train

In [6]:
def generate_test_data(path):
    """Build evaluation features and labels from a tab-separated pair file.

    Every non-blank line supplies ``word1<TAB>word2<TAB>relation``. A pair is
    labelled 1 when its relation is exactly ``"SYN"`` and 0 otherwise. Pairs
    for which ``get_feature_of_pair_word`` returns ``None`` are left out of
    both result lists. Blank (or whitespace-only) lines are skipped.

    Args:
        path: Location of the UTF-8 test file.

    Returns:
        ``(x_test, y_test)``: parallel lists of features and 0/1 labels, in
        file order.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    x_test = []
    y_test = []

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator. Dropping the last character
            # unconditionally turns a final "SYN" into "SY" when the file has
            # no trailing newline, silently mislabelling that pair as 0.
            record = line.rstrip("\n")
            if not record.strip():
                # A blank line carries no pair; indexing its fields would fail.
                continue

            fields = record.split("\t")
            word1, word2, relation = fields[0], fields[1], fields[2]

            feature = get_feature_of_pair_word(word1, word2)
            if feature is not None:
                y_test.append(1 if relation == "SYN" else 0)
                x_test.append(feature)

    return x_test, y_test

In [8]:
# Build the training features and 0/1 labels from the loaded synonym and
# antonym pairs.
x_train, y_train = generate_train_data()

In [9]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier created with scikit-learn's default settings.
classifier_model = LogisticRegression()

# Fit on the training pairs; the evaluation cells below reuse classifier_model.
classifier_model.fit(x_train, y_train)

In [10]:
from sklearn.metrics import classification_report

In [12]:
# Evaluate the fitted classifier on ViCon-400/600_adj_pairs.txt.
# x_test, y_test and y_pred are reassigned by every evaluation cell.
x_test, y_test = generate_test_data("datasets/ViCon-400/600_adj_pairs.txt")
y_pred = classifier_model.predict(x_test)

# Label 1 means the relation field was "SYN"; label 0 is every other relation.
print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.78      0.56      0.65       282
           1       0.60      0.81      0.69       227

    accuracy                           0.67       509
   macro avg       0.69      0.68      0.67       509
weighted avg       0.70      0.67      0.67       509



In [13]:
# Evaluate the fitted classifier on ViCon-400/400_noun_pairs.txt.
# x_test, y_test and y_pred are reassigned by every evaluation cell.
x_test, y_test = generate_test_data("datasets/ViCon-400/400_noun_pairs.txt")
y_pred = classifier_model.predict(x_test)

# Label 1 means the relation field was "SYN"; label 0 is every other relation.
print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.95      0.22      0.35       172
           1       0.53      0.99      0.69       154

    accuracy                           0.58       326
   macro avg       0.74      0.60      0.52       326
weighted avg       0.75      0.58      0.51       326



In [14]:
# Evaluate the fitted classifier on ViCon-400/400_verb_pairs.txt.
# x_test, y_test and y_pred are reassigned by every evaluation cell.
x_test, y_test = generate_test_data("datasets/ViCon-400/400_verb_pairs.txt")
y_pred = classifier_model.predict(x_test)

# Label 1 means the relation field was "SYN"; label 0 is every other relation.
print(classification_report(y_test, y_pred, target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.88      0.28      0.42       185
           1       0.52      0.95      0.67       153

    accuracy                           0.58       338
   macro avg       0.70      0.61      0.55       338
weighted avg       0.72      0.58      0.54       338

