In [86]:
import numpy as np
from sklearn.utils import shuffle

In [87]:
# Load the pre-trained embeddings into a {word: vector} dictionary.
word2vec = {}
with open("../word2vec/W2V_150.txt", mode='r', encoding='utf-8') as file:
    # Skip the first two lines of the file (presumably header/metadata rather
    # than embeddings -- TODO confirm against the file format).
    next(file)
    next(file)
    # Stream the file line by line instead of materialising it with readlines().
    for line in file:
        split = line.split()
        # Blank lines (e.g. a trailing newline at the end of the file) carry no
        # word; skipping them avoids an IndexError on split[0].
        if not split:
            continue

        word = split[0]
        vec = split[1:]

        # The remaining whitespace-separated fields are parsed as floats.
        word2vec[word] = np.array(vec).astype(float)


In [88]:
# Sanity check on one embedding: look up the vector stored for "thú_vị"
# and print its length and dtype.
interesting = word2vec["thú_vị"]
print(len(interesting))
print(interesting.dtype)

150
float64


In [89]:
# Vector stored for "vui"; compared against `interesting` in later cells.
fun = word2vec["vui"]

In [90]:
# Raw (unnormalised) dot product of the two word vectors.
np.dot(fun.T, interesting)

82.74809785022461

## Problem 1: Cosine similarity between word vectors

In [91]:
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a Python float.
        If either vector has zero norm the cosine is undefined; 0.0 is
        returned in that case instead of NaN, so the result can still be
        sorted and compared.
    """
    numerator = np.sum(np.multiply(vec1, vec2))
    denominator = np.sqrt(np.dot(vec1.T, vec1)) * np.sqrt(np.dot(vec2.T, vec2))

    # A zero vector would otherwise produce 0/0 -> NaN plus a RuntimeWarning.
    if denominator == 0:
        return 0.0

    return float(numerator / denominator)

In [92]:
# Cosine similarity between the two example word vectors.
cosine_similarity(fun, interesting)

0.48814743997004356

## Problem 2: K nearest words by cosine similarity

In [93]:
def KNN(word: str, vocab: dict, k: int = 5) -> np.array:
    """Find the ``k`` vocabulary entries most similar to ``word``.

    Every other key of ``vocab`` is scored against ``vocab[word]`` with
    ``cosine_similarity`` and the highest-scoring ``k`` are kept.

    Args:
        word: Query word; it is looked up in ``vocab`` and excluded from
            the candidates.
        vocab: Mapping from word to embedding vector.
        k: Number of neighbours to return.

    Returns:
        Array of ``(word, similarity)`` rows ordered from most to least
        similar. Because each row mixes a string with a number, NumPy
        stores the whole array as strings, so the similarity column comes
        back as text.
    """
    scored = [
        (candidate, cosine_similarity(vocab[word], vocab[candidate]))
        for candidate in vocab
        if candidate != word
    ]
    # Highest similarity first; the sort is stable, so ties keep vocab order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return np.array(scored[:k])

In [94]:
# Nearest neighbours of "vả" in the embedding vocabulary (default k=5).
KNN("vả", word2vec)

array([['tát', '0.5022767817350094'],
       ['đấm', '0.4383916557515601'],
       ['bạt_tai', '0.4378275347842416'],
       ['đánh_liên_tiếp', '0.4205850772757408'],
       ['đánh_túi_bụi', '0.41529420942501954']], dtype='<U32')

## Problem 3: Synonym/antonym classification with an SVM

In [95]:
def getDataSet(dataPath: str, vocab: dict) -> np.ndarray:
    """Turn a file of word pairs into a matrix of concatenated embeddings.

    Each line of the file that splits into exactly two whitespace-separated
    tokens is treated as a word pair; any other line is skipped.

    Args:
        dataPath: Path to a UTF-8 text file with one word pair per line.
        vocab: Mapping from word to embedding vector. All lookups go through
            this argument (not through a notebook-level global).

    Returns:
        Array with one row per accepted pair; each row is the first word's
        vector followed by the second word's vector.
    """
    def embed(token: str) -> np.ndarray:
        # Words missing from the vocabulary are represented by a zero vector
        # with the same shape as the first vector stored in ``vocab``.
        if token in vocab:
            return vocab[token]
        return np.zeros(next(iter(vocab.values())).shape)

    dataSet = []
    with open(file=dataPath, mode='r', encoding='utf-8') as f:
        # Iterate the file directly; there is no need to load it all at once.
        for line in f:
            split = line.split()
            if len(split) != 2:
                continue
            word1, word2 = split
            dataSet.append(np.concatenate([embed(word1), embed(word2)], axis=0))
    return np.array(dataSet)

In [96]:
def getTrainSet(isShuffle: bool = True, random_state:int = 0):
    """Assemble the synonym/antonym training features and labels.

    Pair vectors are read from the antonym and synonym files with
    ``getDataSet`` using the notebook-level ``word2vec`` embeddings.
    Synonym pairs are labelled 1 and antonym pairs 0; synonym rows are
    placed before antonym rows.

    Args:
        isShuffle: When true, rows and labels are shuffled together.
        random_state: Seed passed to ``sklearn.utils.shuffle``.

    Returns:
        ``(X_train, y_train)``: the feature matrix and its label vector.
    """
    antonym_pairs = getDataSet('../antonym-synonym set/Antonym_vietnamese.txt', word2vec)
    synonym_pairs = getDataSet("../antonym-synonym set/Synonym_vietnamese.txt", word2vec)

    features = np.concatenate([synonym_pairs, antonym_pairs], axis=0)
    labels = np.concatenate(
        [
            np.full(shape=len(synonym_pairs), fill_value=1),
            np.full(shape=len(antonym_pairs), fill_value=0),
        ],
        axis=0,
    )

    if not isShuffle:
        return features, labels

    # Shuffle both arrays with the same permutation so rows stay aligned.
    features, labels = shuffle(features, labels, random_state=random_state)
    return features, labels

In [97]:
def getTestSet(vocab: dict, paths=None):
    """Build the evaluation features and labels from annotated pair files.

    The first line of every file is skipped (presumably a header row --
    TODO confirm). Each remaining line that splits into exactly three
    whitespace-separated tokens is read as ``word1 word2 annotation``;
    other lines are ignored.

    Args:
        vocab: Mapping from word to embedding vector. All lookups go through
            this argument (not through a notebook-level global).
        paths: Optional iterable of file paths to read. Defaults to the three
            ViCon-400 noun/verb/adjective files.

    Returns:
        ``(X_test, y_test)``: one row of concatenated pair vectors per
        accepted line, and the matching labels (1 for ``'SYN'``, 0 for
        ``'ANT'``, -1 for any other annotation).
    """
    if paths is None:
        paths = ["../datasets/ViCon-400/400_noun_pairs.txt",
                "../datasets/ViCon-400/400_verb_pairs.txt",
                "../datasets/ViCon-400/600_adj_pairs.txt"]

    label_of = {'SYN': 1, 'ANT': 0}

    def embed(token: str) -> np.ndarray:
        # Words missing from the vocabulary are represented by a zero vector
        # with the same shape as the first vector stored in ``vocab``.
        if token in vocab:
            return vocab[token]
        return np.zeros(next(iter(vocab.values())).shape)

    X_test = []
    y_test = []

    for path in paths:
        with open(file=path, mode='r', encoding='utf-8') as f:
            # Skip the first line; the default keeps an empty file from
            # raising StopIteration.
            next(f, None)
            for line in f:
                split = line.split()
                if len(split) != 3:
                    continue
                word1, word2, annotation = split
                X_test.append(np.concatenate([embed(word1), embed(word2)], axis=0))
                # Unknown annotations are kept and labelled -1.
                y_test.append(label_of.get(annotation, -1))

    return np.array(X_test), np.array(y_test)

In [98]:
# Build the training features/labels with the defaults (shuffled, random_state=0).
X_train, y_train = getTrainSet()

In [99]:
# Build the evaluation features/labels from the ViCon-400 pair files.
X_test, y_test = getTestSet(word2vec)

In [100]:
# Class balance of the training labels: (synonym pairs = 1, antonym pairs = 0).
(y_train == 1).sum(), (y_train == 0).sum()

(11558, 2000)

In [104]:
from sklearn.svm import SVC
# class_weight='balanced' weights each class inversely to its frequency,
# which compensates for the synonym/antonym imbalance in the training labels.
svc = SVC(class_weight='balanced')

# Train on the concatenated pair vectors.
svc.fit(X_train, y_train)

In [105]:
from sklearn.metrics import classification_report
# Score the model on the data it was trained on: this measures how well it
# fits the training set, not how well it generalises.
y_pred = svc.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      2000
           1       1.00      0.99      0.99     11558

    accuracy                           0.99     13558
   macro avg       0.97      0.98      0.98     13558
weighted avg       0.99      0.99      0.99     13558



In [106]:
# Score the model on the ViCon-400 pairs (overwrites the training predictions).
# NOTE(review): the test scores are at least as high as the training scores --
# verify that the ViCon-400 pairs do not also appear in the training files.
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       700
           1       0.99      0.99      0.99       700

    accuracy                           0.99      1400
   macro avg       0.99      0.99      0.99      1400
weighted avg       0.99      0.99      0.99      1400

