# FastText

#### Biểu thức hàm mất mát
$$L = -\log(P(w_c|w_t)) - \sum_{n \in N_{t,c}}(1 - P(w_n|w_t)) = \log \left( 1 + e^{-s(w_t, w_c)} \right) + \sum_{n \in N_{t,c}} \log \left( 1 + e^{s(w_t, n)} \right)$$
$$s(w, c) = u_t^{\top} v_c = \sum_{g \in \mathcal{G}_w} z_g^{\top} v_c$$

#### Đạo hàm hàm mất mát (ví dụ theo $u_t$)

$$\dfrac{\partial L}{\partial u_t} = -v_c + \dfrac{v_c}{1 + \exp(-u_t^Tv_c)} + \sum_{n \in N_{t,c}}{(\dfrac{v_n}{1 + exp(-u_t^Tv_n)})}$$

#### Biểu thức xác suất
$$P((w_c | w_t) = \frac{e^{s(w_t, w_c)}}{\sum_{j=1}^{W} e^{s(w_t, j)}}\ \approx P(w_c | w_t) = \frac{1}{1 + \exp\left(-u_t^{\top} v_c\right)}
$$

In [1]:
import numpy as np
from collections import defaultdict
import random 
import re
import pandas as pd

In [5]:
def string_to_list(s):
    s = s.lower()
    cleaned_string = re.sub(r'[^a-zA-Z0-9 ]+', '', s)
    cleaned_string = ' '.join(cleaned_string.split())
    return cleaned_string

def pre_processing(corpus):
    sentences = []
    for sentence in corpus:
        sentences.append((string_to_list(sentence)))
    return sentences

In [67]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))
def generate_ngrams(word, n = 3):
    word = f"<{word}>"
    return [word[i : i + n] for i in range(len(word) - n + 1)]
def build_vocab(corpus, n = 3):
    vocab = defaultdict(int)
    word_to_ngrams = {}
    for sentence in corpus:
        words = sentence.split()
        for word in words:
            ngrams = generate_ngrams(word)
            word_to_ngrams[word] = ngrams
            for ngram in ngrams:
                vocab[ngram] += 1
    return {ngram : idx for idx, ngram in enumerate(vocab)}, word_to_ngrams
def fasttext_loss(v_w, v_c, negative_vectors):
    positive_score = np.dot(v_w, v_c)
    positive_loss = np.log(sigmoid(positive_score))

    negative_loss = 0
    for negative_vector in negative_vectors:
        negative_score = np.dot(v_w, negative_vector)
        negative_loss += np.log(sigmoid(-negative_score))
    return -(positive_loss + negative_loss)
def update_vectors(v_w, v_c, negative_vectors, lr = 0.025):
    grad_positive = sigmoid(np.dot(v_w, v_c)) - 1
    grad_v_w = lr * grad_positive * v_c
    v_c -= lr * grad_positive * v_w
    for negative_vector in negative_vectors:
        grad_negative = sigmoid(np.dot(v_w, negative_vector))
        grad_v_w += lr * grad_negative * negative_vector
        negative_vector -= lr * grad_negative * v_w
    v_w -= grad_v_w
    return v_w, v_c, negative_vectors
# def update_vectors(v_w, v_c, negative_vectors, lr = 0.025):
#     grad_positive = sigmoid(np.dot(v_w, v_c)) - 1
#     v_w -= lr * grad_positive * v_c
#     v_c -= lr * grad_positive * v_w

#     for negative_vector in negative_vectors:
#         grad_negative = sigmoid(np.dot(v_w, negative_vector))
#         negative_vector -= lr * grad_negative * v_w
#         v_w -= lr * grad_negative * negative_vector
#     return v_w, v_c, negative_vectors

In [68]:
def train_fasttext(corpus, vocab, vector_size = 300, window_size = 2, lr = 0.025, epochs = 100, negative_samples = 5):
    vocab_size = len(vocab)
    u_vectors = np.random.uniform(-0.5, 0.5, (vocab_size, vector_size))
    v_vectors = np.random.uniform(-0.5, 0.5, (vocab_size, vector_size))

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        for sentence in corpus:
            words = sentence.split()
            for i, word in enumerate(words):
                ngrams = generate_ngrams(word)
                word_idx = [vocab[ngram] for ngram in ngrams if ngram in vocab]
                for j in range(max(i - window_size, 0), min(i + window_size, len(words))):
                    if i != j:
                        context_word = words[j]
                        context_ngrams = generate_ngrams(context_word)
                        context_word_idx = [vocab[ngram] for ngram in context_ngrams if ngram in vocab]
                        negative_indices = [random.randint(0, vocab_size - 1) for _ in range(negative_samples)]

                        for word_id in word_idx:
                            for context_id in context_word_idx:
                                v_w = u_vectors[word_id]
                                v_c = v_vectors[context_id]
                                negative_vectors = [u_vectors[neg_id] for neg_id in negative_indices]
                                u_vectors[word_id], v_vectors[context_id], _ = update_vectors(v_w, v_c, negative_vectors, lr = lr)
        
    return u_vectors, v_vectors

In [69]:
def find_similar(word, u_vectors, word_to_ngrams, vocab, top_n = 5):
    word_ngrams = generate_ngrams(word)
    word_indices = [vocab[ngram] for ngram in word_ngrams if ngram in vocab]
    if not len(word_indices):
        print("Word not found in vocab!")
        return []
    word_vector = np.mean([u_vectors[idx] for idx in word_indices], axis = 0)
    similarities = {}
    for other_word, ngrams in word_to_ngrams.items():
        ngram_indices = [vocab[ngram] for ngram in ngrams if ngram in vocab]
        if len(ngram_indices) > 0:
            other_word_vector = np.mean([u_vectors[idx] for idx in ngram_indices], axis = 0)
            similarity = np.dot(word_vector, other_word_vector) / (np.linalg.norm(word_vector) * np.linalg.norm(other_word_vector))
            similarities[other_word] = similarity
    similar_words = sorted(similarities.items(), key = lambda item : item[1], reverse=True)[:top_n]
    return similar_words

In [72]:
# Tập dữ liệu
file_path = r".\FastText.xlsx"
df = pd.read_excel(file_path, header = None)
df = df.dropna()
corpus = pre_processing(df[0].tolist())

# Xây dựng từ điển n-gram và liên kết từ với các n-gram
vocab, word_to_ngrams = build_vocab(corpus)

# Huấn luyện mô hình FastText
u_vectors, v_vectors = train_fasttext(corpus, vocab)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [75]:
similar_words = find_similar("peacefully", u_vectors, word_to_ngrams, vocab)

# In ra các từ gần nghĩa nhất
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity:.4f}")

peace: 0.7034
really: 0.4339
who: 0.1998
simply: 0.1972
onward: 0.1714
