<a href="https://colab.research.google.com/github/joowan1108/Word2Vec/blob/main/Word2VecTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchtext==0.17.0
!pip install torch==2.2.0



In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The training script, config, and saved weights used below all live under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install 'portalocker>=2.0.0'



In [None]:
# Run the training script stored on Drive, passing the YAML config that sits next to it.
# Requires Drive to be mounted at /content/drive.
!python3 /content/drive/MyDrive/LLM/Word2Vec/train.py --config /content/drive/MyDrive/LLM/Word2Vec/config.yaml

Vocabulary size: 1980
Epoch: 1/20, Train Loss=5.28660, Val Loss=5.19919
Epoch: 2/20, Train Loss=5.16724, Val Loss=5.15734
Epoch: 3/20, Train Loss=5.14060, Val Loss=5.15378
Epoch: 4/20, Train Loss=5.13297, Val Loss=5.15552
Epoch: 5/20, Train Loss=5.12819, Val Loss=5.15686
Epoch: 6/20, Train Loss=5.12475, Val Loss=5.14903
Epoch: 7/20, Train Loss=5.11986, Val Loss=5.14419
Epoch: 8/20, Train Loss=5.11523, Val Loss=5.13924
Epoch: 9/20, Train Loss=5.11021, Val Loss=5.13653
Epoch: 10/20, Train Loss=5.10366, Val Loss=5.13615
Epoch: 11/20, Train Loss=5.09638, Val Loss=5.13941
Epoch: 12/20, Train Loss=5.09030, Val Loss=5.12035
Epoch: 13/20, Train Loss=5.08287, Val Loss=5.12824
Epoch: 14/20, Train Loss=5.07487, Val Loss=5.12471
Epoch: 15/20, Train Loss=5.06562, Val Loss=5.11480
Epoch: 16/20, Train Loss=5.05466, Val Loss=5.11645
Epoch: 17/20, Train Loss=5.04303, Val Loss=5.11075
Epoch: 18/20, Train Loss=5.02916, Val Loss=5.11445
Epoch: 19/20, Train Loss=5.01316, Val Loss=5.07166
Epoch: 20/20, Trai

In [None]:
import numpy as np
import pandas as pd
import torch
import sys
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Switch the working directory to the project folder on Drive; the relative
# "weights/..." paths used when loading the checkpoint resolve against it.
%cd /content/drive/MyDrive/LLM/Word2Vec

/content/drive/MyDrive/LLM/Word2Vec


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoints that come from a trusted source.
# The paths are relative, so the working directory must be the project folder.
model = torch.load("weights/skipgram_PennTreebank/model.pt", map_location=device)
# Presumably a torchtext Vocab (later cells call get_itos / lookup_token on it) — TODO confirm.
vocab = torch.load("weights/skipgram_PennTreebank/vocab.pt")

In [None]:
# Take the model's first parameter tensor as the embedding matrix
# (one row per vocabulary token) and convert it to a NumPy array.
embeddings = next(iter(model.parameters())).detach().cpu().numpy()

# Row-wise L2 normalization: the norms are kept as a column so the division
# broadcasts across every row.
norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** 0.5
embeddings_norm = embeddings / norms
embeddings_norm.shape


(1980, 300)

In [None]:
embeddings_df = pd.DataFrame(embeddings)

# Project the high-dimensional embeddings down to 2-D with t-SNE.
# t-SNE is stochastic, so the seed is fixed to make the layout reproducible
# across kernel restarts (the value itself is arbitrary).
tsne = TSNE(n_components=2, random_state=42)

# Label each projected row with its token; row i corresponds to vocabulary id i.
embeddings_df_trans = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    index=vocab.get_itos(),
)

# Boolean mask marking the tokens that consist only of numeric characters.
is_numeric = embeddings_df_trans.index.str.isnumeric()

In [None]:
# Numeric tokens are drawn in green, every other token in black.
color = np.where(is_numeric, "green", "black")

# A single text-only scatter trace: each token is rendered as a label placed
# at its 2-D t-SNE coordinates.
fig = go.Figure(
    data=[
        go.Scatter(
            x=embeddings_df_trans[0],
            y=embeddings_df_trans[1],
            mode="text",
            text=embeddings_df_trans.index,
            textposition="middle center",
            textfont=dict(color=color),
        )
    ]
)

# Save the interactive figure as a standalone HTML file on Drive.
fig.write_html("/content/drive/MyDrive/LLM/Word2Vec/word2vec_visualization.html")

In [None]:
def get_top_similar(word: str, topN: int = 10):
    """Return the ``topN`` vocabulary tokens closest to ``word`` by cosine similarity.

    Relies on two notebook globals: ``vocab`` (token <-> id lookup) and
    ``embeddings_norm`` (the row-wise L2-normalized embedding matrix), so the
    dot product between two rows is their cosine similarity.

    Args:
        word: Query token.
        topN: Maximum number of neighbours to return; values <= 0 give an
            empty dict.

    Returns:
        A dict mapping neighbour token -> cosine similarity, ordered from most
        to least similar and never containing ``word`` itself. Returns ``None``
        (after printing a notice) when ``word`` maps to id 0, which this
        notebook treats as out-of-vocabulary.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    # Rows are unit length, so this matrix-vector product yields the cosine
    # similarity between the query and every token in the vocabulary.
    word_vec = embeddings_norm[word_id].reshape(-1, 1)  # column vector
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Rank by descending cosine (higher cosine == more similar). A stable sort
    # keeps tied scores in vocabulary-id order so results are deterministic.
    ranked_ids = np.argsort(-dists, kind="stable")

    # Remove the query by id instead of assuming it is ranked first: with tied
    # or duplicate embedding rows the query is not guaranteed to be at rank 0.
    ranked_ids = ranked_ids[ranked_ids != word_id]
    topN_ids = ranked_ids[: max(topN, 0)]

    return {vocab.lookup_token(int(idx)): dists[idx] for idx in topN_ids}

In [None]:
# get_top_similar returns None for an out-of-vocabulary word; fall back to an
# empty mapping so the loop prints nothing instead of raising AttributeError.
for word, sim in (get_top_similar("know") or {}).items():
    print(f"{word}: {sim:.3f}")

think: 0.492
really: 0.410
tell: 0.408
you: 0.397
certainly: 0.348
sure: 0.345
want: 0.343
feel: 0.337
ca: 0.334
matter: 0.330


In [None]:
# Word-analogy probe: king - man + woman, built from the raw (un-normalized)
# embeddings of the three tokens.
emb1, emb2, emb3 = (embeddings[vocab[token]] for token in ("king", "man", "woman"))

# Scale the combined vector to unit length and shape it as a column so the
# product with the normalized matrix gives cosine similarities.
emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** 0.5
emb4 = (emb4 / emb4_norm).reshape(-1, 1)

dists = (embeddings_norm @ emb4).flatten()

# Five highest-scoring tokens; the query words themselves are not filtered out.
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print(f"{vocab.lookup_token(word_id)}: {dists[word_id]:.3f}")

woman: 0.589
thatcher: 0.201
<unk>: 0.198
hopes: 0.182
cases: 0.179
