# 設定中文環境

In [None]:
# Colab 進行matplotlib繪圖時顯示繁體中文
# 下載台北思源黑體並命名taipei_sans_tc_beta.ttf，移至指定路徑
# 這裡只是為了讓matplitlib能順利顯示中文的設定，不重要，看不懂可以略過
!wget -O TaipeiSansTCBeta-Regular.ttf https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager

fontManager.addfont('TaipeiSansTCBeta-Regular.ttf')
mpl.rc('font', family='Taipei Sans TC Beta')


# 使用gensim模型

In [None]:
import gensim.downloader as api
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the pretrained word2vec-google-news-300 embeddings through
# gensim's downloader. The original note puts the model at roughly
# 3.46 GB, so expect the first load to take "a little" while :D
PRETRAINED_MODEL_NAME = 'word2vec-google-news-300'
model = api.load(PRETRAINED_MODEL_NAME)

In [None]:
# The four words whose embedding vectors are examined in the cells below.
words = [
    'king',
    'queen',
    'man',
    'woman',
]

In [None]:
# Print the first five components of each word's embedding vector.
for word in words:
    leading_components = model[word][:5]
    print(f"{word}: {leading_components}")

# 將300維的向量降維成2維，方便視覺化

In [None]:
import matplotlib.cm as cm
from sklearn.manifold import TSNE

# Stack the word embeddings into one NumPy array, one row per word.
vectors = np.array([model[word] for word in words])

# Project the embeddings down to 2D with t-SNE so they can be plotted.
# NOTE(review): scikit-learn expects perplexity to be smaller than the
# number of samples; perplexity=2 works for the four words used here —
# confirm before shortening `words`.
tsne = TSNE(n_components=2, random_state=42, perplexity=2)
vectors_2d = tsne.fit_transform(vectors)

# One colour per word, sampled from the 'tab10' colormap (purely cosmetic).
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# matplotlib 3.7 and removed in 3.9.
num_words = len(words)
color_map = plt.get_cmap('tab10')
colors = [color_map(i / num_words) for i in range(num_words)]

# Plot.
plt.figure(figsize=(16, 10))

# Draw a line segment from the origin to each projected point.
for x, y in vectors_2d:
    plt.plot([0, x], [0, y], color='gray', alpha=0.8, linestyle='-')

# Draw the points themselves.
plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1],
            alpha=0.8, s=200,
            c=colors,  # per-word colours computed above
            edgecolors='black',  # black outline around each marker
            )

# Label each point with its word, in the matching colour.
for word, (x, y), color in zip(words, vectors_2d, colors):
    plt.annotate(word,
                 xy=(x, y),
                 xytext=(-25, 10),
                 textcoords='offset points',
                 color=color,
                 fontsize=19)

plt.xlabel('特徵1', fontsize=16)
plt.ylabel('特徵2', fontsize=16)
plt.grid(True, alpha=0.3)
# tight_layout adjusts spacing for artists that already exist, so it is
# called after everything has been drawn rather than on the empty figure.
plt.tight_layout()
plt.show()


# 計算各單字間的相似度

In [None]:
from itertools import combinations

# Print the model's similarity score for every unordered pair of words.
print("\n詞對相似度：")
# combinations(words, 2) yields each pair exactly once, in the same order
# as a nested index loop with j > i, without manual index bookkeeping.
for first, second in combinations(words, 2):
    similarity = model.similarity(first, second)
    print(f"{first} - {second}: {similarity:.3f}")


# 找出與king-man+woman最相近的單字

In [None]:
# Vector arithmetic for the analogy: king - man + woman.
king_vec, man_vec, woman_vec = (
    model[name] for name in ('king', 'man', 'woman')
)
result_vector = king_vec - man_vec + woman_vec

# Ask the model for the five entries closest to the resulting vector.
# NOTE(review): a raw vector (not word keys) is passed here, so the query
# words themselves are presumably not excluded from the results and
# 'king' may rank first — confirm against the gensim documentation.
most_similar = model.most_similar(positive=[result_vector], topn=5)

# Print each candidate with its score, three decimals.
print("與king - man + woman最接近的單字列表：")
for word, score in most_similar:
    print(f"{word}: {score:.3f}")
