In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
# pip install gensim
from gensim.models import Word2Vec

# ===== 1) Prepare data =====
sentences = [
    "there is an apple on the table",
    "an orange is on the desk",
    "i love natural language processing",
    "word embeddings capture semantics",
]
sentences = [s.split() for s in sentences]


# ===== 2) train Word2Vec：CBOW（sg=0） =====
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    negative=10,
    epochs=20,
    workers=4
)

# ===== 3) Use：vector / most similar  =====
wv = model.wv  # KeyedVectors
print("vector(dim=100) of 'apple':", wv["apple"][:8])

print("most similar to 'apple':", wv.most_similar("apple", topn=5))

# ===== 4) sentence vector Sample =====
import numpy as np
def sent_vec(tokens, wv):
    vecs = [wv[w] for w in tokens if w in wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(wv.vector_size)

v1 = sent_vec("there is an apple".split(), wv)
v2 = sent_vec("an orange is on the desk".split(), wv)
cos = float(np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2)+1e-9))
print("cosine(sent1, sent2) =", cos)

# ===== 5) Save / reload =====
wv.save_word2vec_format("cbow_vectors.txt")
# from gensim.models import KeyedVectors
# wv2 = KeyedVectors.load_word2vec_format("cbow_vectors.txt")

vector(dim=100) of 'apple': [ 8.1269052e-03 -4.4467677e-03 -1.0682202e-03  1.0020925e-03
 -1.8470736e-04  1.1437262e-03  6.1126892e-03 -4.3767650e-06]
most similar to 'apple': [('embeddings', 0.1642715334892273), ('on', 0.146149680018425), ('i', 0.050561174750328064), ('the', 0.04206450283527374), ('semantics', 0.03560653701424599)]
cosine(sent1, sent2) = 0.4240131287316909


In [3]:
import gensim.downloader as api

# 事前学習済み GloVe (50次元) をロード
glove = api.load("glove-wiki-gigaword-50")

print("ベクトル次元:", glove["cat"].shape)
print("cat vs dog 類似度:", glove.similarity("cat", "dog"))
print("king - man + woman ≈", glove.most_similar(positive=["king","woman"], negative=["man"], topn=1))


ベクトル次元: (50,)
cat vs dog 類似度: 0.9218005
king - man + woman ≈ [('queen', 0.8523604273796082)]


In [4]:
glove["cat"]

array([ 0.45281 , -0.50108 , -0.53714 , -0.015697,  0.22191 ,  0.54602 ,
       -0.67301 , -0.6891  ,  0.63493 , -0.19726 ,  0.33685 ,  0.7735  ,
        0.90094 ,  0.38488 ,  0.38367 ,  0.2657  , -0.08057 ,  0.61089 ,
       -1.2894  , -0.22313 , -0.61578 ,  0.21697 ,  0.35614 ,  0.44499 ,
        0.60885 , -1.1633  , -1.1579  ,  0.36118 ,  0.10466 , -0.78325 ,
        1.4352  ,  0.18629 , -0.26112 ,  0.83275 , -0.23123 ,  0.32481 ,
        0.14485 , -0.44552 ,  0.33497 , -0.95946 , -0.097479,  0.48138 ,
       -0.43352 ,  0.69455 ,  0.91043 , -0.28173 ,  0.41637 , -1.2609  ,
        0.71278 ,  0.23782 ], dtype=float32)