# Library 설치

## 1_Model 다운로드

In [None]:
import gensim.downloader as api

# Google's pretrained Word2Vec model. Loading it from Colab hit a connection
# error; running locally works without problems.
model = api.load('word2vec-google-news-300')

## 2_Word2Vec을 활용한 Document Embedding

In [None]:
import re
from gensim.utils import simple_preprocess

def get_word_vectors(sentence, model):
    """Return the embedding vector of every in-vocabulary token of a sentence.

    The sentence is tokenized with gensim's ``simple_preprocess``; tokens
    that are missing from the model's vocabulary are dropped.

    Args:
        sentence: Raw text to tokenize.
        model: Word-vector model exposing ``key_to_index`` and supporting
            ``model[word]`` lookup (e.g. gensim ``KeyedVectors``).

    Returns:
        A list with one vector per in-vocabulary token, in token order
        (repeated tokens are kept). Empty when no token is known to the
        model.
    """
    # ``key_to_index`` is a dict, so each membership test is a hash lookup.
    # The previous check against the ``index_to_key`` list scanned the whole
    # vocabulary for every token (and did so twice per token).
    vocab = model.key_to_index
    return [
        model[word]
        for word in simple_preprocess(sentence)
        if word in vocab
    ]

In [None]:
def get_document_embedding(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Raw text to embed.
        model: Word-vector model, passed through to ``get_word_vectors``.

    Returns:
        The element-wise mean of the word vectors, or ``None`` when
        ``get_word_vectors`` returns no vectors for the sentence.
    """
    # Imported locally: no visible cell imports numpy, so the bare ``np``
    # reference below would otherwise raise NameError.
    import numpy as np

    vectors = get_word_vectors(sentence, model)
    if not vectors:
        return None
    # axis=0 averages across words, keeping one value per vector dimension.
    return np.mean(vectors, axis=0)


In [None]:
sentence = 'The cat is sleeping on the bed.'
document_embedding = get_document_embedding(sentence, model)

# Keep only the tokens the model knows; they label the points in the plot.
# ``key_to_index`` is a dict (hash lookup), whereas testing against the
# ``index_to_key`` list scanned the entire vocabulary for every token.
words = [
    word
    for word in simple_preprocess(sentence)
    if word in model.key_to_index
]

### 시각화

In [None]:
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA


# Labels: the whole-sentence embedding first, then each in-vocabulary word.
ref_words = ["Sentence", *words]
# Vectors in the same order as the labels above.
vectors = [document_embedding, *(model[word] for word in words)]

# Project every vector onto its first two principal components for plotting.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

fig, ax = plt.subplots()
# Scatter all points, then write each label next to its point.
ax.scatter(vectors_2d[:, 0], vectors_2d[:, 1])
for label, point in zip(ref_words, vectors_2d):
    ax.annotate(label, point)

# Redraw the first point (the sentence embedding) in red to highlight it.
plt.scatter(vectors_2d[0, 0], vectors_2d[0, 1], color='red')
plt.show()

## 3_쿼리와 Mean Word Embedding

In [None]:
# Small corpus to compare against the query.
docs = [
    'The cat is sitting on the mat.',
    'The dog is lying on the rug.',
    'I love pizza and spaghetti for dinner.',
    'I like to drink coffee in the morning.',
]

# One mean-word-vector embedding per document.
docs_embed = [get_document_embedding(doc, model) for doc in docs]

qry = 'The cat is sleeping on the bed.'
# Wrapped in a list so it can be concatenated with the document embeddings.
qry_embed = [get_document_embedding(qry, model)]

# Plot labels and vectors, query first, then the documents in order.
ref_words = [qry, *docs]
vectors = [*qry_embed, *docs_embed]

### 시각화

In [None]:
# Project the query and document embeddings onto two principal components.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

fig, ax = plt.subplots()
# Scatter all points, then write each sentence next to its point.
ax.scatter(vectors_2d[:, 0], vectors_2d[:, 1])
for label, point in zip(ref_words, vectors_2d):
    ax.annotate(label, point)

# Redraw the first point (the query) in red to highlight it.
plt.scatter(vectors_2d[0, 0], vectors_2d[0, 1], color='red')
plt.show()