<a href="https://colab.research.google.com/github/hongsukyi/Lectures/blob/main/Lectures/%EB%94%A5%EB%9F%AC%EB%8B%9D_%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC%20/%EC%8B%A4%EC%8A%B5/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import urllib.request
import pandas as pd

# Download ratings.txt from the e9t/nsmc GitHub repository into the working directory.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt",
    filename="ratings.txt",
)

In [None]:
# Load the downloaded ratings.txt into a DataFrame (pd.read_table parses tab-separated text by default).
train_dataset = pd.read_table('ratings.txt')

In [None]:
# Preview the first five rows of the loaded data.
train_dataset[:5]

In [None]:
# Print the number of rows read from ratings.txt.
print(len(train_dataset))

## 편의상 데이터 개수를 줄여주자.
- 전체 200,000개 중 앞의 15,000개만 사용하자.

In [None]:
# Keep only the first 15,000 rows (a smaller subset, for convenience).
train_dataset = train_dataset[:15000]

In [None]:
# Print the row count again to confirm the truncation.
print(len(train_dataset))

In [None]:
# Missing values: treat empty strings as NaN, then drop every incomplete row.
# No inplace=True here: train_dataset may be a slice of a larger frame, and
# mutating a slice in place can trigger SettingWithCopyWarning.
train_dataset = train_dataset.replace("", float("NaN")).dropna().reset_index(drop=True)

# Remove rows whose 'document' text duplicates an earlier row.
train_dataset = train_dataset.drop_duplicates(['document']).reset_index(drop=True)

# Strip every character that is not Hangul (jamo or syllable) or a space.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
train_dataset['document'] = train_dataset['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

# Drop short tokens: keep only whitespace-separated tokens longer than 2 characters.
train_dataset['document'] = train_dataset['document'].apply(lambda x: ' '.join([token for token in x.split() if len(token) > 2]))

# Keep only reviews that are longer than 10 characters AND contain more than 5 tokens
# (i.e. filter out rows with length <= 10 or token count <= 5).
train_dataset = train_dataset[train_dataset.document.apply(lambda x: len(str(x)) > 10 and len(str(x).split()) > 5)].reset_index(drop=True)


In [None]:
# Print how many rows remain after cleaning and filtering.
print(len(train_dataset))

In [None]:
# Preview the first five rows of the cleaned data.
train_dataset[:5]

In [None]:
# Show the (rows, columns) shape of train_dataset.
train_dataset.shape

In [None]:
!pip install konlpy
from konlpy.tag import Okt

In [None]:
# Stopwords to filter out of every tokenized review.
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']

# NOTE: this rebinds train_dataset from a DataFrame to a plain list of review
# strings, so this cell cannot be re-run without re-running the cells above.
train_dataset = list(train_dataset['document'])

# Okt morphological analyzer used for tokenization.
okt = Okt()

# One token list per review: morphs with stem=True, minus stopwords.
tokenized_data = []
for review_text in train_dataset:
    morphemes = okt.morphs(review_text, stem=True)
    tokenized_data.append([morph for morph in morphemes if morph not in stopwords])


In [None]:
# Number of tokenized reviews.
len(tokenized_data)

In [None]:
# Print the token lists of the first five reviews.
print(tokenized_data[:5])

In [None]:
import matplotlib.pyplot as plt
!pip install koreanize-matplotlib
import koreanize_matplotlib

In [None]:
# Token count of each review, computed once and reused for the stats and the plot.
review_lengths = [len(review) for review in tokenized_data]

print('리뷰의 최대 길이 :', max(review_lengths))
print('리뷰의 평균 길이 :', sum(review_lengths) / len(review_lengths))

# Distribution of review lengths (in tokens).
plt.hist(review_lengths, bins=50)
plt.xlabel('샘플 길이(length of samples)')
plt.ylabel('샘플 수(number of samples)')
plt.show()

In [None]:
import gensim      # gensim provides a Python implementation of Word2Vec.
from gensim.models import Word2Vec

# Show the installed gensim version. It must be the last expression of the
# cell: a notebook only displays the value of a cell's final expression.
gensim.__version__

In [None]:
# Dimensionality of the learned word vectors.
embedding_dim = 100

# Train Word2Vec on the tokenized reviews.
# sg=0 selects CBOW; sg=1 would select Skip-gram.
model = Word2Vec(
    sentences=tokenized_data,
    vector_size=embedding_dim,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [None]:
# Keyed word vectors of the trained model.
word_vectors = model.wv
# Vocabulary words: iterating the key_to_index mapping yields its keys.
vocabs = list(word_vectors.key_to_index)

In [None]:
# Print each (word, score) pair returned by most_similar for "배우".
neighbors = model.wv.most_similar("배우")
for neighbor in neighbors:
    print(neighbor)

In [None]:
# Print the similarity score the trained model assigns to the pair '슬픔' / '눈물'.
print(model.wv.similarity('슬픔', '눈물'))

In [None]:
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Look up the vector of every vocabulary word, in the same order as vocabs.
word_vector_list = [word_vectors[token] for token in vocabs]

In [None]:
# Project the word vectors with t-SNE. random_state is fixed because t-SNE is
# stochastic: without a seed, every run produces a different layout.
tsne = TSNE(learning_rate = 100, random_state = 42)
word_vector_list = np.array(word_vector_list)
transformed = tsne.fit_transform(word_vector_list)

# Split the embedding into its first two coordinate columns for plotting.
x_axis_tsne = transformed[:, 0]
y_axis_tsne = transformed[:, 1]

def plot_tsne_graph(vocabs, x_asix, y_asix):
    """Draw a scatter plot of points and label each point with its word.

    Args:
        vocabs: Iterable of labels; the i-th label annotates the i-th point.
        x_asix: Indexable sequence of x coordinates.
        y_asix: Indexable sequence of y coordinates.
    """
    plt.figure(figsize=(30, 30))
    plt.scatter(x_asix, y_asix, marker='o')
    # Write each word next to the point it belongs to.
    for idx, word in enumerate(vocabs):
        point = (x_asix[idx], y_asix[idx])
        plt.annotate(word, xy=point)

# Plot every vocabulary word at its t-SNE coordinates.
plot_tsne_graph(vocabs, x_axis_tsne, y_axis_tsne)
