In [None]:
import pandas as pd
# Tab-separated review file (a Naver movie training set, going by the file name);
# it is downloaded over the network each time this cell runs.
DATA_URL = "https://raw.githubusercontent.com/hongsukyi/Lectures/main/data/naver_movie_train.txt"
df = pd.read_csv(DATA_URL, sep="\t")

In [None]:
# Preview the first rows to confirm the file was parsed into columns.
df.head()

In [None]:
# Number of rows currently held in df.
print(len(df))

## 편의상 데이터 개수를 줄여주자.
- 전체 150,000개 중 앞에서부터 1,000개만 사용하자.

In [None]:
# Work on the leading rows only (a convenience subsample of the full file).
N_SAMPLES = 1000
df = df.iloc[:N_SAMPLES]
print(df.shape[0])

In [None]:
# --- Missing values ---------------------------------------------------------
# Empty strings are turned into NaN so that dropna() removes them together
# with real missing values. A non-inplace replace is used so the sliced frame
# from the previous cell is never mutated.
df = df.replace("", float("NaN")).dropna().reset_index(drop=True)
print('결측치 처리 이후:',len(df))

# --- Duplicates -------------------------------------------------------------
# Keep one row per distinct review text.
df = df.drop_duplicates(['document']).reset_index(drop=True)
print('중복 제거 이후:',len(df))

# --- Non-Hangul characters --------------------------------------------------
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
df['document'] = df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# Reviews that contained no Hangul at all are now blank; drop them so the
# printed count reflects this step.
df = df[df['document'].str.strip() != ""].reset_index(drop=True)
print('한글 아닌 문자 제거 이후:',len(df))

# --- Short tokens -----------------------------------------------------------
# Keep only whitespace-separated tokens longer than two characters.
# NOTE(review): this discards every 1-2 character word, which covers many
# common Korean words - confirm the threshold is intended.
df['document'] = df['document'].apply(lambda x: ' '.join([token for token in x.split() if len(token) > 2]))
# Reviews made up solely of short tokens are now empty; remove them.
df = df[df['document'].str.strip() != ""].reset_index(drop=True)
print('리뷰 길이가 짧은 것 제거 :',len(df))


In [None]:
!pip install konlpy
from konlpy.tag import Okt

In [None]:
# Stopword list: tokens that are filtered out of the tokenized reviews.
stopwords = [
    '의', '가', '이', '은', '들', '는',
    '좀', '잘', '걍', '과', '도', '를',
    '으로', '자', '에', '와', '한', '하다',
]

In [None]:
# Keep only the review texts, as a plain Python list.
# NOTE(review): this rebinds `df` from a DataFrame to a list, so running the
# cell a second time fails - a separate name would make it re-runnable.
df = df['document'].tolist()

In [None]:
# Container for the tokenized reviews and the Okt analyzer that produces them.
df_token = list()
okt = Okt()

In [None]:
# Tokenize each review with Okt (stem=True asks the analyzer for stemmed
# morphemes) and filter out the stopwords.
# df_token is assigned from scratch instead of appended to, so re-running this
# cell cannot duplicate the reviews already collected.
df_token = [
    [word for word in okt.morphs(sentence, stem=True) if word not in stopwords]
    for sentence in df
]

In [None]:
# Number of tokenized reviews.
len(df_token)

In [None]:
# Inspect the first five tokenized reviews.
print(df_token[:5])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Token count of every review, computed once and shared by the statistics
# and the histogram below.
review_lengths = [len(review) for review in df_token]

print('리뷰의 최대 길이 :',max(review_lengths))
print('리뷰의 평균 길이 :',sum(review_lengths)/len(df_token))

# Distribution of review lengths (in tokens).
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
import gensim      # gensim provides the Word2Vec implementation used below
from gensim.models import Word2Vec

# Kept as the last expression of the cell so the notebook actually displays it.
gensim.__version__

In [None]:
# Dimensionality of each learned word vector.
embedding_dim = 100

# Train Word2Vec on the tokenized reviews.
model = Word2Vec(
    sentences=df_token,
    vector_size=embedding_dim,
    window=5,
    min_count=5,
    workers=4,
    sg=0,  # sg=0 selects CBOW, sg=1 selects Skip-gram
)

In [None]:
# Trained word vectors and the list of words they are keyed by.
word_vectors = model.wv
vocabs = list(word_vectors.key_to_index)

In [None]:
# Print each entry returned by most_similar for the query word, one per line.
neighbours = model.wv.most_similar("배우")
for neighbour in neighbours:
    print(neighbour)

In [None]:
# Similarity score between the two given words according to the trained vectors.
pair_similarity = model.wv.similarity('연기', '눈물')
print(pair_similarity)

In [None]:
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Collect the vector of every vocabulary word, in vocabulary order.
word_vector_list = []
for word in vocabs:
    word_vector_list.append(word_vectors[word])

In [None]:
# Project the word vectors with t-SNE.
# random_state is fixed because t-SNE is stochastic: without it the layout
# changes on every run and the figure cannot be reproduced.
tsne = TSNE(learning_rate=100, random_state=42)
word_vector_list = np.array(word_vector_list)
transformed = tsne.fit_transform(word_vector_list)

# First and second embedding coordinates, used as the plot axes.
x_axis_tsne = transformed[:, 0]
y_axis_tsne = transformed[:, 1]

def plot_tsne_graph(vocabs, x_asix, y_asix):
    """Scatter-plot the given points and annotate each one with its label.

    Args:
        vocabs: Labels, one per point, in the same order as the coordinates.
        x_asix: x coordinates, indexable by position.
        y_asix: y coordinates, indexable by position.
    """
    plt.figure(figsize=(30, 30))
    plt.scatter(x_asix, y_asix, marker='o')
    for position, label in enumerate(vocabs):
        point = (x_asix[position], y_asix[position])
        plt.annotate(label, xy=point)

# Draw every vocabulary word at its t-SNE coordinates.
plot_tsne_graph(vocabs, x_axis_tsne, y_axis_tsne)