## nltk 영화 감상 corpus 기반 Word2Vec 사용법
### 출처: [데이터 사이언스 스쿨](https://datascienceschool.net/view-notebook/6927b0906f884a67b0da9310d3a581ee/)

### 단어 임베딩을 위한 Corpus 생성

In [1]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ckkim\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [2]:
from nltk.corpus import movie_reviews
sentences = [list(s) for s in movie_reviews.sents()]

In [3]:
movie_reviews.sents()

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]

In [5]:
type(movie_reviews.sents())

nltk.corpus.reader.util.ConcatenatedCorpusView

In [6]:
len(sentences)

71532

In [7]:
sentences[10000]

['means', 'so', 'f', '*', '*', '*', 'ing', 'earnest', '.']

In [8]:
movie_reviews.sents()[10000]

['means', 'so', 'f', '*', '*', '*', 'ing', 'earnest', '.']

### 코퍼스를 입력 인수로 하여 Word2Vec 클래스 객체를 생성

In [9]:
from gensim.models.word2vec import Word2Vec

In [11]:
# 트레이닝 과정
%time model = Word2Vec(sentences)

Wall time: 9.4 s


In [12]:
# 트레이닝 종료후 메모리 반환(unload)
model.init_sims(replace=True)

In [13]:
dir(model.wv)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_sims',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'most_similar',
 'most_similar_cosmul',
 'most_similar_to_given',
 'n_similarity',
 'rank',
 'relative_cosine_similarity',
 'save',
 'save_word2vec_for

### 사용가능한 메소드
#### - similarity : 두 단어의 유사도 계산

In [14]:
model.wv.similarity('actor', 'actress')

0.880257

In [15]:
model.wv.similarity('he', 'she')

0.86105645

In [16]:
model.wv.similarity('actor', 'she')

0.28079355

#### - most_similar : 가장 유사한 단어를 출력

In [17]:
model.wv.most_similar("accident")

[('automobile', 0.8803155422210693),
 ('plane', 0.8778402805328369),
 ('prison', 0.8708900213241577),
 ('radio', 0.8586621880531311),
 ('dying', 0.857754111289978),
 ('arrest', 0.8507620096206665),
 ('ball', 0.8472509384155273),
 ('church', 0.8426524996757507),
 ('deserted', 0.8412702083587646),
 ('suicide', 0.840549111366272)]

In [18]:
# she + (actor - actress)
model.wv.most_similar(positive=['she', 'actor'], negative='actress', topn=1)

[('he', 0.3168867230415344)]

## 네이버 영화 감상 코퍼스
raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

In [19]:
import codecs

def read_data(filename):
    with codecs.open(filename, encoding='utf-8', mode='r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data

train_data = read_data('data/02_naver_ratings_train.txt')

In [20]:
len(train_data)

150000

In [21]:
print(train_data[:5])

[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'], ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'], ['10265843', '너무재밓었다그래서보는것을추천한다', '0'], ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'], ['6483659', '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '1']]


In [22]:
from konlpy.tag import Okt
tagger = Okt()

def tokenize(doc):
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

train_docs = [row[1] for row in train_data]

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [23]:
# 시간이 매우 오래 걸림
%time sentences = [tokenize(d) for d in train_docs]

Wall time: 14min 9s


In [24]:
%time model = Word2Vec(sentences)
model.init_sims(replace=True)

Wall time: 15.3 s


In [25]:
model.wv.similarity(*tokenize(u'배우 여배우'))

0.7423995

In [26]:
model.wv.similarity(*tokenize(u'배우 남자'))

0.26371446

In [27]:
# 남자 + (여배우 - 배우) = 여자
from konlpy.utils import pprint
pprint(model.wv.most_similar(positive=tokenize(
    u'남자 여배우'), negative=tokenize(u'배우'), topn=1))

[('여자/Noun', 0.8521713018417358)]


In [28]:
# 아빠 + (남자 - 여자) = 엄마
pprint(model.wv.most_similar(positive=tokenize(
    u'아빠 남자'), negative=tokenize(u'여자'), topn=1))

[('엄마/Noun', 0.8817458152770996)]
