## nltk 영화 감상 corpus 기반 Word2Vec 사용법

### 단어 임베딩을 위한 Corpus 생성

In [1]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ckkim\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews
sentences = [list(s) for s in movie_reviews.sents()]

In [10]:
dir(movie_reviews)

['CorpusView',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_add',
 '_c2f',
 '_delimiter',
 '_encoding',
 '_f2c',
 '_file',
 '_fileids',
 '_get_root',
 '_init',
 '_map',
 '_para_block_reader',
 '_pattern',
 '_read_para_block',
 '_read_sent_block',
 '_read_word_block',
 '_resolve',
 '_root',
 '_sent_tokenizer',
 '_tagset',
 '_unload',
 '_word_tokenizer',
 'abspath',
 'abspaths',
 'categories',
 'citation',
 'encoding',
 'ensure_loaded',
 'fileids',
 'license',
 'open',
 'paras',
 'raw',
 'readme',
 'root',
 'sents',
 'unicode_repr',
 'words']

In [26]:
movie_reviews.sents()

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.'], ...]

In [27]:
len(movie_reviews.sents())

71532

In [24]:
len(sentences)

71532

In [28]:
sentences[10000]

['means', 'so', 'f', '*', '*', '*', 'ing', 'earnest', '.']

In [29]:
movie_reviews.sents()[10000]

['means', 'so', 'f', '*', '*', '*', 'ing', 'earnest', '.']

### 코퍼스를 입력 인수로 하여 Word2Vec 클래스 객체를 생성

In [31]:
from gensim.models.word2vec import Word2Vec

In [37]:
# 트레이닝 과정
%%time
model = Word2Vec(sentences)

Wall time: 9.82 s


In [38]:
# 트레이닝 종료후 메모리 반환(unload)
model.init_sims(replace=True)

In [44]:
dir(model.wv)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_sims',
 'load',
 'load_word2vec_format',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'most_similar',
 'most_similar_cosmul',
 'most_similar_to_given',
 'n_similarity',
 'rank',
 'relative_cosine_similarity',
 'save',
 'save_word2vec_for

### 사용가능한 메소드
#### - similarity : 두 단어의 유사도 계산

In [39]:
model.wv.similarity('actor', 'actress')

0.85201997

In [40]:
model.wv.similarity('he', 'she')

0.8603546

In [41]:
model.wv.similarity('actor', 'she')

0.24902333

#### - most_similar : 가장 유사한 단어를 출력

In [42]:
model.wv.most_similar("accident")

[('prison', 0.8606554269790649),
 ('investigation', 0.8370742797851562),
 ('criminal', 0.8351860046386719),
 ('evening', 0.831620991230011),
 ('automobile', 0.8315479755401611),
 ('orphaned', 0.8310778141021729),
 ('abandoned', 0.8300754427909851),
 ('egg', 0.8284278512001038),
 ('radio', 0.8260754346847534),
 ('affair', 0.8249633312225342)]

In [45]:
# she + (actor - actress)
model.wv.most_similar(positive=['she', 'actor'], negative='actress', topn=1)

[('he', 0.30622610449790955)]

## 네이버 영화 감상 코퍼스
raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

In [46]:
import codecs

def read_data(filename):
    with codecs.open(filename, encoding='utf-8', mode='r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]   # header 제외
    return data

train_data = read_data('data/naver_ratings_train.txt')

In [49]:
len(train_data)

150000

In [50]:
print(train_data[:5])

[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'], ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'], ['10265843', '너무재밓었다그래서보는것을추천한다', '0'], ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'], ['6483659', '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '1']]


In [51]:
from konlpy.tag import Okt
tagger = Okt()

def tokenize(doc):
    return ['/'.join(t) for t in tagger.pos(doc, norm=True, stem=True)]

train_docs = [row[1] for row in train_data]

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [52]:
%%time

sentences = [tokenize(d) for d in train_docs]

Wall time: 14min 5s


In [54]:
%%time

model = Word2Vec(sentences)
model.init_sims(replace=True)

Wall time: 15.1 s


In [55]:
model.wv.similarity(*tokenize(u'배우 여배우'))

0.7214186

In [56]:
model.wv.similarity(*tokenize(u'배우 남자'))

0.27602446

In [57]:
# 남자 + (여배우 - 배우) = 여자
from konlpy.utils import pprint
pprint(model.wv.most_similar(positive=tokenize(
    u'남자 여배우'), negative=tokenize(u'배우'), topn=1))

[('여자/Noun', 0.8157773017883301)]


In [61]:
# 아빠 + (남자 - 여자) = 엄마
pprint(model.wv.most_similar(positive=tokenize(
    u'아빠 남자'), negative=tokenize(u'여자'), topn=1))

[('엄마/Noun', 0.8785457611083984)]
