## 감정사전을 이용한 감정 분석
- 화장품 특성 단어 --> 수작업으로 사전 구축
- 감정사전을 활용한 리뷰 레이블 태깅
- Modeling
    - 나이브 베이즈 모형
    - doc2vec
        - 모델 평가: Logistic Regression

In [129]:
from utility import *
import pandas as pd
import numpy as np
from konlpy.tag import Twitter
from time import time
import pickle
import os

### 화장품 특성 단어 --> 수작업으로 감정 단어 사전 구축

In [34]:
# Load the sentiment word dictionary.
def read_data_comma(filename, encoding='utf-8'):
    """Read a ``', '``-separated text file and return its rows without the header.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the Korean entries decode the same way on every platform instead
            of depending on the locale's default encoding.

    Returns:
        A list with one entry per line after the first line; each entry is
        the list of fields obtained by splitting that line on ``', '``.
    """
    with open(filename, 'r', encoding=encoding) as f:
        rows = [line.split(', ') for line in f.read().splitlines()]
    # The first line is treated as a header and dropped.
    return rows[1:]

In [46]:
words_dic = read_data_comma('data/cosmetic_words_labled.txt')

In [175]:
words_dic[:10]

[['좋다/Adjective', '1'],
 ['있다/Adjective', '1'],
 ['같다/Adjective', '1'],
 ['바르다/Verb', '1'],
 ['향/Noun', '1'],
 ['않다/Verb', '0'],
 ['없다/Adjective', '0'],
 ['자다/Verb', '1'],
 ['너무/Noun', '0'],
 ['때/Noun', '0']]

In [45]:
words_dic[0][0] == '대비/Noun'

False

In [48]:
def pos_neg_dic(words_dic):
    """Partition labelled dictionary entries into positive and negative lists.

    Args:
        words_dic: Iterable of entries whose second element is the label
            string; ``'1'`` marks a positive word and any other value is
            treated as negative.

    Returns:
        A ``(positives, negatives)`` tuple of lists holding the original
        entries in input order.
    """
    entries = list(words_dic)
    positives = [entry for entry in entries if entry[1] == '1']
    negatives = [entry for entry in entries if not (entry[1] == '1')]
    return positives, negatives

In [49]:
pos_dic, neg_dic = pos_neg_dic(words_dic)

In [51]:
len(pos_dic), len(neg_dic)

(286, 253)

In [57]:
pos_dic[0][0], neg_dic[0][0]

('좋다/Adjective', '않다/Verb')

In [65]:
# Keep only the token text of each entry; the label column has already been
# used to split the dictionary and is dropped here.
pos_dic = [entry[0] for entry in pos_dic]
neg_dic = [entry[0] for entry in neg_dic]

In [4]:
review_docs = load_pickle('data/review_docs.pickle')

In [60]:
len(review_docs)

21222

In [62]:
review_docs[100]

['꾸다/Verb',
 '덕이/Noun',
 '거나/Noun',
 '끈/Noun',
 '적임/Noun',
 '이/Josa',
 '없다/Adjective',
 '좋다/Adjective',
 '템/Noun',
 '이지만/Josa',
 '바다/Noun',
 '가다/Verb',
 '팔다리/Noun',
 '에/Josa',
 '몇번/Noun',
 '바르다/Verb',
 '나니/Noun',
 '사라지다/Verb',
 '버리다/Verb',
 '양은/Noun',
 '적어도/Adverb',
 '재다/Verb',
 '매다/Verb',
 '할꾸얌/Noun']

### 감정사전을 활용한 리뷰 레이블 태깅

In [76]:
# Score every review with the sentiment dictionary:
#     polarity = (pos - neg) / (pos + neg)
# where pos / neg count the review's tokens found in each word list.
# Membership is tested against sets built once up front; testing against the
# lists would rescan the whole dictionary for every token of every review.
pos_words = set(pos_dic)
neg_words = set(neg_dic)

polarity_list = []
for review in review_docs:
    pos = 0
    neg = 0
    for word in review:
        # A word present in both lists is counted as positive only.
        if word in pos_words:
            pos += 1
        elif word in neg_words:
            neg += 1

    if pos == neg:
        # Ties score 0; this also covers reviews without any dictionary word,
        # where the ratio below would divide by zero.
        pol = 0
    else:
        pol = (pos - neg) / (pos + neg)
    polarity_list.append(pol)

In [77]:
len(polarity_list)

21222

In [113]:
# Binarise the polarity scores: strictly positive -> 1, everything else
# (negative or exactly zero) -> 0, and count how many reviews fall in each class.
sentiment = [1 if pol > 0 else 0 for pol in polarity_list]
pos_n = sum(sentiment)
neg_n = len(sentiment) - pos_n

In [118]:
pos_n, neg_n, (pos_n+neg_n)

(13079, 8143, 2360)

In [121]:
polarity_list[:10]

[0,
 1.0,
 -0.5384615384615384,
 0.42857142857142855,
 -0.5,
 -0.09090909090909091,
 0.8461538461538461,
 0.14285714285714285,
 0.2727272727272727,
 0.5]

In [122]:
sentiment[:10]

[0, 1, 0, 1, 0, 0, 1, 1, 1, 1]

In [123]:
# Pair the raw review text with the dictionary-derived sentiment labels.
cosmetic = pd.read_csv('data/cosmetics_reviews_final.csv')
reviews = cosmetic['review']
review_labled = pd.DataFrame({'review': reviews, 'sentiment': sentiment})

In [128]:
review_labled.head(20)

Unnamed: 0,review,sentiment
0,트리플어피치 쓰는데 음 발색도 별로 텍스쳐가 정말 뭐라하지 너무 기름진데 촉촉...,0
1,치크로만 쓰는데 굉장히 여리여리라게 발색됨,1
2,색은 둘째치고 틴트가 지속력이 없음 올려도 지워지니 살 필요가 없다 립브러쉬 쓰...,0
3,루즈G 립스틱이랑 세트인 컴팩트 일단 컴팩트 디자인이 엄청 다양해요 심플한 검...,1
4,164 쓰는데 고급진 핑크에요 발림성이 너무 맘에 들었어요 지속은 좀 아쉬웠고 ...,0
5,원래 립스틱타입 쓰다가 매장언니 추천으로 샀는데 이가격에도 쓰게되는 애증의 아이템임...,0
6,여름이라 팔꿈치랑 복숭아뼈 같은부위는 되게 자주 스크럽하는 편인데 꽤 깨끗하게 잘 ...,1
7,케이스 이뻐서 샀는데 너무 촉촉함 지속력이 너무 없다 그래도 꿋꿋이 발랐었어요,1
8,상큼한 레몬향 촉촉하고 산뜻하게 보습이 돼요 겨울에는 좀 부족할 듯 다른 립밤들보다...,1
9,케이스 귀엽 향은 진짜 좋음 이것만 바르면 목이 따갑고 먼지 먹은 느낌 정말 싫음,1


In [134]:
review_labled.to_csv('data/review_labled.txt', sep=',', index=False, encoding='utf-8')

## Modeling

In [138]:
import codecs  # NOTE(review): no longer used in this cell; kept in case later cells rely on the name
import csv

# Parse with the csv module instead of line.split(','): DataFrame.to_csv
# quotes any field containing the separator, a quote character or a line
# break, and a plain split would tear such a review into extra columns.
with open("data/review_labled.txt", encoding='utf-8', newline='') as f:
    data = list(csv.reader(f))
    data = data[1:]   # skip the header row

In [140]:
from pprint import pprint
pprint(data[1])

['치크로만 쓰는데 굉장히 여리여리라게 발색됨', '1']


In [141]:
# Transpose the rows once into column tuples: column 0 holds the review text,
# column 1 the label strings (converted to an integer array).
columns = list(zip(*data))
X = columns[0]
y = np.array(columns[1], dtype=int)

##### 학습용 데이터와 테스트용 데이터 분할

In [147]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state = 2018)

In [150]:
X_test[:10]

['자극 장난아님 겁나 뜨거움 그리고 때처럼 밀어내는거라 민감성은 안쓰는게 좋다고 봄  바디용으로 쓰다가 그것도 성질나서 버림',
 '저는 친구한테 추천받아서 샀어요  일단 매트한 질감이 아니라 몰랑몰랑 굳은 연고 같아요 그래서 발림성은 최고   일단  애교살에  발라주기 짱인거 같아요 부담스러운 굵은 입자에 반짝이가 아니라서 눈밑에 바르면 은은하게 반짝거려요   또 섀도우로는 기본 눈 메이컵 다  하고  눈두덩이 중간부분에 포인트로 펄감을 주기위해  살짝 올리는것도 이쁘더라구요  또 양이 많아서   오랜기간동안 써도 안줄어요     아참  붓으로 바르는것보단 손으로 바르는게 훨씬 좋아요   음 아쉬운건 조금  뭉침이 있다는거',
 '루나솔 발색도 좋고 지속성도 좋아요  여러색을 한번에 사용할수있는 장점이 있어요  디올쓰다가 바꿨어요  펄 입자도 크지않고 은은해서 평소 사용하기좋고 색상도 자주사용하는 색으로 모아놔서 굿굿 저는 이것만 써요',
 '처음 쓸 때에는 향이 너무 이상해서 코를 막고 썼는데 쓰다 보니 익숙해진다  오히려 좋은 것 같기도 하고  보습이 잘 되면서 흡수가 빨라 끈적이지 않는다  내가 싫어하는 물감 용기에 물감 뚜껑인 게 아쉽다',
 '약간의 끈적임이 있는 콧물 형태지만 얼굴에 바르고 톡톡 두드려주다보면 금방 스며들어요 엄마에게 선물해드렸는데 좋아하셨어요',
 '별로    드라마틱한 효과 전혀없음 그래도 5일인데',
 '유분이 많아서 저한테는 좀 별로였던거 같아요',
 '샴페인 색상 쓰는데 언더에 바르기 좋음 반짝반짝',
 '마른머리에 웨이브 1도 안잡아줘서 조금 젖은 머리에 하니까 웨이브 약간 잡혀용',
 '하이류']

In [151]:
y_test[:10]

array([0, 1, 1, 1, 1, 0, 1, 1, 0, 0])

### 나이브 베이즈 모형
##### - CountVectorizer

In [142]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Token counts (default CountVectorizer settings) fed to multinomial naive Bayes.
model1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
])

In [152]:
%%time 
model1.fit(X_train, y_train)

CPU times: user 793 ms, sys: 36 ms, total: 829 ms
Wall time: 834 ms


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [153]:
print(classification_report(y_test, model1.predict(X_test)))

             precision    recall  f1-score   support

          0       0.86      0.54      0.66      1619
          1       0.77      0.95      0.85      2626

avg / total       0.80      0.79      0.78      4245



##### - TF-IDF 방법 사용 : TfidfVectorizer

In [154]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same classifier as model1, but with TF-IDF weights instead of raw counts.
model2 = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('mb', MultinomialNB()),
])

In [159]:
%%time 
model2.fit(X_train, y_train)

CPU times: user 807 ms, sys: 30.2 ms, total: 837 ms
Wall time: 839 ms


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [160]:
print(classification_report(y_test, model2.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      0.23      0.37      1619
          1       0.68      1.00      0.81      2626

avg / total       0.79      0.70      0.64      4245



##### - Twitter 형태소 분석기 사용  

In [157]:
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize_pos(doc):
    """Tokenize ``doc`` with the module-level ``pos_tagger``.

    ``pos_tagger.pos`` is called with ``norm=True`` and ``stem=True``; every
    item it returns is joined with ``'/'`` into a single string.

    Args:
        doc: Text to analyse.

    Returns:
        A list with one ``'/'``-joined string per item returned by the tagger.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

In [158]:
# Counts over the tokens produced by tokenize_pos, fed to multinomial naive Bayes.
model3 = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

In [161]:
%%time 
model3.fit(X_train, y_train)

CPU times: user 1min 5s, sys: 979 ms, total: 1min 6s
Wall time: 57.7 s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_pos at 0x130338bf8>, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [162]:
print(classification_report(y_test, model3.predict(X_test)))

             precision    recall  f1-score   support

          0       0.81      0.72      0.76      1619
          1       0.84      0.90      0.87      2626

avg / total       0.83      0.83      0.83      4245



In [163]:
# TF-IDF weights over the tokens produced by tokenize_pos.
model4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

In [164]:
%%time 
model4.fit(X_train, y_train)

CPU times: user 54.6 s, sys: 556 ms, total: 55.1 s
Wall time: 55.9 s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [165]:
print(classification_report(y_test, model4.predict(X_test)))

             precision    recall  f1-score   support

          0       0.95      0.27      0.42      1619
          1       0.69      0.99      0.81      2626

avg / total       0.79      0.72      0.66      4245



##### - (1,2)-gram 사용 

In [166]:
# TF-IDF weights over unigrams and bigrams of the tokenize_pos tokens.
model5 = Pipeline(steps=[
    ('vect', TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1, 2))),
    ('mb', MultinomialNB()),
])

In [167]:
%%time 
model5.fit(X_train, y_train)

CPU times: user 53.1 s, sys: 490 ms, total: 53.5 s
Wall time: 53.4 s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [168]:
print(classification_report(y_test, model5.predict(X_test)))

             precision    recall  f1-score   support

          0       1.00      0.12      0.21      1619
          1       0.65      1.00      0.79      2626

avg / total       0.78      0.66      0.57      4245



In [169]:
# Counts over unigrams and bigrams of the tokenize_pos tokens.
model6 = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_pos, ngram_range=(1, 2))),
    ('mb', MultinomialNB()),
])

In [170]:
%%time 
model6.fit(X_train, y_train)

CPU times: user 54.6 s, sys: 521 ms, total: 55.2 s
Wall time: 55.7 s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_pos at 0x130338bf8>, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [171]:
print(classification_report(y_test, model6.predict(X_test)))

             precision    recall  f1-score   support

          0       0.93      0.60      0.73      1619
          1       0.80      0.97      0.88      2626

avg / total       0.85      0.83      0.82      4245



##### --> model6 (CountVectorizer, 형태소분석, (1,2)-gram 사용)의 평균 precision(0.85)이 가장 높음. 단, 평균 f1-score는 model3(0.83)이 model6(0.82)보다 약간 높음.

### doc2vec

In [176]:
from collections import namedtuple

# Lightweight record with two fields, `words` and `tags`, in that order.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags'])

In [186]:
train_set, test_set = train_test_split(data, test_size=0.25, random_state=2018)

In [187]:
print(len(train_set))      # nrows: 15916 (75% of the data)
print(len(test_set))       # nrows: 5306 (25% of the data)

15916
5306


In [188]:
# Tokenize each review (row[0]) and pair the tokens with the row's label (row[1]).
# NOTE(review): `tokenize` is not defined in this file; presumably it is
# provided by `from utility import *` — confirm.
train_docs = [(tokenize(row[0]), row[1]) for row in train_set]
test_docs = [(tokenize(row[0]), row[1]) for row in test_set]

In [191]:
len(train_docs), len(test_docs)

(15916, 5306)

In [197]:
# Wrap every (tokens, label) pair in a TaggedDocument; the label is used as the
# document's only tag.
tagged_train_docs = [
    TaggedDocument(words=tokens, tags=[label]) for tokens, label in train_docs
]
tagged_test_docs = [
    TaggedDocument(words=tokens, tags=[label]) for tokens, label in test_docs
]

In [192]:
# Parallel processing: number of CPUs, used below as the Doc2Vec worker count.
import multiprocessing
cores = multiprocessing.cpu_count()
cores

4

In [193]:
# Print log messages (INFO level and above) with a timestamp prefix.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [195]:
# Vectorize tokens (word embedding): configure the Doc2Vec model.
from gensim.models import doc2vec

doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,             # PV-DBOW / default 1
    dbow_words=1,     # w2v simultaneous with DBOW d2v / default 0
    window=8,         # distance between the predicted word and context words
    vector_size=300,  # vector size (`size` is the deprecated name of this argument)
    alpha=0.025,      # learning-rate
    seed=1234,
    min_count=20,     # ignore with freq lower
    min_alpha=0.025,  # min learning-rate (same as alpha)
    workers=cores,    # multi cpu
    hs=1,             # hierarchical softmax / default 0
    negative=10,      # negative sampling / default 5
)

2018-08-01 13:28:16,793 : INFO : 'pattern' package not found; tag filters are not available for English


In [198]:
# Build the vocabulary from the tagged training documents.
doc_vectorizer.build_vocab(tagged_train_docs)
str(doc_vectorizer)

2018-08-01 13:32:24,389 : INFO : collecting all words and their counts
2018-08-01 13:32:24,391 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-08-01 13:32:24,637 : INFO : PROGRESS: at example #10000, processed 393403 words (1603857/s), 12912 word types, 2 tags
2018-08-01 13:32:24,739 : INFO : collected 15708 word types and 2 unique tags from a corpus of 15916 examples and 632490 words
2018-08-01 13:32:24,740 : INFO : Loading a fresh vocabulary
2018-08-01 13:32:24,986 : INFO : effective_min_count=20 retains 2462 unique words (15% of original 15708, drops 13246)
2018-08-01 13:32:24,987 : INFO : effective_min_count=20 leaves 587746 word corpus (92% of original 632490, drops 44744)
2018-08-01 13:32:24,997 : INFO : deleting the raw counts dictionary of 15708 items
2018-08-01 13:32:24,999 : INFO : sample=0.001 downsamples 59 most-common words
2018-08-01 13:32:24,999 : INFO : downsampling leaves estimated 470187 word corpus (80.0% of prior 587746)
2018-08-

'Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4)'

In [201]:
# Check the training parameters: corpus size and epochs per train() call.
# `epochs` replaces the deprecated `iter` attribute and holds the same value.
doc_vectorizer.corpus_count, doc_vectorizer.epochs

  """Entry point for launching an IPython kernel.


(15916, 5)

In [202]:
# Train the document vectors.
# NOTE(review): gensim's documentation recommends a single train() call with
# the total epoch count and library-managed alpha decay; the manual schedule
# below is kept unchanged so results stay comparable with the recorded run.
start = time()
for epoch in range(10):
    # `epochs` replaces the deprecated `iter` attribute (same value).
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay
end = time()
print("During Time: {}".format(end-start))

  after removing the cwd from sys.path.
2018-08-01 13:36:14,828 : INFO : training model with 4 workers on 2462 vocabulary and 300 features, using sg=1 hs=1 sample=0.001 negative=10 window=8
2018-08-01 13:36:16,168 : INFO : EPOCH 1 - PROGRESS: at 8.06% examples, 28769 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:36:17,280 : INFO : EPOCH 1 - PROGRESS: at 20.65% examples, 40818 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:36:18,530 : INFO : EPOCH 1 - PROGRESS: at 33.35% examples, 43663 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:36:19,922 : INFO : EPOCH 1 - PROGRESS: at 45.97% examples, 43742 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:36:21,051 : INFO : EPOCH 1 - PROGRESS: at 58.87% examples, 45695 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:36:22,204 : INFO : EPOCH 1 - PROGRESS: at 71.41% examples, 46832 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:36:23,346 : INFO : EPOCH 1 - PROGRESS: at 83.56% examples, 47738 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:36:24,514 : I

2018-08-01 13:37:15,242 : INFO : EPOCH 1 - PROGRESS: at 33.35% examples, 31515 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:37:16,322 : INFO : EPOCH 1 - PROGRESS: at 39.74% examples, 30977 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:37:17,482 : INFO : EPOCH 1 - PROGRESS: at 45.97% examples, 30260 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:37:18,490 : INFO : EPOCH 1 - PROGRESS: at 55.57% examples, 32085 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:37:19,797 : INFO : EPOCH 1 - PROGRESS: at 65.34% examples, 32510 words/s, in_qsize 8, out_qsize 1
2018-08-01 13:37:21,083 : INFO : EPOCH 1 - PROGRESS: at 77.53% examples, 34266 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:37:22,319 : INFO : EPOCH 1 - PROGRESS: at 89.59% examples, 35780 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:37:22,717 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-08-01 13:37:22,946 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-08-01 13:37:22,988 : INFO : w

2018-08-01 13:38:16,609 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-08-01 13:38:16,714 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-08-01 13:38:16,715 : INFO : EPOCH - 1 : training on 632490 raw words (485899 effective words) took 10.4s, 46523 effective words/s
2018-08-01 13:38:17,973 : INFO : EPOCH 2 - PROGRESS: at 8.00% examples, 30640 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:38:19,323 : INFO : EPOCH 2 - PROGRESS: at 20.75% examples, 38354 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:38:20,618 : INFO : EPOCH 2 - PROGRESS: at 33.35% examples, 41398 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:38:21,917 : INFO : EPOCH 2 - PROGRESS: at 45.97% examples, 42799 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:38:23,130 : INFO : EPOCH 2 - PROGRESS: at 58.87% examples, 44273 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:38:24,355 : INFO : EPOCH 2 - PROGRESS: at 71.41% examples, 45182 words/s, in_qsize 8, out_qsize 0
2018-08-01 1

2018-08-01 13:39:15,938 : INFO : EPOCH 2 - PROGRESS: at 83.60% examples, 47154 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:39:16,952 : INFO : EPOCH 2 - PROGRESS: at 95.33% examples, 48145 words/s, in_qsize 3, out_qsize 1
2018-08-01 13:39:16,956 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-08-01 13:39:17,199 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-08-01 13:39:17,206 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-08-01 13:39:17,222 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-08-01 13:39:17,223 : INFO : EPOCH - 2 : training on 632490 raw words (486060 effective words) took 9.9s, 49133 effective words/s
2018-08-01 13:39:18,436 : INFO : EPOCH 3 - PROGRESS: at 7.92% examples, 31761 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:39:19,719 : INFO : EPOCH 3 - PROGRESS: at 20.65% examples, 40123 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:39:20,892 : INFO : EPOCH 3 - PROGRESS: at

2018-08-01 13:40:12,979 : INFO : EPOCH 3 - PROGRESS: at 45.97% examples, 45472 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:40:14,230 : INFO : EPOCH 3 - PROGRESS: at 58.87% examples, 46207 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:40:15,462 : INFO : EPOCH 3 - PROGRESS: at 71.41% examples, 46791 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:40:16,694 : INFO : EPOCH 3 - PROGRESS: at 83.56% examples, 47199 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:40:17,858 : INFO : EPOCH 3 - PROGRESS: at 95.33% examples, 47436 words/s, in_qsize 3, out_qsize 1
2018-08-01 13:40:17,859 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-08-01 13:40:17,937 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-08-01 13:40:18,007 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-08-01 13:40:18,025 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-08-01 13:40:18,026 : INFO : EPOCH - 3 : training on 632490 raw words (4

2018-08-01 13:41:08,743 : INFO : EPOCH 4 - PROGRESS: at 8.00% examples, 30799 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:41:10,014 : INFO : EPOCH 4 - PROGRESS: at 20.65% examples, 39781 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:41:11,256 : INFO : EPOCH 4 - PROGRESS: at 33.41% examples, 42944 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:41:12,505 : INFO : EPOCH 4 - PROGRESS: at 45.97% examples, 44448 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:41:13,768 : INFO : EPOCH 4 - PROGRESS: at 58.87% examples, 45266 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:41:14,975 : INFO : EPOCH 4 - PROGRESS: at 71.41% examples, 46128 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:41:16,153 : INFO : EPOCH 4 - PROGRESS: at 83.56% examples, 46910 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:41:17,185 : INFO : EPOCH 4 - PROGRESS: at 95.33% examples, 47826 words/s, in_qsize 3, out_qsize 1
2018-08-01 13:41:17,186 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-08-01 13:

2018-08-01 13:42:07,637 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-08-01 13:42:07,655 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-08-01 13:42:07,671 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-08-01 13:42:07,672 : INFO : EPOCH - 4 : training on 632490 raw words (486062 effective words) took 10.0s, 48705 effective words/s
2018-08-01 13:42:08,917 : INFO : EPOCH 5 - PROGRESS: at 8.00% examples, 30947 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:42:10,158 : INFO : EPOCH 5 - PROGRESS: at 20.61% examples, 40277 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:42:11,389 : INFO : EPOCH 5 - PROGRESS: at 33.35% examples, 43487 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:42:12,554 : INFO : EPOCH 5 - PROGRESS: at 45.97% examples, 45645 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:42:13,786 : INFO : EPOCH 5 - PROGRESS: at 58.87% examples, 46486 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:42:15,027 : INFO : 

2018-08-01 13:43:04,518 : INFO : EPOCH 5 - PROGRESS: at 71.41% examples, 46960 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:43:05,756 : INFO : EPOCH 5 - PROGRESS: at 83.56% examples, 47287 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:43:06,794 : INFO : EPOCH 5 - PROGRESS: at 95.33% examples, 48149 words/s, in_qsize 3, out_qsize 1
2018-08-01 13:43:06,795 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-08-01 13:43:06,975 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-08-01 13:43:07,035 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-08-01 13:43:07,074 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-08-01 13:43:07,076 : INFO : EPOCH - 5 : training on 632490 raw words (486246 effective words) took 9.9s, 49081 effective words/s
2018-08-01 13:43:07,077 : INFO : training on a 3162450 raw words (2430809 effective words) took 49.4s, 49170 effective words/s
2018-08-01 13:43:07,082 : INFO : traini

2018-08-01 13:44:00,001 : INFO : training model with 4 workers on 2462 vocabulary and 300 features, using sg=1 hs=1 sample=0.001 negative=10 window=8
2018-08-01 13:44:01,095 : INFO : EPOCH 1 - PROGRESS: at 8.00% examples, 35353 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:44:02,220 : INFO : EPOCH 1 - PROGRESS: at 20.61% examples, 45179 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:44:03,383 : INFO : EPOCH 1 - PROGRESS: at 33.27% examples, 47816 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:44:04,511 : INFO : EPOCH 1 - PROGRESS: at 45.97% examples, 49430 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:44:05,686 : INFO : EPOCH 1 - PROGRESS: at 58.87% examples, 50021 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:44:06,861 : INFO : EPOCH 1 - PROGRESS: at 71.41% examples, 50388 words/s, in_qsize 8, out_qsize 0
2018-08-01 13:44:07,993 : INFO : EPOCH 1 - PROGRESS: at 83.60% examples, 50875 words/s, in_qsize 7, out_qsize 0
2018-08-01 13:44:08,957 : INFO : worker thread finished; awaiting f

During Time: 519.625382900238


In [207]:
# Save the doc2vec model.
model_name = 'model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model'
# Create the target directory first so save() does not fail when it is missing.
os.makedirs(os.path.dirname(model_name), exist_ok=True)
doc_vectorizer.save(model_name)

2018-08-01 14:00:53,424 : INFO : saving Doc2Vec object under model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model, separately None
2018-08-01 14:00:53,636 : INFO : saved model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model


In [208]:
# Load the doc2vec model back from the file saved under model_name.
doc_vectorizer = doc2vec.Doc2Vec.load(model_name)

2018-08-01 14:00:57,792 : INFO : loading Doc2Vec object from model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model
2018-08-01 14:00:57,922 : INFO : loading vocabulary recursively from model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model.vocabulary.* with mmap=None
2018-08-01 14:00:57,927 : INFO : loading trainables recursively from model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model.trainables.* with mmap=None
2018-08-01 14:00:57,929 : INFO : loading wv recursively from model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model.wv.* with mmap=None
2018-08-01 14:00:57,930 : INFO : loading docvecs recursively from model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model.docvecs.* with mmap=None
2018-08-01 14:00:57,931 : INFO : loaded model/Doc2Vec(dbow+w,d300,n10,hs,w8,mc20,s0.001,t4).model


In [210]:
# Explore the doc2vec model: words most similar to the given token.
print(doc_vectorizer.wv.most_similar('촉촉하다/Adjective'))

[('좋다/Adjective', 0.38780662417411804), ('속/Noun', 0.3300017714500427), ('땡기다/Verb', 0.3296898901462555), ('보습/Noun', 0.3006742596626282), ('도/Josa', 0.30008891224861145), ('촉촉/Noun', 0.2879345417022705), ('은/Josa', 0.2849486470222473), ('가볍다/Adjective', 0.2838784158229828), ('미스트/Noun', 0.27317193150520325), ('향/Noun', 0.26660752296447754)]


  if np.issubdtype(vec.dtype, np.int):


In [211]:
print(doc_vectorizer.wv.most_similar('건조하다/Adjective'))

[('건성/Noun', 0.30199867486953735), ('푸석/Adverb', 0.29866117238998413), ('땡기다/Verb', 0.291866272687912), ('세안/Noun', 0.2816868722438812), ('당기다/Verb', 0.2759491205215454), ('바르다/Verb', 0.26370900869369507), ('입술/Noun', 0.2558313310146332), ('지성/Noun', 0.2551003694534302), ('부족하다/Adjective', 0.2455301135778427), ('피부/Noun', 0.24286693334579468)]


  if np.issubdtype(vec.dtype, np.int):


##### - 모델 평가 : Logistic Regression

In [214]:
# Infer a vector for every training document with the trained model.
X_train = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
# Each document carries exactly one tag (its label). Unwrap it so y is a flat
# list rather than a list of one-element lists, which scikit-learn would have
# to flatten itself with a warning.
y_train = [doc.tags[0] for doc in tagged_train_docs]

In [215]:
# Infer a vector for every test document with the trained model.
X_test = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
# Each document carries exactly one tag (its label); unwrap it so y is a flat
# list rather than a list of one-element lists.
y_test = [doc.tags[0] for doc in tagged_test_docs]

In [216]:
len(X_train), len(y_train), len(X_test), len(y_test)

(15916, 15916, 5306, 5306)

In [217]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=2018)

In [218]:
# Fit the classifier on the inferred document vectors and report the elapsed
# wall-clock time in seconds.
start = time()
clf.fit(X_train, y_train)
end = time()
print('Time: {:f}s'.format(end-start))

  y = column_or_1d(y, warn=True)


Time: 1.059430s


In [220]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call consistent with other metrics.
print("테스트 정확도: {:.3f}".format(accuracy_score(y_test, y_pred)))

테스트 정확도: 0.835
