# 전처리

In [133]:
import csv
import MeCab
import pandas as pd
import re
from gensim.models import doc2vec

## 전처리를 위한 학습데이터셋 로드

In [97]:
train_docs=pd.read_csv('./out_put1.csv',encoding="cp949")
train_docs=train_docs.dropna()
del train_docs["Unnamed: 0"]

train_docs.tail()

Unnamed: 0,comment,episode_num,sub_title,sympathy_count,title
111285,1등,1,프롤로그,7,열정호구
111286,첫댓!!,1,프롤로그,6,열정호구
111287,오,1,프롤로그,3,열정호구
111288,첫플?,1,프롤로그,4,열정호구
111289,???,1,프롤로그,3,열정호구


In [130]:
pos_ptn=re.compile(r'.+\t[NVM][A-Z]{1,2}') # pos tagging 패턴
remove_ptn=re.compile(r'[^a-zㄱ-힣]') # 불용어 패턴
mecab=MeCab.Tagger()  # mecab 사전 정의


## pos tagging

In [99]:
def pos_tagging(doc):
    doc=remove_ptn.sub(' ', doc)
    parse_doc=mecab.parse(doc)
    pos_doc=pos_ptn.finditer(parse_doc)
    
    return [x[0].replace('\t','\\') for x in pos_doc]

In [100]:
pos_docs = [pos_tagging(doc) for doc in train_docs['comment']]
train_docs['prep_docs'] = pos_docs
train_docs = train_docs.dropna()
idx=[indx for indx in range(len(train_docs)) ]
train_docs["idx"]=idx
train_docs.index = range(len(train_docs))
train_docs.tail()

Unnamed: 0,comment,episode_num,sub_title,sympathy_count,title,prep_docs,idx
111285,1등,1,프롤로그,7,열정호구,[등\NNG],111285
111286,첫댓!!,1,프롤로그,6,열정호구,"[첫\MM, 댓\NR]",111286
111287,오,1,프롤로그,3,열정호구,[오\NR],111287
111288,첫플?,1,프롤로그,4,열정호구,"[첫\MM, 플\NNG]",111288
111289,???,1,프롤로그,3,열정호구,[],111289


In [101]:
train_docs.iloc[10].tolist()

['각혈..?', 63, '62화 다짐', 0, '열정호구 ', ['각혈\\NNG'], 10]

## Dco2vec 입력객체 생성

In [102]:
def doc2vec_labeler(docs):
    labeled_docs= list()
    for i in range(len(docs)):
        doc=docs.iloc[i].tolist()
        words=doc[5]   #prep_docs
        labels=[str(doc[6])]  # 문서번호 idx
        sentence=doc2vec.TaggedDocument(words=words,tags=labels)
        labeled_docs.append(sentence)
    return labeled_docs

In [103]:
train_labeled_docs=doc2vec_labeler(train_docs)

In [104]:
train_labeled_docs[100]

TaggedDocument(words=['망\\NNG', '미안\\NNG', '널\\NP', '잊\\VV', '있\\VX'], tags=['100'])

## 모델 하이퍼파라미터 설정 및 학습

- DBOW 모델 사용

In [137]:
model.iter

5

In [139]:
model=doc2vec.Doc2Vec(window=5,size=100,alpha=0.1,min_alpha=0.01,dm=0,
                     min_count=3,workers=4)
model.build_vocab(train_labeled_docs)

In [141]:
model.train(train_labeled_docs,total_examples=model.corpus_count,epochs=10)

11833324

## 학습결과 확인

### 단어 유사도 확인

- word order를 무시한것이라 단어벡터는 직관과 어긋나는 결과를 보임

In [142]:
model.wv.most_similar('좁쌀\\NNG')

[('쥬\\NNP', 0.40914279222488403),
 ('엉덩이\\NNG', 0.40528446435928345),
 ('뻥\\MAG', 0.39052295684814453),
 ('초래\\NNG', 0.35559284687042236),
 ('한\\NNP', 0.33940592408180237),
 ('대할\\VV', 0.3247053027153015),
 ('들으니\\VV', 0.32371944189071655),
 ('한발\\NNG', 0.32245540618896484),
 ('비례\\NNG', 0.32105886936187744),
 ('상의\\NNG', 0.3195545971393585)]

## 테스트 데이터셋 로드 및 pos tagging

In [108]:
test_docs=train_docs[:800]
idx=[indx for indx in range(len(test_docs)) ]
test_docs["idx"]=idx
test_docs.index=range(len(test_docs))
test_docs.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,comment,episode_num,sub_title,sympathy_count,title,prep_docs,idx
795,분량?,64,63화 5개월,0,열정호구,[분량\NNG],795
796,외주가 문제냐 병원 좀 가라 하시는데\n병원비는 어쩔 거고 병원 가느라 외주 못 해...,64,63화 5개월,1,열정호구,"[외주\NNG, 문제\NNG, 냐\VCP, 병원\NNG, 좀\MAG, 가\VV, 하...",796
797,결핵은 아니것지 설마 ㅜㅜ,64,63화 5개월,0,열정호구,"[결핵\NNG, 아니\VCN, 설마\MAG]",797
798,그놈의 돈 시바.. 난 진짜 부자 아니면 자식 안가져야지,64,63화 5개월,0,열정호구,"[그놈\NP, 돈\NNG, 시바\NNG, 난\NP, 진짜\MAG, 부자\NNG, 아...",798
799,너 갑자기 왜죽냐?,64,63화 5개월,0,열정호구,"[너\NP, 갑자기\MAG, 왜\MAG, 죽\VV]",799


In [128]:
def get_pred_result(doc,train_docs,topn=5,verbose=False):
    doc_content=doc[5]
    doc_label=doc[6]
    doc_episode=doc[1]
    infer_vector=model.infer_vector(doc_content)
    
    
    similar_res = [x for x in model.docvecs.most_similar([infer_vector], topn=topn)]
    similar_ids = [x[0] for x in similar_res]
    similar_scores=[x[1] for x in similar_res]
    similar_docs=train_docs[train_docs['idx'].isin(similar_ids)][["comment", "episode_num"]]
    similar_docs["score"]=similar_scores
    
    if verbose:
        print('input content:', doc[0],'\nlabel:', doc_label)
        similar_docs.columns = ['most similar documents', 'episode', 'similarity score']
        return similar_docs

In [129]:
get_pred_result(test_docs.iloc[650],train_docs,verbose=True)

input content: 이름이 생망이라고 진짜 생망하게 하지 말았으면 좋겠다 
label: 650


Unnamed: 0,most similar documents,episode,similarity score
650,이름이 생망이라고 진짜 생망하게 하지 말았으면 좋겠다,64,0.799823
101281,야 은지작가가 그렇게좋으면 결혼해 이 멍멍이자슥아1,3,0.780192
101373,니 은지 좋아하냐?,3,0.776412
102428,편집자 은지 좋아함?ㅋㅋㅋ은지씨 하면서 볼이 붉어져ㅋㅋㅋ,3,0.773035
104620,편집장 은지 좋아한다에 1표,4,0.771794
