# 스크래핑한 데이터를 인공지능이 학습하도록 전처리합니다.

설치시 필요한 것
- konlpy
- gensim


In [1]:
from konlpy.tag import Twitter
from os import makedirs
import gensim
import tensorflow as tf
import codecs
import os


import numpy as np
import pandas as pd



In [2]:
TRAIN_DATA = "../data/ratings_train.txt"
TEST_DATA = "../data/ratings_test.txt"
MODEL_DATA = "../data/word2vec.model"
WORD2VEC_PATH = "../data/word2vec"

In [None]:
def read_data(filename):    
    with open(filename, 'r',encoding='utf-8') as fp:
        data = [line.split('\t') for line in fp.read().splitlines()]        
        data = data[1:]   # header 제외 #    
    return data

In [None]:
train_data = read_data(TRAIN_DATA)
test_data = read_data(TEST_DATA)

# 1. 품사 태깅

In [None]:
pos_tagger = Twitter()

def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

## id, review, label

In [None]:
train_data[1]

## 품사 태깅 (Twitter)

In [None]:
tokens = [tokenize(row[1]) for row in train_data]
tokens[:2]

# 2. 워드 임베딩
 - min_alpha : training 됨에 따라 줄어든 learning_rate
 - alpha : initial learning rate
 - learning_rate decay 해줌
 

In [None]:
model = gensim.models.Word2Vec(size=300, sg = 1, alpha=0.025, min_alpha=0.025, seed=1234)
model.build_vocab(tokens)

for epoch in range(30):           
    model.train(tokens,model.corpus_count,epochs = model.iter)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

In [None]:
model.save(MODEL_DATA)
model.most_similar('영화/Noun',topn = 20)  ## topn = len(model.wv.vocab)

# 3. 결과 확인 (Tensor board 로 보기)

In [3]:
from tensorflow.contrib.tensorboard.plugins import projector

In [4]:
view_model = gensim.models.Word2Vec.load(MODEL_DATA)
view_model.most_similar('영화/Noun', topn = 20)

  


[('../Punctuation', 0.9061493873596191),
 ('하다/Verb', 0.8798155784606934),
 ('이/Josa', 0.8764169216156006),
 ('없다/Adjective', 0.8203310966491699),
 ('들/Suffix', 0.807205080986023),
 ('판타지영화/Noun', -0.31531190872192383),
 ('뮤지컬영화/Noun', -0.3252159059047699),
 ('멜로영화/Noun', -0.3374733328819275),
 ('남아돌다/Verb', -0.33805131912231445),
 ('5~6/Number', -0.3385058343410492),
 ('폰부스/Noun', -0.3392675518989563),
 ('겄/Noun', -0.3432907164096832),
 ('잉여/Noun', -0.3456770181655884),
 ('만인/Noun', -0.35029393434524536),
 ('손해/Noun', -0.3512357473373413),
 ('화만/Noun', -0.35157400369644165),
 ('방학/Noun', -0.35220322012901306),
 ('만화영화/Noun', -0.3524870276451111),
 ('공포물/Noun', -0.352843701839447),
 ('호러영화/Noun', -0.35320329666137695)]

In [5]:
max_size = len(view_model.wv.vocab)-1
w2v = np.zeros((max_size, view_model.layer1_size))

if not os.path.exists('../data/word2vec'):
    makedirs('../data/word2vec')
    
with codecs.open('../data/word2vec/metadata.tsv','w+',encoding='utf8') as fp:
    for i,word in enumerate(view_model.wv.index2word[:max_size]):
        w2v[i] = view_model.wv[word]
        fp.write(word + "\n")

  


In [6]:
sess = tf.InteractiveSession()
##  Create 2D tensor called embedding that holds our embeddings ##  
with tf.device("/cpu:0"):
    embedding = tf.Variable(w2v, trainable = False,  name = 'embedding')   

tf.global_variables_initializer().run() 

path = WORD2VEC_PATH

saver = tf.train.Saver()
writer = tf.summary.FileWriter(path, sess.graph)

In [7]:
## adding into project
config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.tensor_name = 'embedding'
embed.metadata_path = 'metadata.tsv'

In [8]:
# Specify the width and height of a single thumbnail.
projector.visualize_embeddings(writer, config)
saver.save(sess, path + '/model.ckpt' , global_step=max_size)
## cmd 실행후 -> 1. cd [소스실행경로]\data
##              2. tensorboard --logdir=./word2vec 입력

'../data/word2vec/model.ckpt-15408'