# 출처 : https://github.com/dongjun-Lee/transfer-learning-text-tf/

## 하이퍼 파라미터 및 각종 초기화

In [16]:
!pip install wget

import tensorflow as tf
import os
import wget
import tarfile
import re
from nltk.tokenize import word_tokenize
import collections
import pandas as pd
import pickle
import numpy as np
import os

# Hyper-parameters and file locations shared by the cells below.
MAX_DOCUMENT_LENGTH = 20  # every document is truncated/padded to this many token ids
EMBEDDING_SIZE = 256      # width of the word-embedding vectors
HIDDEN_UNITS = 128        # LSTM hidden size
BATCH_SIZE = 16
NUM_EPOCHS = 2
TRAIN_PATH = "dbpedia_csv/train.csv"
# Fixed: the directory separator was missing ("dbpedia_csvtest.csv"), so the
# test split could never be found next to the training split.
TEST_PATH = "dbpedia_csv/test.csv"
WORD_DICT_PATH = "word_dict.pickle"  # pickle cache of the vocabulary
TRAIN_PERCENT = 0.2       # fraction of the training rows that is sampled

import nltk
# Fetch the 'punkt' tokenizer data package up front; presumably this is what
# word_tokenize (imported above) needs at runtime -- confirm against NLTK docs.
nltk.download('punkt')

distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/seonghoonjung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 데이터 유틸리티 함수들

In [18]:
def download_dbpedia():
    """Download the DBpedia CSV archive and unpack it into the working directory.

    The archive is fetched with ``wget`` and extracted in place with
    ``tarfile``. Nothing is returned.
    """
    dbpedia_url = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'

    # wget.download() returns the path it actually wrote to. Use that rather
    # than a hard-coded name: if a file with the default name already exists,
    # wget stores the new download under a different (suffixed) name and the
    # hard-coded path would silently open the stale file instead.
    archive_path = wget.download(dbpedia_url)
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with a source you trust.
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall()
def clean_str(text):
    """Normalize raw text before tokenization.

    Characters other than ASCII letters, digits, parentheses, comma,
    exclamation mark, question mark, apostrophe, backtick and double quote
    are replaced by a space. Runs of two or more whitespace characters are
    then collapsed to a single space, and the result is stripped and
    lower-cased.
    """
    only_allowed = re.sub(r"[^A-Za-z0-9(),!?\'\`\"]", " ", text)
    single_spaced = re.sub(r"\s{2,}", " ", only_allowed)
    return single_spaced.strip().lower()
def build_word_dict():
    """Return the vocabulary as ``(word_dict, dict_word)``.

    ``word_dict`` maps token -> integer id and ``dict_word`` is the reverse
    mapping (id -> token). If ``WORD_DICT_PATH`` exists, the vocabulary is
    loaded from that pickle; otherwise it is built from the ``content``
    column of ``TRAIN_PATH`` and pickled to ``WORD_DICT_PATH``.
    """
    if os.path.exists(WORD_DICT_PATH):
        with open(WORD_DICT_PATH, "rb") as cache:
            word_dict = pickle.load(cache)
    else:
        train_df = pd.read_csv(TRAIN_PATH, names=["class", "title", "content"])

        tokens = [
            token
            for content in train_df["content"]
            for token in word_tokenize(clean_str(content))
        ]

        # Ids 0-3 are reserved for the special tokens.
        word_dict = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
        # most_common() yields tokens from most to least frequent, so more
        # frequent tokens receive smaller ids.
        for token, freq in collections.Counter(tokens).most_common():
            # Tokens seen only once are left out of the vocabulary.
            if freq > 1:
                word_dict[token] = len(word_dict)

        with open(WORD_DICT_PATH, "wb") as cache:
            pickle.dump(word_dict, cache)

    dict_word = {idx: token for token, idx in word_dict.items()}
    return word_dict, dict_word
def build_word_dataset(step, frac, word_dict, document_max_len):
    """Load a dataset split and encode it as fixed-length lists of token ids.

    Args:
        step: ``"train"`` reads ``TRAIN_PATH`` and keeps a random ``frac``
            share of its rows; any other value reads ``TEST_PATH`` and
            shuffles all of its rows.
        frac: fraction of rows to sample (only used for the train split).
        word_dict: token -> id mapping; must contain ``"<unk>"`` and ``"<pad>"``.
        document_max_len: length every encoded document is cut/padded to.

    Returns:
        ``(x, y)`` where ``x`` is a list of id lists of length
        ``document_max_len`` and ``y`` is the list of ``class`` values
        shifted down by one.
    """
    columns = ["class", "title", "content"]
    if step == "train":
        # sample() shuffles the rows and keeps only a `frac` share of them.
        df = pd.read_csv(TRAIN_PATH, names=columns).sample(frac=frac)
    else:
        df = pd.read_csv(TEST_PATH, names=columns).sample(frac=1.0)

    def encode(text):
        # Tokens missing from the vocabulary fall back to the <unk> id.
        ids = [word_dict.get(token, word_dict["<unk>"])
               for token in word_tokenize(clean_str(text))]
        ids = ids[:document_max_len]
        # Right-pad short documents with the <pad> id.
        return ids + (document_max_len - len(ids)) * [word_dict["<pad>"]]

    x = [encode(text) for text in df["content"]]
    y = [label - 1 for label in df["class"]]

    return x, y
def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield ``(inputs_batch, outputs_batch)`` pairs for ``num_epochs`` passes.

    Both sequences are converted to NumPy arrays and sliced in order (no
    shuffling). The last batch of an epoch may be shorter than ``batch_size``.
    """
    xs = np.array(inputs)
    ys = np.array(outputs)
    total = len(xs)

    # Ceiling division: a trailing partial batch still counts as a batch.
    batches_per_epoch = (total - 1) // batch_size + 1
    for _ in range(num_epochs):
        for index in range(batches_per_epoch):
            lo = index * batch_size
            hi = min(lo + batch_size, total)
            yield xs[lo:hi], ys[lo:hi]

## 데이터 확보

In [20]:
# Download the corpus only when its directory is not present yet.
if not os.path.exists('dbpedia_csv'):
    print("Downloading dbpedia dataset...")
    download_dbpedia()

# Vocabulary in both directions: token -> id and id -> token.
print('Build word dict')
word_dict, dict_word = build_word_dict()

# Encode a TRAIN_PERCENT sample of the training split as fixed-length id lists.
print('Build word dataset')
train_x, train_y = build_word_dataset(
    step="train",
    frac=TRAIN_PERCENT,
    word_dict=word_dict,
    document_max_len=MAX_DOCUMENT_LENGTH,
)

Build word dict
Build word dataset


In [21]:
# Decode one encoded training document back into tokens and print it with its label.
sample_sentence_idx = 0
sentence = ' '.join(map(dict_word.__getitem__, train_x[sample_sentence_idx]))
print(f'{sentence} -> {train_y[sample_sentence_idx]}')

kruszyn kraje ski kru n kra j ski ( german deutsch <unk> ) is a village in the administrative district -> 8


## AutoEncoder 모델 정의

In [22]:
# I have a dinner => <s> I have a dinner </s>

class AutoEncoder(object):
    """LSTM sequence autoencoder built as a TensorFlow 1.x graph.

    The encoder reads the input ids; its final state initialises the decoder,
    which is fed ``<s> + x`` and trained to emit ``x + </s>``.

    Attributes:
        x: int32 placeholder of token ids, shape [batch, max_length]. Only
            non-zero ids are counted towards the sequence length.
        decoder_input: ``x`` with a leading ``<s>`` column.
        decoder_output: ``x`` with a trailing ``</s>`` column.
        logits: decoder scores, shape [batch, max_length + 1, voca_size].
        loss: scalar training loss.
    """

    def __init__(self, word_dict, max_length, embedding_size, units, batch_size):
        """Build the graph in the current default graph.

        Args:
            word_dict: token -> id mapping; must contain '<s>' and '</s>'.
            max_length: number of token ids per input row.
            embedding_size: width of the embedding vectors.
            units: hidden size of the encoder and decoder LSTM cells.
            batch_size: stored on the instance for backward compatibility;
                the graph itself now follows the batch size of the fed data.
        """
        self.embedding_size = embedding_size
        self.voca_size = len(word_dict)
        self.units = units
        self.batch_size = batch_size

        self.x = tf.placeholder(tf.int32, [None, max_length])

        # A column of ones that takes its batch dimension from x at run time.
        # Building it from the fixed `batch_size` made tf.concat fail for any
        # batch of a different size (e.g. a shorter final batch).
        ones_column = tf.ones_like(self.x[:, :1])
        self.decoder_input = tf.concat([ones_column * word_dict['<s>'], self.x], axis=1)
        self.decoder_output = tf.concat([self.x, ones_column * word_dict['</s>']], axis=1)

        # sign() is 1 for every non-zero id, so the row sum is the number of
        # non-padding tokens.
        self.encoder_input_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.decoder_input_len = tf.reduce_sum(tf.sign(self.decoder_input), 1)

        with tf.variable_scope("embedding"):
            init_embeddings = tf.random_uniform([self.voca_size, self.embedding_size])
            embeddings = tf.get_variable("embedding", initializer=init_embeddings)
            encoder_input_emb = tf.nn.embedding_lookup(embeddings, self.x)
            decoder_input_emb = tf.nn.embedding_lookup(embeddings, self.decoder_input)

        with tf.variable_scope("rnn"):
            encoder_cell = tf.contrib.rnn.BasicLSTMCell(self.units)
            _, encoder_states = tf.nn.dynamic_rnn(encoder_cell, encoder_input_emb,
                                                  sequence_length=self.encoder_input_len,
                                                  dtype=tf.float32)
        with tf.variable_scope("decoder"):
            decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.units)
            # Start the decoder from the encoder's final state. Previously the
            # encoder state was computed but never used, so the decoder could
            # not see the input sentence and the encoder received no gradient.
            # shape of decoder outputs  = [batch_size, sequence_length, units]
            decoder_outputs, _ = tf.nn.dynamic_rnn(decoder_cell, decoder_input_emb,
                                                   sequence_length=self.decoder_input_len,
                                                   initial_state=encoder_states,
                                                   dtype=tf.float32)

        with tf.name_scope("output"):
            # shape = [batch_size, sequence_length, voca_size]
            self.logits = tf.layers.dense(decoder_outputs, self.voca_size)

        with tf.name_scope("loss"):
            # Positions past each row's decoder length get weight 0.
            # shape = [sequence_length, ]
            losses = tf.contrib.seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.decoder_output,
                weights=tf.sequence_mask(self.decoder_input_len, max_length + 1, dtype=tf.float32),
                average_across_timesteps=False,
                average_across_batch=True)

            self.loss = tf.reduce_mean(losses)

# Start from an empty default graph so re-running this cell rebuilds the model from scratch.
tf.reset_default_graph()
model = AutoEncoder(
    word_dict=word_dict,
    max_length=MAX_DOCUMENT_LENGTH,
    embedding_size=EMBEDDING_SIZE,
    units=HIDDEN_UNITS,
    batch_size=BATCH_SIZE,
)

## Autoencoder를 이용한 사전학습

In [None]:
PRETRAIN_SAVE_PATH = "pretrain_logs"


def pre_train(model, train_x, train_y, word_dict):
    """Train ``model`` on ``train_x`` by minimising ``model.loss``.

    Args:
        model: object exposing ``x`` (input placeholder) and ``loss``.
        train_x: encoded documents, fed to ``model.x`` in batches of BATCH_SIZE.
        train_y: labels; only passed through ``batch_iter`` and otherwise unused.
        word_dict: unused; kept so existing callers keep working.

    Side effects:
        Writes TensorBoard summaries to PRETRAIN_SAVE_PATH and saves a
        checkpoint under PRETRAIN_SAVE_PATH/model every 100 steps.
    """
    checkpoint_dir = os.path.join(PRETRAIN_SAVE_PATH, "model")
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, "model.ckpt")

    with tf.Session() as sess:
        global_step = tf.Variable(0, trainable=False)

        # Clip the global gradient norm at 5.0 before applying the update.
        params = tf.trainable_variables()
        gradients = tf.gradients(model.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)

        # Registers the scalar so that merge_all() below picks it up.
        tf.summary.scalar("loss", model.loss)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(PRETRAIN_SAVE_PATH, sess.graph)

        saver = tf.train.Saver(tf.global_variables())

        def train_step(batch_x):
            feed_dict = {model.x: batch_x}
            _, step, summaries, loss = sess.run([train_op, global_step, summary_op, model.loss],
                                                feed_dict=feed_dict)
            summary_writer.add_summary(summaries, step)
            return loss

        try:
            sess.run(tf.global_variables_initializer())

            batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS)
            # Same ceiling division batch_iter uses, so the printed total is an
            # integer that matches the number of steps actually run.
            num_batches_per_epoch = (len(train_x) - 1) // BATCH_SIZE + 1
            total_step = num_batches_per_epoch * NUM_EPOCHS
            for batch_x, _ in batches:
                loss = train_step(batch_x)
                step = tf.train.global_step(sess, global_step)

                if step == 1 or step % 20 == 0:
                    print(f'{step}/{total_step}, loss={loss}')
                if step % 100 == 0:
                    saver.save(sess, checkpoint_prefix, global_step=step)
        finally:
            # Flush and release the event file even if training is interrupted.
            summary_writer.close()


pre_train(model, train_x, train_y, word_dict)
        