# 출처 : https://github.com/dongjun-Lee/transfer-learning-text-tf/

## 하이퍼 파라미터 및 각종 초기화

In [23]:
!pip install wget

import tensorflow as tf
import os
import wget
import tarfile
import re
from nltk.tokenize import word_tokenize
import collections
import pandas as pd
import pickle
import numpy as np
import os

# Every document is truncated / padded to this many token ids.
MAX_DOCUMENT_LENGTH = 20
# Width of each word-embedding vector.
EMBEDDING_SIZE = 256
# Size of the LSTM hidden state (the `units` argument of WordRNN).
HIDDEN_UNITS = 128
# Mini-batch size used for both training and evaluation batches.
BATCH_SIZE = 16
# Number of passes over the training data.
NUM_EPOCHS = 1
# Number of target classes; labels in the CSV are 1-based and shifted to 0-based on load.
NUM_CLASS = 14
# Locations of the extracted DBpedia CSV files.
TRAIN_PATH = "dbpedia_csv/train.csv"
TEST_PATH = "dbpedia_csv/test.csv"
# Pickle cache of the word -> id vocabulary.
WORD_DICT_PATH = "word_dict.pickle"
# Fraction of the training CSV that is sampled for training.
TRAIN_PERCENT = 0.2
# Directory holding the pretrained checkpoint (read from its "model" sub-directory).
PRETRAIN_SAVE_PATH = "pretrain_logs"
# Directory for training summaries; also the prefix of the "-accuracy.txt" log file.
TRAIN_SAVE_PATH = "train_logs"

import nltk
# Fetch the 'punkt' tokenizer data used by nltk's word_tokenize.
nltk.download('punkt')

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/seonghoonjung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 데이터 유틸리티 함수들

In [7]:
def download_dbpedia():
    """Download the DBpedia CSV archive and unpack it into the current directory.

    The archive is fetched with ``wget`` and extracted in place, which
    produces the ``dbpedia_csv`` directory the rest of the notebook reads.
    """
    dbpedia_url = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'

    # wget.download returns the path it actually wrote. When the default
    # target name already exists it saves under a fresh " (n)" name, so open
    # the returned path instead of assuming "dbpedia_csv.tar.gz".
    archive_path = wget.download(dbpedia_url)
    with tarfile.open(archive_path, "r:gz") as tar:
        # NOTE(review): extractall() trusts the member paths inside the
        # archive; only use this with a trusted download source.
        tar.extractall()
def clean_str(text):
    """Normalise raw text before tokenisation.

    Every character outside letters, digits and the punctuation set
    ``( ) , ! ? ' ` "`` becomes a space, runs of whitespace are collapsed to
    a single space, and the result is stripped and lower-cased.
    """
    only_allowed = re.sub(r"[^A-Za-z0-9(),!?\'\`\"]", " ", text)
    single_spaced = re.sub(r"\s{2,}", " ", only_allowed)
    return single_spaced.strip().lower()
def build_word_dict():
    """Return the vocabulary and its inverse as ``(word_dict, dict_word)``.

    ``word_dict`` maps token -> integer id and ``dict_word`` maps id -> token.
    If the pickle cache at ``WORD_DICT_PATH`` exists it is loaded; otherwise
    the vocabulary is built from the "content" column of the training CSV and
    written to that cache.
    """
    if os.path.exists(WORD_DICT_PATH):
        # The cache is a pickle, so it must come from a trusted source.
        with open(WORD_DICT_PATH, "rb") as f:
            word_dict = pickle.load(f)
    else:
        train_df = pd.read_csv(TRAIN_PATH, names=["class", "title", "content"])

        # Count every token of every cleaned document.
        token_counts = collections.Counter(
            token
            for content in train_df["content"]
            for token in word_tokenize(clean_str(content))
        )

        # Ids 0-3 are reserved for the special tokens.
        word_dict = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3}
        # Remaining ids follow descending frequency; tokens seen only once
        # are left out of the vocabulary.
        for token, freq in token_counts.most_common():
            if freq > 1:
                word_dict[token] = len(word_dict)

        with open(WORD_DICT_PATH, "wb") as f:
            pickle.dump(word_dict, f)

    dict_word = {idx: token for token, idx in word_dict.items()}

    return word_dict, dict_word
def build_word_dataset(step, frac, word_dict, document_max_len):
    """Load one split and encode it as fixed-length id sequences.

    Args:
        step: ``"train"`` reads ``TRAIN_PATH``; any other value reads
            ``TEST_PATH``.
        frac: Fraction of rows to sample. Only applied to the training split;
            the test split is always shuffled in full.
        word_dict: Mapping token -> id containing ``"<unk>"`` and ``"<pad>"``.
        document_max_len: Length every encoded document is cut / padded to.

    Returns:
        ``(x, y)`` where ``x`` is a list of id lists of length
        ``document_max_len`` and ``y`` is the list of class labels shifted
        down by one.
    """
    columns = ["class", "title", "content"]
    if step == "train":
        # Sampling without a seed also shuffles the rows.
        df = pd.read_csv(TRAIN_PATH, names=columns).sample(frac=frac)
    else:
        df = pd.read_csv(TEST_PATH, names=columns).sample(frac=1.0)

    x = []
    for document in df["content"]:
        # Unknown tokens fall back to the "<unk>" id.
        ids = [word_dict.get(token, word_dict["<unk>"])
               for token in word_tokenize(clean_str(document))]
        ids = ids[:document_max_len]
        # Right-pad short documents up to the fixed length.
        ids = ids + (document_max_len - len(ids)) * [word_dict["<pad>"]]
        x.append(ids)

    y = [label - 1 for label in df["class"]]

    return x, y
def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield ``(inputs, outputs)`` mini-batches for ``num_epochs`` passes.

    Both sequences are converted to numpy arrays and sliced in order (no
    shuffling). The last batch of each epoch may be shorter than
    ``batch_size``.
    """
    x_arr = np.array(inputs)
    y_arr = np.array(outputs)

    total = len(x_arr)
    batches_per_epoch = (total - 1) // batch_size + 1
    for _ in range(num_epochs):
        for batch_idx in range(batches_per_epoch):
            lo = batch_idx * batch_size
            # Slicing clamps at the end of the array, so the final batch is
            # simply shorter.
            hi = lo + batch_size
            yield x_arr[lo:hi], y_arr[lo:hi]

## 데이터 확보

In [14]:
# Fetch the DBpedia archive only when the extracted directory is missing.
if not os.path.exists('dbpedia_csv'):
    print("Downloading dbpedia dataset...")
    download_dbpedia()
    
# Build (or load the cached) vocabulary plus its id -> word inverse.
print('Build word dict')
word_dict, dict_word = build_word_dict()

# Encode both splits as fixed-length id sequences.
# NOTE(review): TRAIN_PERCENT is passed for the test split too, but
# build_word_dataset only applies `frac` when step == "train"; the test
# split is always loaded in full.
print('Build word dataset')
train_x, train_y = build_word_dataset("train", TRAIN_PERCENT, word_dict, MAX_DOCUMENT_LENGTH)
test_x, test_y = build_word_dataset("test", TRAIN_PERCENT, word_dict, MAX_DOCUMENT_LENGTH)

Build word dict
Build word dataset


In [15]:
# Sanity check: decode one encoded training document back to words and show its label.
sample_sentence_idx = 0
sentence = ' '.join([dict_word[idx] for idx in train_x[sample_sentence_idx]]) 
print(f'{sentence} -> {train_y[sample_sentence_idx]}')

onnen is a village in the municipality haren groningen in the netherlands there are some picturesque farms and windmills in -> 8


## Define Model for dbpedia classification

In [28]:
class WordRNN(object):
    """LSTM text classifier built as a TensorFlow 1.x graph.

    The graph embeds token ids, runs a single LSTM over the sequence,
    flattens all time-step outputs, and feeds them through a ReLU dense
    layer with dropout followed by a linear layer producing class logits.

    Attributes exposed for feeding / fetching:
        x: int32 placeholder of token ids, shape [batch, max_length].
        y: int32 placeholder of class labels, shape [batch].
        keep_prob: scalar float32 placeholder passed to dropout.
        logits, predictions, loss, accuracy: output tensors.
    """

    def __init__(self, voca_size, embedding_size, units, max_length, num_class):
        """Build the graph in the current default graph.

        Args:
            voca_size: Number of rows of the embedding matrix.
            embedding_size: Width of each embedding vector.
            units: Number of units of the LSTM cell.
            max_length: Fixed sequence length of the input placeholder.
            num_class: Number of output classes (width of the logits).
        """
        self.embedding_size = embedding_size
        self.units = units
        # Width of the fully-connected layer between the RNN and the output layer.
        self.fc_units = 256
        
        self.x = tf.placeholder(tf.int32, [None, max_length])
        # Per-row count of positive ids; with non-negative ids where 0 is the
        # padding id, this is the number of non-pad tokens.
        self.x_len = tf.reduce_sum(tf.sign(self.x), 1)
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_prob = tf.placeholder(tf.float32, [])
        
        with tf.variable_scope("embedding"):
            # Randomly initialised embedding matrix, stored as "embedding/embedding".
            init_embeddings = tf.random.uniform([voca_size, self.embedding_size])
            embeddings = tf.get_variable("embedding", initializer=init_embeddings)
            x_emb = tf.nn.embedding_lookup(embeddings, self.x)
        
        with tf.variable_scope("rnn"):
            cell = tf.contrib.rnn.BasicLSTMCell(self.units)
            # sequence_length tells dynamic_rnn how many steps of each row are real.
            rnn_outputs, _ = tf.nn.dynamic_rnn(
                cell, x_emb, sequence_length=self.x_len, dtype=tf.float32
            )
            # Concatenate the outputs of all time steps into one vector per example.
            rnn_output_flat = tf.reshape(rnn_outputs, [-1, max_length*self.units])
            
        with tf.name_scope("fc"):
            fc_output = tf.layers.dense(rnn_output_flat, self.fc_units, activation=tf.nn.relu)
            dropout = tf.nn.dropout(fc_output, self.keep_prob)
            
        with tf.name_scope("output"):
            self.logits = tf.layers.dense(dropout, num_class)
            # Predicted class = index of the largest logit.
            self.predictions = tf.argmax(self.logits, -1, output_type=tf.int32)
        
        with tf.name_scope("loss"):
            # Mean softmax cross-entropy over the batch; labels are integer class ids.
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y)
            )
        
        with tf.name_scope("accuracy"):
            # Fraction of examples in the batch whose prediction equals the label.
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

# Start from an empty default graph so re-running this cell does not
# collide with variables created by a previous run.
tf.reset_default_graph()
# Use the HIDDEN_UNITS constant rather than a hard-coded 128 so the LSTM
# size follows the hyper-parameter declared at the top of the notebook.
model = WordRNN(len(word_dict), EMBEDDING_SIZE, HIDDEN_UNITS, MAX_DOCUMENT_LENGTH, NUM_CLASS)

## 다중 클래스 분류 학습 함수 정의

In [None]:
def train(model, train_x, train_y, test_x, test_y, vocabulary_size):
    """Fine-tune ``model`` starting from pretrained weights and report test accuracy.

    Builds an Adam training op with global-norm gradient clipping, restores
    the matching pretrained variables from the latest checkpoint under
    ``PRETRAIN_SAVE_PATH/model``, then trains for ``NUM_EPOCHS`` epochs in
    batches of ``BATCH_SIZE``. The loss is written as a summary every step,
    printed every 100 steps, and test accuracy is evaluated at step 1 and
    every 200 steps (also appended to ``TRAIN_SAVE_PATH + "-accuracy.txt"``).

    Args:
        model: Object exposing ``x``, ``y``, ``keep_prob``, ``loss`` and
            ``accuracy`` tensors (e.g. ``WordRNN``).
        train_x, train_y: Training inputs and labels.
        test_x, test_y: Test inputs and labels.
        vocabulary_size: Unused; kept so existing callers keep working.

    Raises:
        FileNotFoundError: If no pretrained checkpoint can be found.
    """
    with tf.Session() as sess:

        # Define training procedure
        global_step = tf.Variable(0, trainable=False)
        params = tf.trainable_variables()
        gradients = tf.gradients(model.loss, params)
        # Clip by global norm to keep RNN gradients from exploding.
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)

        # Summary (the scalar registers itself; merge_all picks it up)
        tf.summary.scalar("loss", model.loss)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(TRAIN_SAVE_PATH, sess.graph)

        # Everything after the writer is created runs under try/finally so the
        # event file is flushed and closed even if restore or training fails.
        try:
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load variables from pre-trained model
            # NOTE(review): the "birnn" prefix never matches this notebook's
            # WordRNN, whose RNN variables live under the "rnn" scope, so only
            # the embedding is restored — confirm whether that is intended.
            pre_trained_variables = [v for v in tf.global_variables()
                                     if (v.name.startswith("embedding") or v.name.startswith("birnn")) and "Adam" not in v.name]
            print(pre_trained_variables)
            saver = tf.train.Saver(pre_trained_variables)
            checkpoint_dir = os.path.join(PRETRAIN_SAVE_PATH, "model")
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            # get_checkpoint_state returns None when no checkpoint exists;
            # fail with a clear message instead of an AttributeError on None.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise FileNotFoundError(
                    "No pretrained checkpoint found in '{0}'".format(checkpoint_dir))
            saver.restore(sess, ckpt.model_checkpoint_path)

            def train_step(batch_x, batch_y):
                """Run one optimisation step and log its loss summary."""
                feed_dict = {
                    model.x: batch_x,
                    model.y: batch_y,
                    model.keep_prob: 0.5
                }

                _, step, summaries, loss = sess.run([train_op, global_step, summary_op, model.loss], feed_dict=feed_dict)
                summary_writer.add_summary(summaries, step)

                if step % 100 == 0:
                    print("step {0} : loss = {1}".format(step, loss))

            def test_accuracy(test_x, test_y):
                """Return accuracy over the whole test set and append it to the log file."""
                test_batches = batch_iter(test_x, test_y, BATCH_SIZE, 1)
                weighted_sum, num_examples = 0.0, 0

                for test_batch_x, test_batch_y in test_batches:
                    accuracy = sess.run(model.accuracy, feed_dict={model.x: test_batch_x, model.y: test_batch_y, model.keep_prob: 1.0})
                    # Weight each batch by its size so a short final batch
                    # does not skew the overall accuracy.
                    batch_len = len(test_batch_x)
                    weighted_sum += accuracy * batch_len
                    num_examples += batch_len

                # An empty test set has no defined accuracy; report NaN
                # instead of dividing by zero.
                mean_accuracy = weighted_sum / num_examples if num_examples else float("nan")

                with open(TRAIN_SAVE_PATH +"-accuracy.txt", "a") as f:
                    print(mean_accuracy, file=f)

                return mean_accuracy

            # Training loop
            batches = batch_iter(train_x, train_y, BATCH_SIZE, NUM_EPOCHS)

            for batch_x, batch_y in batches:
                train_step(batch_x, batch_y)
                step = tf.train.global_step(sess, global_step)

                if step == 1 or step % 200 == 0:
                    test_acc = test_accuracy(test_x, test_y)
                    print("test_accuracy = {0}\n".format(test_acc))
        finally:
            summary_writer.close()
# Run fine-tuning; the last argument is the vocabulary size expected by train's signature.
train(model, train_x, train_y, test_x, test_y, len(word_dict))

[<tf.Variable 'embedding/embedding:0' shape=(268964, 256) dtype=float32_ref>]
INFO:tensorflow:Restoring parameters from pretrain_logs/model/model.ckpt-100
test_accuracy = 0.09185714285714286

step 100 : loss = 2.7442877292633057
step 200 : loss = 1.6490721702575684
test_accuracy = 0.43442857142857144

step 300 : loss = 1.5403660535812378


In [27]:
# List the (name, shape) pairs stored in the pretrained checkpoint to see which variables can be restored.
inspect_list = tf.train.list_variables(os.path.join(PRETRAIN_SAVE_PATH, "model")) 
inspect_list

[('Variable', []),
 ('Variable_1', []),
 ('beta1_power', []),
 ('beta1_power_1', []),
 ('beta2_power', []),
 ('beta2_power_1', []),
 ('decoder/rnn/basic_lstm_cell/bias', [512]),
 ('decoder/rnn/basic_lstm_cell/bias/Adam', [512]),
 ('decoder/rnn/basic_lstm_cell/bias/Adam_1', [512]),
 ('decoder/rnn/basic_lstm_cell/bias/Adam_2', [512]),
 ('decoder/rnn/basic_lstm_cell/bias/Adam_3', [512]),
 ('decoder/rnn/basic_lstm_cell/kernel', [384, 512]),
 ('decoder/rnn/basic_lstm_cell/kernel/Adam', [384, 512]),
 ('decoder/rnn/basic_lstm_cell/kernel/Adam_1', [384, 512]),
 ('decoder/rnn/basic_lstm_cell/kernel/Adam_2', [384, 512]),
 ('decoder/rnn/basic_lstm_cell/kernel/Adam_3', [384, 512]),
 ('dense/bias', [268964]),
 ('dense/bias/Adam', [268964]),
 ('dense/bias/Adam_1', [268964]),
 ('dense/bias/Adam_2', [268964]),
 ('dense/bias/Adam_3', [268964]),
 ('dense/kernel', [128, 268964]),
 ('dense/kernel/Adam', [128, 268964]),
 ('dense/kernel/Adam_1', [128, 268964]),
 ('dense/kernel/Adam_2', [128, 268964]),
 ('de