In [1]:
import os
import re
import time
import random
import string
import pandas as pd
import numpy as np
import gensim as gs
import tensorflow as tf
import collections as col
from pathlib import Path
from requests_html import HTMLSession

## Crawl data

In [2]:
data_path = "./bed-time-stories.txt"
file = Path(data_path)
session = HTMLSession()

In [3]:
if not file.exists():
    with open(data_path, "w") as outfile:
        for page in range(1, 31):
            r = session.get(f"http://truyencotich.vn/truyen-co-tich/co-tich-viet-nam/page/{page}")

            for link in set([element.attrs["href"] for element in r.html.find("article a")]):
                page = session.get(link)

                try:
                    content = page.html.find("#content .entry-content", first=True).text
                
                    outfile.write(content)
                except:
                    pass

## Functions

In [4]:
def text_preprocessing(text):
    return re.findall(f"[{string.punctuation}]|[\w]+|[\s\t\r\n]", text.lower())

def create_dictionary(file):
    return gs.corpora.Dictionary([text_preprocessing(line) for line in open(file, "r").readlines()])

In [5]:
vocab = create_dictionary("./bed-time-stories.txt")

In [6]:
def vocab_encode(text):
    return vocab.doc2idx(text_preprocessing(text))

def vocab_decode(array):
    return ''.join([vocab.get(idx) for idx in array])

def read_data(filename, window, overlap):
    lines = [line.strip() for line in open(filename, "r").readlines()]

    while True:
        random.shuffle(lines)

        for line in lines:
            words = vocab_encode(line)
            
            for start in range(0, len(words) - window, overlap):
                chunk = words[start: start + window]

                yield chunk

def read_batch(stream, batch_size):
    batch = []
    for element in stream:
        batch.append(element)
        if len(batch) == batch_size:
            yield batch
            batch = []
    yield batch

In [7]:
vocab_encode("Ngày xửa ngày xưa")

[25, 1, 37, 1, 25, 1, 36]

In [8]:
vocab_decode([24, 0, 36, 0, 24, 0, 35])

'mực\nxưa\nmực\nvợ'

In [9]:
i = 0

for batch in read_batch(read_data("./bed-time-stories.txt", 10, 5), 2):
    if i == 5:
        break

    print([vocab_decode(words) for words in batch])

    i += 1

['có hai anh em: anh', ' em: anh thì giàu ']
[' thì giàu có, còn ', 'có, còn em nghèo xác']
['em nghèo xác xơ. nhà', ' xơ. nhà người em ']
[' người em có người đầy', 'có người đầy tớ mưu ']
[' tớ mưu trí. trong ', 'trí. trong nhà chỉ toàn']


## Hyperparameter

In [10]:
hidden_sizes = [128, 256]
batch_size = 64
learning_rate = 0.01
skip = 1
num_steps = 100 # for RNN unroled
len_generated = 200

## Model

In [11]:
class RNN(object):
    def __init__(self, model):
        self.model = model
        self.path = f"{self.model}.txt"

        self.seq = tf.placeholder(tf.int32, [None, None], name='seq')
        self.temp = tf.constant(1.5, name='temp')
        self.gstep = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def create_rnn(self, seq):
        layers = [tf.nn.rnn_cell.GRUCell(size) for size in hidden_sizes]
        cells = tf.nn.rnn_cell.MultiRNNCell(layers)
        
        batch = tf.shape(seq)[0]
        
        zero_states = cells.zero_state(batch, dtype=tf.float32)
        
        self.in_state = tuple([
            tf.placeholder_with_default(state, [None, state.shape[1]])
            for state in zero_states
        ])

        # this line to calculate the real length of seq
        # all seq are padded to be of the same length, which is num_steps
        length = tf.reduce_sum(tf.reduce_max(tf.sign(seq), 2), 1)
        self.output, self.out_state = tf.nn.dynamic_rnn(cells, seq, length, self.in_state)

    def create_model(self):
        seq = tf.one_hot(self.seq, len(vocab))

        self.create_rnn(seq)

        self.logits = tf.layers.dense(self.output, len(vocab))
        
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits[:, :-1], labels=seq[:, 1:]
        )

        self.loss = tf.reduce_sum(loss)

        # sample the next character from Maxwell-Boltzmann Distribution 
        # with temperature temp. It works equally well without tf.exp
        self.sample = tf.multinomial(tf.exp(self.logits[:, -1] / self.temp), 1)[:, 0] 
        self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss, global_step=self.gstep)

    def train(self):
        start = time.time()
        min_loss = None

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)

        with tf.Session(config = tf.ConfigProto(gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())
            
            iteration = self.gstep.eval()
            data = read_batch(
                read_data(self.path, num_steps, num_steps // 2),
                batch_size
            )

            while True:
                batch = next(data)
                
                # for batch in read_batch(read_data(DATA_PATH, vocab)):
                batch_loss, _ = sess.run([self.loss, self.opt], {self.seq: batch})
                
                if (iteration + 1) % skip == 0:
                    print('Iter {}. \n    Loss {}. Time {}'.format(iteration + 1, batch_loss, time.time() - start))

                    self.online_infer(sess)
                    start = time.time()

                iteration += 1

    def online_infer(self, sess):
        for seed in ["ngày"]:
            sentence = [seed]
            state = None

            for _ in range(len_generated):
                batch = [vocab_encode(sentence[-1])]
                feed = {self.seq: batch}

                if state is not None: # for the first decoder step, the state is None
                    for i in range(len(state)):
                        feed.update({self.in_state[i]: state[i]})

                index, state = sess.run([self.sample, self.out_state], feed)
                sentence += [vocab_decode(index)]

            print(''.join(sentence))

In [12]:
lm = RNN('bed-time-stories')
lm.create_model()

In [13]:
lm.train()

Iter 1. 
    Loss 54149.91015625. Time 3.468621015548706
ngàynúngthađòtrẻochuốitrắngđậpnhíchkhạothuỷphêsehụpvòngsínhgượngmướppexégáođênhoạxởcũichơthẹnnhiềuthoáiphónglủixươngnươnghìnnháosoạngbuồnlạplaudàiriêngrudọnháibươnmấndươthiugụcmảngnớichèothócđùixếpthoánnổkỷnhạynụnhiềụkiệuhẹnkhơmehủihãmđềmtôiquầncởnkhiếnuổngđuộtnhúnglứangậpđiềmlậtmọpthoiốnnhứngbậmsứcômtgiạtcòngcăcứlẹtrộikíchkhánnéokỹphìnhthápnhàihònhợpngàyngệchvãitúkĩhèmchậmtrêntươngkèobumthưởngtípngươnậptầmsựquýnhphổngchuối)hàogụcnghiãhẻovéođánbâykhoátriệuthăngsờkhỏinghéchậuphậntrầmluônđêchénbôiruôkhaonuốioămngáchđêđểtuyềnkẹtmẩuvíabưuxèohữngthuỷơnðànhàm1958lãivậplaaulữcườikhoaimạingútteðộcđựơcxuểtụmdatróimàiđốnangnhớtlàmbuồnsưalénghimquangðaucớdạttrối1954vệsétchăntàylưukhuyênxửlưuxácliềnchongnựckiếuvỉ
Iter 2. 
    Loss 53724.625. Time 3.0700552463531494
ngàyđàyđăngdụngtrắcluậnbưxỉkhúccácsữalãmquốchựccổgiẫmhãnrâtủyhệtkịbắt vô                  rê                                                              mới                      

KeyboardInterrupt: 