<a href="https://colab.research.google.com/github/zixia/concise-chit-chat/blob/master/Concise_Chit_Chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Concise Chit Chat

GitHub Repository: <https://github.com/zixia/concise-chit-chat>

## Code TODO:

1. Create a DataLoader class for dataset preprocessing. (Use tf.data.Dataset inside?)
1. Create a PyPI package for easily loading the Cornell Movie Corpus dataset(?)
1. Use the PyPI module `embeddings` to load `GloVe`, or use tfhub to load `GloVe`?
1. How to do a `clip_norm`(or set `clip_value`) in Keras with Eager mode but without `tf.contrib`?
1. Better name for variables & functions
1. Code cleanup
1. Encapsulate all layers to Model Class: 
    1. ChitChatEncoder
    1. ChitChatDecoder
    1. ChitChatModel
1. Re-style to follow the book
1. ...?

## Book Todo

1. Outlines
1. What's seq2seq
1. What's word embedding
1. 
1. Split the code into snippets
1. Write the text for each snippet
1. Content cleaning and optimizing
1. ...?

## Other

1. `keras.callbacks.TensorBoard` instead of `tf.contrib.summary`?
    - `model.fit(callbacks=[TensorBoard(...)])`
1. download url? - http://old.pep.com.cn/gzsx/jszx_1/czsxtbjxzy/qrzptgjzxjc/dzkb/dscl/

### config.py

In [0]:
'''Shared configuration: sentence boundary markers and the length limit.'''

# GO marks the start of a sentence, DONE marks its end.
# Both are control characters: DataLoader.preprocess replaces every character
# outside [a-zA-Z0-9 .,?!] with a space, so neither marker can occur inside
# preprocessed text.
GO = '\b'
DONE = '\a'

# Maximum number of words kept per sentence; also used as the padded
# sequence length and as the number of decoding steps.
MAX_LEN = 20


### data_loader.py

In [0]:
'''
data loader
'''
import gzip
import re
from typing import (
    # Any,
    List,
    Tuple,
)

import tensorflow as tf
import numpy as np

# from .config import (
#     GO,
#     DONE,
#     MAX_LEN,
# )

DATASET_URL = 'https://github.com/zixia/concise-chit-chat/releases/download/v0.0.1/dataset.txt.gz'
DATASET_FILE_NAME = 'concise-chit-chat-dataset.txt.gz'


class DataLoader():
    '''Download the chit-chat corpus and serve random (query, response) batches.

    Attributes:
        raw_text: the complete dataset file as one lower-cased string.
        queries: numpy array of preprocessed queries, each wrapped in GO/DONE.
        responses: numpy array of preprocessed responses, wrapped the same way.
        size: number of (query, response) pairs.
    '''

    # Every character outside this whitelist is replaced by a space.
    _DISALLOWED = re.compile(r'[^a-zA-Z0-9 .,?!]')
    _SPACES = re.compile(r' +')
    # A run of punctuation directly after a word character.  The look-behind
    # keeps the preceding word out of the match and nothing after the run is
    # consumed, so consecutive separators ("a,b,c") are all split.
    _PUNCTUATION = re.compile(r"(?<=\w)([,;.?!#&-'\"-]+)")
    _SENTENCE_END = re.compile(r'[.?!]')

    def __init__(self) -> None:
        print('DataLoader', 'downloading dataset from:', DATASET_URL)
        dataset_file = tf.keras.utils.get_file(
            DATASET_FILE_NAME,
            origin=DATASET_URL,
        )
        print('DataLoader', 'loading dataset from:', dataset_file)

        # NOTE(review): the file is decoded with the default text encoding;
        # an earlier revision opened it as iso-8859-1 -- confirm which one
        # the dataset actually uses.
        with gzip.open(dataset_file, 'rt') as f:
            self.raw_text = f.read().lower()

        self.queries, self.responses \
            = self.__parse_raw_text(self.raw_text)
        self.size = len(self.queries)

    def get_batch(
            self,
            batch_size=32,
    ) -> Tuple[np.ndarray, np.ndarray]:
        '''Return `batch_size` randomly chosen (query, response) pairs.

        Indices are drawn with np.random.choice, i.e. with replacement, so a
        pair may appear more than once in a batch.

        Returns:
            (queries, responses): two numpy string arrays of equal length,
            aligned so that responses[i] answers queries[i].
        '''
        batch_indices = np.random.choice(
            len(self.queries),
            size=batch_size,
        )
        return self.queries[batch_indices], self.responses[batch_indices]

    def __parse_raw_text(
            self,
            raw_text: str
    ) -> Tuple[np.ndarray, np.ndarray]:
        '''Split the raw corpus into preprocessed query and response arrays.

        Each non-empty line must hold exactly one tab separating the query
        from the response; any other field count raises ValueError.
        Empty lines are skipped.

        NOTE(review): a sentence already truncated to MAX_LEN words grows to
        MAX_LEN + 2 tokens once GO and DONE are added -- confirm this is the
        intended length budget for the padding step.
        '''
        query_list = []
        response_list = []

        for line in raw_text.strip('\n').split('\n'):
            if not line:
                # a blank line carries no pair; unpacking it would raise
                continue
            query, response = line.split('\t')
            query_list.append(
                '{} {} {}'.format(GO, self.preprocess(query), DONE))
            response_list.append(
                '{} {} {}'.format(GO, self.preprocess(response), DONE))

        return np.array(query_list), np.array(response_list)

    def preprocess(self, text: str, max_len: int = None) -> str:
        '''Normalise one sentence into lower-case, space-separated tokens.

        Steps: replace every character outside [a-zA-Z0-9 .,?!] with a
        space, detach punctuation runs that follow a word, then limit the
        sentence to `max_len` words.  A sentence that had to be truncated is
        additionally cut right after the first '.', '?' or '!' that remains.

        Args:
            text: the raw sentence.
            max_len: maximum number of words to keep; defaults to MAX_LEN.

        Returns:
            The normalised sentence, tokens separated by single spaces.
        '''
        # Resolve the default lazily so an explicit max_len never touches
        # the module-level constant.
        limit = MAX_LEN if max_len is None else max_len

        new_text = self._DISALLOWED.sub(' ', text)
        new_text = self._SPACES.sub(' ', new_text)
        new_text = self._PUNCTUATION.sub(r' \1 ', new_text)

        words = new_text.split()
        if len(words) > limit:
            new_text = ' '.join(words[:limit])
            match = self._SENTENCE_END.search(new_text)
            if match is not None:
                new_text = new_text[:match.end()]
        else:
            # re-joining also collapses the double spaces the punctuation
            # split leaves behind
            new_text = ' '.join(words)

        return new_text.lower()


### vocabulary.py

In [0]:
'''doc'''
import re
from typing import (
    List,
)

import tensorflow as tf

# from .config import (
#     DONE,
#     GO,
#     MAX_LEN,
# )


class Vocabulary:
    '''Word <-> index mapping backed by a Keras Tokenizer.

    Attributes:
        tokenizer: the fitted tf.keras Tokenizer.
        size: number of known words plus one, because index 0 is reserved
            (it is the value used for padding).
    '''
    def __init__(self, text: str) -> None:
        # filters='' disables the tokenizer's default character filtering so
        # punctuation tokens and the GO/DONE control characters are kept.
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
        # str.split() splits on any whitespace run and yields no empty
        # strings, which contribute nothing to the word index anyway.
        self.tokenizer.fit_on_texts([GO, DONE] + text.split())
        # additional 1 for the index 0
        self.size = 1 + len(self.tokenizer.word_index)

    def texts_to_padded_sequences(
            self,
            text_list: List[str]
    ) -> 'np.ndarray':
        '''Convert sentences to index rows of exactly MAX_LEN entries.

        Shorter sentences are zero-padded at the end, longer ones are
        truncated at the end.

        Args:
            text_list: sentences as space-separated token strings.

        Returns:
            The array produced by Keras `pad_sequences`, one row per sentence.
        '''
        sequence_list = self.tokenizer.texts_to_sequences(text_list)
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequence_list,
            maxlen=MAX_LEN,
            padding='post',
            truncating='post',
        )

    def padded_sequences_to_texts(self, sequence: List[int]) -> str:
        '''Convert one padded index row back into a space-separated sentence.

        Padding entries (index 0) are skipped; every other index is looked
        up in the tokenizer's `index_word` table, marker tokens included.

        Raises:
            KeyError: if an index is not part of the vocabulary.
        '''
        index_word = self.tokenizer.index_word
        return ' '.join(
            index_word[int(index)]
            for index in sequence
            if int(index) != 0
        )


### model.py

In [0]:
'''doc'''
import tensorflow as tf
import numpy as np
from typing import (
    List,
)

# from .vocabulary import Vocabulary
# from .config import (
#     DONE,
#     GO,
#     MAX_LENGTH,
# )

EMBEDDING_DIM = 300
LATENT_UNIT_NUM = 500


class ChitEncoder(tf.keras.Model):
    '''Encoder half of the model: an LSTM whose final state is the result.'''
    def __init__(
            self,
    ) -> None:
        super().__init__()

        # return_state=True makes the layer hand back its state tensors
        # after the regular output.
        self.lstm_encoder = tf.keras.layers.CuDNNLSTM(
            units=LATENT_UNIT_NUM,
            return_state=True,
        )

    def call(
            self,
            inputs: tf.Tensor,  # shape: [batch_size, max_len, embedding_dim]
            training=None,
            mask=None,
    ) -> List[tf.Tensor]:
        '''Run the LSTM over `inputs` and return only its state tensors.

        `training` and `mask` are accepted for the Keras call signature but
        are not used here.
        '''
        # the first returned value (the layer output) is discarded
        _, *state = self.lstm_encoder(inputs)
        # presumably two tensors of shape [batch_size, latent_unit_num]
        # (hidden and cell state) -- TODO confirm
        return state


class ChatDecoder(tf.keras.Model):
    '''Decoder half of the model: a stateful LSTM followed by a dense layer.

    The LSTM state is carried across calls in `self.initial_state`, so the
    decoder can be driven one time step at a time.
    '''
    def __init__(
            self,
            voc_size: int,
    ) -> None:
        super().__init__()

        self.lstm_decoder = tf.keras.layers.CuDNNLSTM(
            units=LATENT_UNIT_NUM,
            return_sequences=True,
            return_state=True,
        )

        # projects every LSTM output to `voc_size` values
        self.dense = tf.keras.layers.Dense(
            units=voc_size,
        )

        # applies the dense layer to each time step of the LSTM output
        self.time_distributed_dense = tf.keras.layers.TimeDistributed(
            self.dense
        )

        # state used for the next call; None until set_state() or call() sets it
        self.initial_state = None

    def set_state(self, state=None) -> None:
        '''Set (or with no argument: clear) the state used by the next call.'''
        self.initial_state = state

    def call(
            self,
            inputs: tf.Tensor,  # shape: [batch_size, None, embedding_dim]
            training=False,
            mask=None,
    ) -> tf.Tensor:
        '''Decode `inputs` starting from the stored state.

        Side effect: the state returned by the LSTM replaces
        `self.initial_state`, so consecutive calls continue where the
        previous one stopped.  `training` and `mask` are not used here.

        Returns:
            The dense projection of the LSTM outputs, `voc_size` values per
            time step.
        '''
        outputs, *states = self.lstm_decoder(inputs, initial_state=self.initial_state)
        # remember the new state for the next call
        self.initial_state = states

        outputs = self.time_distributed_dense(outputs)
        return outputs


class ChitChat(tf.keras.Model):
    '''Sequence-to-sequence chit-chat model.

    A single embedding layer feeds both the encoder (ChitEncoder) and the
    decoder (ChatDecoder).  The encoder state initialises the decoder, which
    is then unrolled one time step at a time for MAX_LEN - 1 steps.
    '''
    def __init__(
            self,
            vocabulary: Vocabulary,
    ) -> None:
        super().__init__()

        self.word_index = vocabulary.tokenizer.word_index
        self.index_word = vocabulary.tokenizer.index_word
        self.voc_size = vocabulary.size

        # [batch_size, max_len] -> [batch_size, max_len, embedding_dim]
        self.embedding = tf.keras.layers.Embedding(
            input_dim=self.voc_size,
            output_dim=EMBEDDING_DIM,
            mask_zero=True,
        )

        # produces the LSTM state that seeds the decoder
        self.encoder = ChitEncoder()

        # produces voc_size values per decoded time step
        self.decoder = ChatDecoder(self.voc_size)

    def call(
            self,
            inputs: List[List[int]],  # shape: [batch_size, max_len]
            teacher_forcing_targets: List[List[int]]=None,  # shape: [batch_size, max_len]
            training=None,
            mask=None,
    ) -> tf.Tensor:     # shape: [batch_size, max_len, voc_size]
        '''Encode `inputs` and decode MAX_LEN time steps.

        Position 0 of the result is a one-hot vector for GO; positions
        1..MAX_LEN-1 are the decoder outputs.  When `training` is truthy the
        decoder input at step t is the embedded target token t (teacher
        forcing); otherwise it is the embedding of the highest-scoring token
        of the previous step.

        Args:
            inputs: padded query indices.
            teacher_forcing_targets: padded response indices; required when
                `training` is truthy.
            training: selects teacher forcing.
            mask: unused.

        Raises:
            ValueError: if `training` is truthy but no targets were given.
        '''
        if training and teacher_forcing_targets is None:
            raise ValueError(
                'teacher_forcing_targets is required when training=True')

        inputs = tf.convert_to_tensor(inputs)
        batch_size = tf.shape(inputs)[0]

        state = self.encoder(self.embedding(inputs))
        self.decoder.set_state(state)

        if training:
            teacher_forcing_embeddings = self.embedding(
                tf.convert_to_tensor(teacher_forcing_targets))

        # Broadcast the GO embedding / one-hot vector over the batch:
        # [batch_size, 1, 1] * [1, dim] -> [batch_size, 1, dim]
        batch_go_embedding = tf.ones([batch_size, 1, 1]) * [self.__go_embedding()]
        batch_go_one_hot = tf.ones([batch_size, 1, 1]) * [
            tf.one_hot(self.word_index[GO], self.voc_size)]

        step_outputs = [batch_go_one_hot]
        decoder_input = batch_go_embedding

        for t in range(1, MAX_LEN):
            # the decoder keeps its own state between these calls
            output = self.decoder(decoder_input)    # [batch_size, 1, voc_size]
            step_outputs.append(output)

            if training:
                target = teacher_forcing_embeddings[:, t, :]
                decoder_input = tf.expand_dims(target, axis=1)
            else:
                # reduce over the vocabulary axis (not the batch axis) so one
                # token index per sample remains: [batch_size, 1]
                decoder_input = self.embedding(tf.argmax(output, axis=-1))

        return tf.concat(step_outputs, axis=1)

    def predict(self, inputs: List[int], temperature=1.) -> str:
        '''Generate a response sentence for one padded query.

        Each time step is sampled from the softmax of its outputs scaled by
        `temperature`; generation stops at the first DONE token.

        Args:
            inputs: one padded sequence of word indices.
            temperature: softmax temperature used for sampling.

        Returns:
            The sampled words joined by single spaces.
        '''
        outputs = self([inputs])[0]     # [max_len, voc_size]
        done_index = self.word_index[DONE]

        word_list = []
        for t in range(1, MAX_LEN):
            indice = self.__logit_to_indice(outputs[t], temperature=temperature)

            if indice == done_index:
                break
            if indice == 0:
                # index 0 is the padding slot and has no entry in index_word
                continue

            word_list.append(self.index_word[indice])

        return ' '.join(word_list)

    def __go_embedding(self) -> tf.Tensor:
        '''Return the embedding vector of the GO token.'''
        return self.embedding(
            tf.convert_to_tensor(self.word_index[GO]))

    def __logit_to_indice(
            self,
            inputs,
            temperature=1.,
    ) -> int:
        '''Sample one word index from the values of a single time step.

        Args:
            inputs: voc_size values for one time step.
            temperature: divides the values before the softmax.
        '''
        logits = tf.squeeze(inputs)
        prob = tf.nn.softmax(logits / temperature).numpy().astype(np.float64)
        # renormalise in float64 so rounding of the float32 softmax cannot
        # trip np.random.choice's "probabilities do not sum to 1" check
        prob /= prob.sum()
        return int(np.random.choice(self.voc_size, p=prob))


### Train

### Tensor Board

[Quick guide to run TensorBoard in Google Colab](https://www.dlology.com/blog/quick-guide-to-run-tensorboard-in-google-colab/)

`tensorboard` vs `tensorboard/` ?

In [5]:

# Directory TensorBoard reads its event files from.
LOG_DIR = '/content/data/tensorboard/'
# Start TensorBoard in the background (trailing '&') on port 6006.
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)
# Install the localtunnel client (provides the `lt` command used below)
! npm install -g localtunnel

# Tunnel port 6006 (TensorBoard assumed running); output is appended to url.txt
get_ipython().system_raw('lt --port 6006 >> url.txt 2>&1 &')

# Show the tunnel URL. `lt` runs in the background, so url.txt may still be
# empty here; re-run this line if nothing is printed.
! cat url.txt

[K[?25h/tools/node/bin/lt -> /tools/node/lib/node_modules/localtunnel/bin/client
[K[?25h+ localtunnel@1.9.1
added 54 packages from 31 contributors in 2.667s
your url is: https://tame-walrus-3.localtunnel.me


In [6]:
'''train'''
import tensorflow as tf

# from chit_chat import (
#     ChitChat,
#     DataLoader,
#     Vocabulary,
# )

# Switch TensorFlow to eager execution before any model object is built.
tf.enable_eager_execution()

# Module-level objects shared by loss(), grad() and train() below.
data_loader = DataLoader()
# The vocabulary is fitted on the raw corpus text.
vocabulary = Vocabulary(data_loader.raw_text)
chitchat = ChitChat(vocabulary=vocabulary)


def loss(model, x, y) -> tf.Tensor:
    '''Return the padding-masked sequence loss of `model` on one batch.

    Args:
        model: callable taking `inputs`, `teacher_forcing_targets` and
            `training` keyword arguments.
        x: padded query indices.
        y: padded response indices; entries equal to 0 are ignored.
    '''
    logits = model(inputs=x, teacher_forcing_targets=y, training=True)
    targets = tf.convert_to_tensor(y)

    # weight 1.0 for real tokens, 0.0 wherever the target is 0
    token_weights = tf.cast(tf.not_equal(y, 0), tf.float32)

    # TODO: implement the following contrib function in a loop?
    # https://stackoverflow.com/a/41135778/1123955
    # https://stackoverflow.com/q/48025004/1123955
    return tf.contrib.seq2seq.sequence_loss(logits, targets, token_weights)


def grad(model, inputs, targets):
    '''Return the gradients of the batch loss for every entry of model.variables.'''
    with tf.GradientTape() as recorder:
        batch_loss = loss(model, inputs, targets)

    gradients = recorder.gradient(batch_loss, model.variables)
    return gradients


def train() -> int:
    '''Train the module-level `chitchat` model on random batches.

    Restores the latest checkpoint from ./data/save, then runs
    `num_batches` optimisation steps.  The loss is written to the
    TensorBoard log in ./data/tensorboard on every step; every 10th step it
    is also printed and a checkpoint is saved.

    The forward pass runs once per step: the loss recorded by the gradient
    tape is reused for printing and for the summary, so the reported value
    is the loss the update was computed from.

    Returns:
        0 when all batches have been processed.
    '''
    learning_rate = 1e-3
    num_batches = 8000
    batch_size = 128

    print('Dataset size: {}, Vocabulary size: {}'.format(
        data_loader.size,
        vocabulary.size,
    ))

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    global_step = tf.train.get_or_create_global_step()

    root = tf.train.Checkpoint(
        optimizer=optimizer,
        model=chitchat,
        optimizer_step=global_step,
    )

    root.restore(tf.train.latest_checkpoint('./data/save'))
    print('checkpoint restored.')

    writer = tf.contrib.summary.create_file_writer('./data/tensorboard')
    writer.set_as_default()

    for batch_index in range(num_batches):
        global_step.assign_add(1)

        queries, responses = data_loader.get_batch(batch_size)

        encoder_inputs = vocabulary.texts_to_padded_sequences(queries)
        decoder_outputs = vocabulary.texts_to_padded_sequences(responses)

        # single forward pass; its loss is reused for logging below
        with tf.GradientTape() as tape:
            loss_value = loss(chitchat, encoder_inputs, decoder_outputs)
        grads = tape.gradient(loss_value, chitchat.variables)

        optimizer.apply_gradients(
            grads_and_vars=zip(grads, chitchat.variables)
        )

        batch_loss = loss_value.numpy()

        if batch_index % 10 == 0:
            print("batch %d: loss %f" % (batch_index, batch_loss))
            root.save('./data/save/model.ckpt')
            print('checkpoint saved.')

        with tf.contrib.summary.record_summaries_every_n_global_steps(1):
            tf.contrib.summary.scalar('loss', batch_loss)

    return 0


def main() -> int:
    '''Entry point of the training cell: run train() and pass on its result.'''
    status = train()
    return status


main()


DataLoader downloading dataset from: https://github.com/zixia/concise-chit-chat/releases/download/v0.0.1/dataset.txt.gz
Downloading data from https://github.com/zixia/concise-chit-chat/releases/download/v0.0.1/dataset.txt.gz
DataLoader loading dataset from: /root/.keras/datasets/concise-chit-chat-dataset.txt.gz
Dataset size: 158015, Vocabulary size: 5001
checkpoint restored.
batch 0: loss 8.378550
checkpoint saved.
batch 10: loss 5.363611
checkpoint saved.
batch 20: loss 5.347839
checkpoint saved.
batch 30: loss 5.355069
checkpoint saved.
batch 40: loss 5.263589
checkpoint saved.
batch 50: loss 5.107776
checkpoint saved.
batch 60: loss 5.089061
checkpoint saved.
batch 70: loss 4.997322
checkpoint saved.
batch 80: loss 4.978524
checkpoint saved.
batch 90: loss 4.913832
checkpoint saved.
batch 100: loss 4.860683
checkpoint saved.
batch 110: loss 4.823889
checkpoint saved.
batch 120: loss 4.892643
checkpoint saved.
batch 130: loss 4.658375
checkpoint saved.
batch 140: loss 4.795345
checkp

ResourceExhaustedError: ignored

In [26]:
#! rm -fvr data/tensorboard
# ! pwd
# ! rm -frv data/save
# ! rm -fr /content/data/tensorboard
# ! kill 2823
# ! kill -9 2823
# ! ps axf | grep lt


removed 'data/save/model.ckpt-8.data-00000-of-00001'
removed 'data/save/checkpoint'
removed 'data/save/model.ckpt-11.data-00000-of-00001'
removed 'data/save/model.ckpt-8.index'
removed 'data/save/model.ckpt-1.data-00000-of-00001'
removed 'data/save/model.ckpt-6.index'
removed 'data/save/model.ckpt-4.index'
removed 'data/save/model.ckpt-4.data-00000-of-00001'
removed 'data/save/model.ckpt-12.index'
removed 'data/save/model.ckpt-9.data-00000-of-00001'
removed 'data/save/model.ckpt-3.data-00000-of-00001'
removed 'data/save/model.ckpt-9.index'
removed 'data/save/model.ckpt-7.index'
removed 'data/save/model.ckpt-10.data-00000-of-00001'
removed 'data/save/model.ckpt-2.index'
removed 'data/save/model.ckpt-3.index'
removed 'data/save/model.ckpt-6.data-00000-of-00001'
removed 'data/save/model.ckpt-11.index'
removed 'data/save/model.ckpt-7.data-00000-of-00001'
removed 'data/save/model.ckpt-2.data-00000-of-00001'
removed 'data/save/model.ckpt-1.index'
removed 'data/save/model.ckpt-5.index'
remove

In [2]:
# Print url.txt, the file the localtunnel client's output is appended to.
! cat url.txt

your url is: https://bright-fox-51.localtunnel.me


### chat.py

In [0]:
'''train'''
# import tensorflow as tf

# from chit_chat import (
#     ChitChat,
#     DataLoader,
#     Vocabulary,
#     DONE,
#     GO,
# )

# tf.enable_eager_execution()


def main() -> int:
    '''Load data, vocabulary and the latest checkpoint, then start the chat CLI.'''
    loader = DataLoader()
    vocab = Vocabulary(loader.raw_text)

    summary = 'Dataset size: {}, Vocabulary size: {}'.format(
        loader.size,
        vocab.size,
    )
    print(summary)

    model = ChitChat(vocab)
    latest = tf.train.latest_checkpoint('./data/save')
    tf.train.Checkpoint(model=model).restore(latest)
    print('checkpoint restored.')

    return cli(model, vocabulary=vocab, data_loader=loader)


def cli(chitchat: ChitChat, data_loader: DataLoader, vocabulary: Vocabulary) -> int:
    '''Interactive command line loop: read a query, print the bot's reply.

    Typing 'q' or 'quit', sending end-of-file, or pressing Ctrl-C ends the
    session.

    Args:
        chitchat: model whose predict() produces the reply.
        data_loader: supplies preprocess() for normalising the query.
        vocabulary: converts text to padded index sequences and back.

    Returns:
        0 when the session ends.
    '''
    index_word = vocabulary.tokenizer.index_word
    done_index = vocabulary.tokenizer.word_index[DONE]

    while True:
        try:
            query = input('> ').lower()
        except (EOFError, KeyboardInterrupt):
            # closed stdin or an interrupt ends the session like 'quit'
            break

        if query in ('q', 'quit'):
            break

        try:
            # Normalise the sentence and wrap it in the boundary markers
            query = data_loader.preprocess(query)
            query = '{} {} {}'.format(GO, query, DONE)
            query_sequence = vocabulary.texts_to_padded_sequences([query])[0]

            response = chitchat.predict(query_sequence, 1)

            # Accept either a ready-made sentence or a sequence of word
            # indices; indices are decoded here, dropping padding and DONE.
            if isinstance(response, str):
                reply = response
            else:
                reply = ' '.join(
                    index_word[indice]
                    for indice in response
                    if indice != 0 and indice != done_index
                )

            print('Bot:', reply)

        except KeyError:
            print("Error: Encountered unknown word.")

    return 0


# Start the interactive chat session defined above.
main()


In [9]:
# Print the CPU description of the machine running this notebook.
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2299.998
cache size	: 46080 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm pti fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms xsaveopt arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf
bogomips	: 4599.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) 