In [54]:
import json, nlp
import tensorflow as tf
import numpy as np
from contextlib import closing
from collections import Counter, Iterable
from gensim.models import KeyedVectors

from platform import python_version
print('Seed:', SEED)
print('Python:', python_version())
print('TensorFlow:', tf.__version__)

Seed: 764
Python: 3.5.4
TensorFlow: 1.4.0


## Data Analysis

In [38]:
class Metrics(ProgressTracker):
    '''
    TODO
    average utt per dial
    average word per utt
    total utt
    total word
    unique word
    '''
    def __init__(self, num_classes, label_name, ignore_negative=False, rate=1000, total=None):
        super().__init__(rate, total)
        self.num_classes = int(num_classes)
        self.label_name = label_name
        self.ignore_negative = ignore_negative
        self._baseline = 0
        self._distribution = Counter()
        self._vocabulary = Counter()
        self._frequencies = Counter()
       
    def __repr__(self):
        return str(self.rate)
   
    def __call__(self, sample=None, done=False):
        if not sample or done:
            super().__call__(True)
            print('Dialogues:', self.__len__())
            print('Utterances:')
            print('Words:')
            print('Vocabulary size:', len(self.vocabulary))
            print('Most common words:', dict(self.vocabulary.most_common(5)))
            print('Baseline accuracy: %.2f%%' % (self.baseline*100))
            print('Class distribution:', dict(self.distribution.most_common()))
            print('Frequencies distribution:', dict(self.frequencies.most_common(10)))
        else:
            super().__call__(False)
            self._distribution[sample[self.label_name]] += 1
            for k, v in sample.items():
                if self.ignore_negative and sample[self.label_name] == 0:
                    continue
                if k != self.label_name:
                    for i in self.flatten(v):
                        self._vocabulary[i] += 1
   
    def flatten(self, l):
        non_iterable = (str, bytes, int)
        if isinstance(l, non_iterable):
            yield l
        else:
            for el in l:
                if isinstance(el, Iterable) and not isinstance(el, non_iterable):
                    yield from self.flatten(el)
                else:
                    yield el
   
    @property
    def vocabulary(self):
        return self._vocabulary
   
    @property
    def distribution(self):
         return self._distribution
    
    @property
    def baseline(self):
        if not self._baseline:
            self._baseline = sum(v**2 for _,v in self.distribution.items())/self._counter
        return float(self._baseline)/float(self._counter)
    
    @property
    def frequencies(self):
        if not self._frequencies:
            for k, v in self._vocabulary.items():
                self._frequencies[v] += 1
        return self._frequencies
            
    def __len__(self):
        return self._counter
    
    def graphs():
        # prints graphs of word distribution, freq of freq, class distribution
        pass

## Data Conversion

In [90]:
class RecordWriter:
    '''
    A class wraper to write data into tfrecord shards. Note that
    currently tfrecord does not support appending. Therefore, the
    contextlib.closing() library is used to provide safe closing
    of file descriptors which stays open across method calls.
    Supports sharding and compression.{}
    '''
    def __init__(self, path, filename=None, shard_size=None, compression=None):
        self.path = path
        self.filename = filename if filename else 'shard'
        self.filename = self.path + '/' + self.filename
        self.shard_size = int(shard_size) if shard_size else 0
        self.compression = compression #TODO
        self.shards = 0
        self.counter = 0
        self.writer = None
        
    def __call__(self, features, labels):
        if self.counter == 0:    
            current_path = self.filename + '0.tfrecord'
            self.writer = tf.python_io.TFRecordWriter(current_path)
        elif self.shard_size and self.counter % self.shard_size == 0:
            self.shards += 1
            if self.counter != 0:
                self.writer.close()
                shard_counter = self.counter/self.shard_size
                current_path = self.filename + shard_counter + '.tfrecord'
                self.writer = tf.python_io.TFRecordWriter(current_path)
        
        self.tf_convert(features, labels)
        self.counter += 1

    
    def tf_convert(self, features, labels):
        '''
        Converts a single example into a tfrecord file.

        Parameters
        ----------
        features: dict of feature names and values
        labels: dict of class labels as int
        writer: a TFRecordWriter
        '''
        def wrap_int64(value):
            # Wrapper class labels
            return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

        def wrap_bytes(value):
            # Wrapper for any feature data as byte type
            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

        def serialize(data):
            # Wrap the data dict as TensorFlow Features.
            # Wrap again as a TensorFlow Example.
            # Serialize the data.
            feature = tf.train.Features(feature=data)
            example = tf.train.Example(features=feature)
            return example.SerializeToString()

        # Wrap features as bytes, labels as in64 and combine dictionaries
        data = {**{k: wrap_int64(v) for k, v in features.items()}, 
                **{k: wrap_int64(v) for k, v in labels.items()}}

        serialized = serialize(data)
        self.writer.write(serialized)
        
    def close(self):
        if self.writer:
            self.writer.close()
        self.writer = None

## Data Processing

In [23]:
class Parser:
    def __init__(self, parse_fn, **kwargs):
        self.parse = parse_fn
        self.word2idx = kwargs['word2idx']
        self.contextresponse = kwargs['contextresponse']

    def __call__(self, string):
        samples = self.parse(string, self.word2idx)
        yield from self.contextresponse(samples)

In [52]:
def tokenize(string):
    return string.split()

def parse(string, word2idx):
    dialogue = json.loads(string)['chat']
    samples = []
    for utt in dialogue:
        utt = utt.lower()
        utt = tokenize(utt)
        
        # Convert tokens to embedding index
        if any(utt):
            for i, word in enumerate(utt):
                if word not in word2idx:
                    word2idx[word] =  len(word2idx)
                utt[i] = word2idx[word]
            samples.append(utt)
    return samples

# Code

In [63]:
data_path = 'data/2015-chats2_clean.json'
mini_path = 'data/mini.json'
w2v_path = 'data/GoogleNews-vectors-negative300.bin.gz'
shard_path = 'data/training'

In [19]:
print('Loading: word2vec')
tic = timer()
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
w2i = {k:i+1 for i,k in enumerate(w2v.index2word)}
i2w = {v:k for k,v in w2i.items()}
del w2v
toc = timer()
print(toc - tic)

Loading: word2vec
227.30807486277365


In [None]:
stream = RecordStream(data_path)
metrics = Metrics(2)

In [88]:
stream = RecordStream(mini_path)
augment = ContextResponse(len(stream), context_size=3, buffer_size=1e2)
parser = Parser(parse, word2idx=w2i, contextresponse=augment)
tracker = ProgressTracker()

Records: 100 scanned, 0.03MB, 0.00sec


In [89]:
with closing(RecordWriter(shard_path, shard_size=None)) as writer:
    for record in stream:
        for sample in parser(record):
            print(sample)
            writer({k:v for k,v in sample.items() if k in ['context', 'response']},
                   {k:v for k,v in sample.items() if k in ['label']})
            tracker()
tracker(True)

{'label': 0, 'response': [4, 3003532], 'context': [[260092], [1070, 16], [78493]]}


TypeError: [4, 3003532] has type <class 'list'>, but expected one of: ((<class 'bytes'>,),)

In [51]:
for i in metrics.vocabulary.most_common(10):
    print(i2w[i[0]])

gg
lol
i
ez
you
?
u
wp
a
is


In [61]:
{k:v for k,v in sample.items() if k in ['context', 'response']}

{'context': [[71462], [1590994], [200, 16]], 'response': [7727]}

In [95]:
sample

{'context': [[260092], [1070, 16], [78493]],
 'label': 0,
 'response': [4, 3003532]}

In [35]:
s = RecordStream(data_path)
t = ProgressTracker(total=len(s))
for i in s:
    t()

Records: 500032 scanned, 175.51MB, 0.67sec