In [6]:
from random import shuffle
import random
from operator import itemgetter
from collections import defaultdict

import keras
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
ENCODER_DATA = 'swda_parallel_corpus/encoder.txt'
DECODER_DATA = 'swda_parallel_corpus/decoder.txt'

TRAINSET_RATIO = 0.8
VOCABULARY_SIZE = 10000
MAX_INPUT_LENGTH = 80

PAD_ID = 0
UNK_ID = 1
PAD = '_PAD'
UNK = '_UNK'

random.seed(273)
np.random.seed(273)

In [8]:
def make_vocabulary(in_lines, max_vocabulary_size):
    freqdict = defaultdict(lambda: 0)
    for line in in_lines:
        for token in line:
            freqdict[token] += 1
    vocab = sorted(freqdict.items(), key=itemgetter(1), reverse=True)
    rev_vocab = ([PAD, UNK] + map(itemgetter(0), vocab))[:max_vocabulary_size]
    vocab = {word: idx for idx, word in enumerate(rev_vocab)}
    return vocab, rev_vocab


def vectorize_sequences(in_sequences, in_vocab, max_input_length):
    sequences_vectorized = []
    for sequence in in_sequences:
        sequences_vectorized.append([in_vocab.get(token, UNK_ID) for token in sequence])
    return keras.preprocessing.sequence.pad_sequences(sequences_vectorized, value=PAD_ID, maxlen=max_input_length)


def load_dataset(in_encoder_input, in_decoder_input):
    with open(in_encoder_input) as encoder_in:
        with open(in_decoder_input) as decoder_in:
            encoder_lines, decoder_lines = [map(lambda x: x.strip(), encoder_in.readlines()),
                                            map(lambda x: x.strip(), decoder_in.readlines())]
    return encoder_lines, decoder_lines


def make_tagger_data_point(in_src, in_tgt):
    source, target = in_src.lower().split(), in_tgt.lower().split()
    tags = []
    src_index, tgt_index = 0, 0
    while src_index < len(source):
        if tgt_index < len(target) and source[src_index] == target[tgt_index]:
            tags.append(1)
            tgt_index += 1
        else:
            tags.append(0)
        src_index += 1
    assert len(tags) == len(source)
    return source, tags


def make_tagger_data_points(in_encoder_lines, in_decoder_lines):
    result = []
    for src_line, tgt_line in zip(in_encoder_lines, in_decoder_lines):
        result.append(make_tagger_data_point(src_line, tgt_line))
    return result


def to_one_hot(in_sequence, in_classes_number):
    result = np.zeros((len(in_sequence), in_classes_number))
    for idx, element in enumerate(in_sequence):
        result[idx][element] = 1
    return result


def make_dataset(in_encoder_lines, in_decoder_lines, vocab=None):
    data_points = make_tagger_data_points(encoder_lines, decoder_lines)
    shuffle(data_points)
    trainset_size = int(TRAINSET_RATIO * len(data_points))
    devset_size = int((len(data_points) - trainset_size) / 2.0)
    train, dev, test = (data_points[:trainset_size],
                        data_points[trainset_size: trainset_size + devset_size],
                        data_points[trainset_size + devset_size:])
    if not vocab:
        vocab, _ = make_vocabulary(map(itemgetter(0), train), VOCABULARY_SIZE)
    X_train = vectorize_sequences(map(itemgetter(0), train), vocab, MAX_INPUT_LENGTH)
    y_train = np.asarray([to_one_hot(tags, 2)
                          for tags in keras.preprocessing.sequence.pad_sequences(map(itemgetter(1), train), value=0, maxlen=MAX_INPUT_LENGTH)])
    X_dev = vectorize_sequences(map(itemgetter(0), dev), vocab, MAX_INPUT_LENGTH)
    y_dev = np.asarray([to_one_hot(tags, 2)
                        for tags in keras.preprocessing.sequence.pad_sequences(map(itemgetter(1), dev), value=0, maxlen=MAX_INPUT_LENGTH)])
    X_test = vectorize_sequences(map(itemgetter(0), test), vocab, MAX_INPUT_LENGTH)
    y_test = np.asarray([to_one_hot(tags, 2)
                         for tags in keras.preprocessing.sequence.pad_sequences(map(itemgetter(1), test), value=0, maxlen=MAX_INPUT_LENGTH)])
    return vocab, (X_train, y_train), (X_dev, y_dev), (X_test, y_test)

In [9]:
def create_model(in_vocab_size, in_cell_size, in_max_input_length, in_classes_number, lr):
    input_sequence = keras.layers.Input(shape=(in_max_input_length,))
    embedding = keras.layers.Embedding(in_vocab_size, in_cell_size)(input_sequence)
    lstm = keras.layers.LSTM(in_cell_size, return_sequences=True)(embedding)
    output = keras.layers.Dense(in_classes_number, activation='softmax')(lstm)
    model = keras.Model(inputs=[input_sequence], outputs=[output])

    # mean absolute error, accuracy
    opt = keras.optimizers.Adam(lr=lr)
    model.compile(optimizer=opt, loss='binary_crossentropy')
    return model

In [10]:
def train(in_model,
          train_data,
          dev_data,
          test_data,
          in_checkpoint_filepath,
          epochs=100,
          batch_size=32,
          **kwargs):
    X_train, y_train = train_data
    X_dev, y_dev = dev_data
    X_test, y_test = test_data

    in_model.fit(X_train,
                 y_train,
                epochs=epochs,
                shuffle=True,
                validation_data=(X_dev, y_dev),
                callbacks=[keras.callbacks.ModelCheckpoint(in_checkpoint_filepath,
                                                           monitor='val_loss',
                                                           verbose=1,
                                                           save_best_only=True,
                                                           save_weights_only=False,
                                                           mode='auto',
                                                           period=1),
                           keras.callbacks.EarlyStopping(monitor='val_loss',
                                                         min_delta=0,
                                                         patience=10,
                                                         verbose=1,
                                                        mode='auto')])
    test_loss = in_model.evaluate(x=X_test, y=y_test)
    print 'Testset loss after {} epochs: {:.3f}'.format(epochs, test_loss)

In [11]:
def predict(in_model, X):
    return np.argmax(model.predict(np.asarray([X])), axis=-1)

In [12]:
def evaluate(in_model, X, y):
    y_pred = np.argmax(model.predict(X), axis=-1)
    y_gold = np.argmax(y, axis=-1)
    return sum([int(np.array_equal(y_pred_i, y_gold_i))
                for y_pred_i, y_gold_i in zip(y_pred, y_gold)]) / float(y.shape[0])

In [13]:
encoder_lines, decoder_lines = load_dataset(ENCODER_DATA, DECODER_DATA)
print encoder_lines[0], '\n', decoder_lines[0]

Uh well what would you say your opinion is on gun control 
what would you say your opinion is on gun control


In [14]:
vocab, train_data, dev_data, test_data = make_dataset(encoder_lines, decoder_lines)
model = create_model(len(vocab), 128, MAX_INPUT_LENGTH, 2, 0.01)

In [23]:
train(model, train_data, dev_data, test_data, 'model.h5')

Train on 109600 samples, validate on 13700 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.01920, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 0.01920 to 0.01902, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss did not improve
Epoch 4/100

Epoch 00004: val_loss did not improve
Epoch 5/100

Epoch 00005: val_loss did not improve
Epoch 6/100

Epoch 00006: val_loss did not improve
Epoch 7/100

Epoch 00007: val_loss did not improve
Epoch 8/100

Epoch 00008: val_loss did not improve
Epoch 9/100

Epoch 00009: val_loss did not improve
Epoch 10/100

Epoch 00010: val_loss did not improve
Epoch 11/100

Epoch 00011: val_loss did not improve
Epoch 12/100

Epoch 00012: val_loss did not improve
Epoch 00012: early stopping
Testset loss after 100 epochs: 0.020


In [57]:
evaluate(model, *test_data)

0.7242335766423358

In [1]:
BABI_ENCODER_DATA = 'dialogue_denoiser_data/dialog-babi-task1-API-calls-tst.txt/encoder.txt'
BABI_DECODER_DATA = 'dialogue_denoiser_data/dialog-babi-task1-API-calls-tst.txt/decoder.txt'

In [4]:
babi_encoder_lines, babi_decoder_lines = load_dataset(BABI_ENCODER_DATA, BABI_DECODER_DATA)
print babi_encoder_lines[0], '\n', babi_decoder_lines[0]

good uhm yeah good morning 
good morning


In [18]:
model = keras.models.load_model('model.h5')

In [19]:
_, babi_train_data, babi_dev_data, babi_test_data = make_dataset(babi_encoder_lines, babi_decoder_lines, vocab)

In [20]:
evaluate(model, *babi_test_data)

0.7462773722627737