In [1]:
import os
import csv
import random
import logging
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)

In [3]:
np.exp(3.9)

49.40244910553017

In [None]:
model_name = 'openai-gpt'
special_tokens = ['_start_', '_delimiter_', '_classify_']
gpt_tok = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)

In [None]:
len(gpt_tok.bpe_ranks)

In [None]:
gpt_tok.convert_tokens_to_ids

In [None]:
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"

def load_rocstories_dataset(dataset_path):
    """Read a ROCStories cloze-test CSV into a list of examples.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, columns 1-4 are joined with single spaces into the story,
    columns 5 and 6 are the two candidate continuations, and the last column
    is the 1-based label, which is shifted to 0-based.

    Args:
        dataset_path: Path to the CSV file (read as UTF-8).

    Returns:
        A list of ``(story, continuation_1, continuation_2, label)`` tuples.
        An empty file yields an empty list.
    """
    examples = []
    # newline='' is what the csv module asks for: without it, newlines that
    # are embedded inside quoted fields are rewritten by the text layer.
    with open(dataset_path, encoding='utf_8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in tqdm(reader):
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to parse
                continue
            examples.append((' '.join(row[1:5]), row[5], row[6], int(row[-1]) - 1))
    return examples

In [None]:
def tokenize_and_encode(obj, tokenizer=None):
    """Tokenize and encode a nested object.

    Strings are tokenized and mapped to ids, ints are passed through
    unchanged, and any other object is treated as an iterable whose items are
    encoded recursively into a list.

    Args:
        obj: A string, an int, or an arbitrarily nested iterable of those.
        tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.
            When omitted, the notebook-level ``gpt_tok`` instance is used.

    Returns:
        The same nesting as ``obj`` with every string replaced by whatever
        ``tokenizer.convert_tokens_to_ids`` returns for its tokens.
    """
    if tokenizer is None:
        # This notebook binds its tokenizer to `gpt_tok`; no global named
        # `tokenizer` is defined in it.
        tokenizer = gpt_tok
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, int):
        return obj
    return [tokenize_and_encode(item, tokenizer) for item in obj]

In [None]:
# Resolve ROCSTORIES_URL through cached_path and show the resulting local path.
roc_stories = cached_path(ROCSTORIES_URL)
print(roc_stories)
# NOTE(review): the CSV path below is built as if `roc_stories` were a directory
# holding the extracted archive -- confirm that is what cached_path returns here.
# NOTE(review): 'cloze_test_val__spring201' looks like it may be a truncated
# 'spring2016' -- verify against the archive contents.
train_dataset = load_rocstories_dataset(roc_stories + '/cloze_test_val__spring201/cloze_test_ALL_val.csv')

In [None]:
# Still trying to figure out Attention class from TF docs
import os

# Hide every GPU from the backend. Set before the Keras backend is imported so
# the variable is already in place when the backend looks for devices.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import keras.backend as K
from keras.layers import Dense, Permute
from keras.layers import Layer
# `Model` is the base class below, so it has to be imported in this cell rather
# than relying on a later cell having been run first.
from keras.models import Model


class BahdanauAttention(Model):
    """Additive attention: scores each entry of `values` against `query`.

    Three dense projections are created: `W1` (applied to `values`), `W2`
    (applied to the query) and `V` (maps the combined projection to one score).

    NOTE(review): `call` takes two positional tensors, mirroring the TF
    tutorial this was adapted from -- confirm that the installed Keras
    forwards both arguments through `__call__`.
    """

    def __init__(self, units):
        """Create the projections; `units` is the width of `W1` and `W2`."""
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units, name="W1_mat")
        self.W2 = Dense(units, name="W2_mat")
        self.V = Dense(1, name="V_mat")

    def call(self, query, values):
        """Return `(context_vector, attention_weights)` for `query` over `values`."""
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = K.expand_dims(query, 1)  # originally `tf.expand_dims`

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(K.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = K.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        # Backend-level sum over axis 1 (was `K.tf.reduce_sum`), in line with
        # the other `K.*` calls in this class.
        context_vector = K.sum(context_vector, axis=1)

        return context_vector, attention_weights

In [None]:
from keras.layers import GRU, Embedding, Bidirectional, Input
from keras.models import Model

# Toy encoder used to produce inputs for the attention layer:
# variable-length integer ids -> 128-d embeddings -> GRU.
in_layer = Input(shape=(None,))
embedding = Embedding(input_dim=20000, output_dim=128)(in_layer)
# return_sequences + return_state: the GRU call hands back two tensors, which a
# later cell unpacks as `gru_output, gru_state`.
gru_out = GRU(units=128, return_sequences=True, return_state=True)(embedding)

# The model exposes both GRU tensors as its outputs.
gru_model = Model(inputs=in_layer, outputs=gru_out)

In [None]:
import numpy as np

x_fake = np.random.randint(0, 20000, size=(10, 30))
print(x_fake.shape)

In [None]:
gru_output, gru_state = gru_model(K.tf.convert_to_tensor(x_fake))
print(gru_output.shape, type(gru_output))
print(gru_state.shape, type(gru_state))

In [None]:
att_layer = BahdanauAttention(units=10)

In [None]:
att_results, att_weights = att_layer(gru_state, gru_output)

In [None]:
att_result, att_weights = att_layer(sample_hidden, sample_outputs)

In [None]:
a = np.random.normal(size=(4, 4))
b = np.random.normal(size=(4, 4))
print(a.shape, b.shape)

In [None]:
def combine(x, y, alpha=0.2):
    """Return `x` plus `y` scaled by `alpha` (`x + alpha * y`)."""
    return x + (alpha * y)

In [None]:
combine(a, b)

In [None]:
combine(a, b, alpha=1)

In [None]:
import tensorflow_datasets as tfds

# Filename prefix of a previously saved subword encoder.
bpe_tok_path = '/data/users/kyle.shaffer/dialog_data/polar_movie_combined_bpe.tok'

bpe_tok = tfds.features.text.SubwordTextEncoder.load_from_file(bpe_tok_path)
# The former `print(len(token_to_index))` is dropped: `token_to_index` is only
# created in a later cell, so it raised NameError on a fresh kernel, and it
# describes a different vocabulary than this encoder.
# NOTE(review): the `+ 2` presumably reserves ids for extra special tokens -- confirm.
print('BPE vocab size:', bpe_tok.vocab_size + 2)

In [None]:
def load_cakechat_data_with_tok(data_path):
    """Collect the stripped `text` of every utterance in a JSON-lines dialog file.

    Each non-blank line of the file is parsed as JSON and iterated; every
    item's ``'text'`` value is stripped and appended to the result. Despite
    the function name, no tokenization happens here. A carriage-return
    progress message is written to stdout for every line read.

    Args:
        data_path: Path to the file; one JSON document per line.

    Returns:
        A flat list of utterance strings in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON (raised by ``json.loads``).
    """
    # Imported locally so this cell does not depend on a later cell having
    # brought `sys` / `json` into the kernel namespace.
    import json
    import sys

    utterances = []
    with open(data_path, mode='r') as infile:
        for ix, line in enumerate(infile):
            sys.stdout.write('\r Loading line {}...'.format(ix))
            record = line.strip()
            if not record:
                # blank (e.g. trailing) lines carry no dialog; json.loads would reject them
                continue
            utterances.extend(utt['text'].strip() for utt in json.loads(record))

    return utterances

# Untokenized train / validation dialog files (absolute, machine-specific paths).
train_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie/cakechat_model/corpora_processed/train_no_tok.txt'
valid_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie/cakechat_model/corpora_processed/valid_no_tok.txt'
train_lines = load_cakechat_data_with_tok(train_path)
valid_lines = load_cakechat_data_with_tok(valid_path)

# The subword vocabulary is built from both splits together.
all_lines = train_lines + valid_lines

# Build a subword encoder from the combined utterances, targeting a 20000-entry vocabulary.
bpe = tfds.features.text.SubwordTextEncoder.build_from_corpus(all_lines, target_vocab_size=20000)

In [None]:
bpe.subwords

In [None]:
bpe.decode(bpe.encode("This is definitely a sentence."))

In [None]:
bpe.save_to_file('/data/users/kyle.shaffer/dialog_data/movie_bpe.tok')

# Troubleshooting Manual Loss Computation

In [None]:
import os
import sys
sys.path.append('/home/kyle.shaffer/dialog_model_experiments/src/lm_exp')

from inference_models import *

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [None]:
# `json` is imported here explicitly; no other visible cell imports it by name.
import json

max_len = 45

model_dir = '/data/users/kyle.shaffer/chat_models'
# Hack code for loading necessary data for 3rd party functions...
vocab_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie/cakechat_model/tokens_index/t_idx_processed_dialogs.json'
conditions_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie/cakechat_model/conditions_index/conditions_index.json'

# token -> index mapping from disk, plus the inverse mapping with int keys
with open(vocab_path, mode='r') as infile:
    token_to_index = json.load(infile)
    index_to_token = {int(v): k for k, v in token_to_index.items()}

# index -> condition mapping; JSON object keys are strings, so cast them to int
with open(conditions_path, mode='r') as infile:
    index_to_condition = json.load(infile)
    index_to_condition = {int(k): v for k, v in index_to_condition.items()}
    print(index_to_condition)

condition_to_index = {v: k for k, v in index_to_condition.items()}

# NOTE(review): `load_conditioned_dataset` is not defined in this notebook; it
# presumably arrives via `from inference_models import *` -- confirm.
valid_name = 'valid_no_tok'
valid_data = load_conditioned_dataset(valid_name, token_to_index, condition_to_index, use_gpt_tok=True)

In [None]:
# s2s_path = os.path.join(model_dir, 'hierarch_cakechat_50_4.00.h5')
s2s_path = os.path.join(model_dir, 'hierarch_cakechat_12_4.08.h5')
lm_path = os.path.join(model_dir, 'movie_lm_07_4.13.h5')

fuse_model = FuseModel(s2s_path, lm_path)

In [None]:
datagen = fuse_model._get_batch_generator(input_data=(valid_data.x, valid_data.y), batch_size=10)

In [None]:
x, y = next(datagen)
# print(x.shape, y.shape)

In [None]:
x[0][0]

In [None]:
x[1][0]

In [None]:
y_out = np.argmax(fuse_model.s2s_model.predict(x), axis=-1)
print(y_out.shape)

In [None]:
y_out[1]

In [None]:
gpt_tok.encoder['_pad_'] = 0
gpt_tok.encoder['<unk>'] = 50000

In [None]:
toks = gpt_tok.tokenize("I think you're totally right.")
print(toks)
toks.insert(0, '_start_')
toks.append('_delimiter_')
print(toks)
print(gpt_tok.convert_tokens_to_ids(toks))

In [None]:
x, y = next(datagen)
s2s_logits = fuse_model.s2s_model.predict_on_batch(x)
print(type(s2s_logits))
print("Logits shape:", s2s_logits.shape)

loss_ = K.sparse_categorical_crossentropy(K.variable(y), K.variable(s2s_logits), from_logits=True)
print(type(loss_))
print("Loss shape:", loss_.shape)
loss_reduce = K.eval(K.tf.reduce_mean(loss_, axis=1))
print("Loss reduce type:", type(loss_reduce))
print("Loss reduce shape:", loss_reduce.shape)
print("Min:", loss_reduce.min())
print("Max:", loss_reduce.max())
print("Mean:", loss_reduce.mean())

In [None]:
keras_loss = fuse_model.s2s_model.test_on_batch(x, y)
print('Loss from Keras:', keras_loss)

In [None]:
print(x[1].shape)
x[1]

In [None]:
y.shape

In [None]:
s2s_logits.shape

In [None]:
loss_ = K.sparse_categorical_crossentropy(K.variable(y[:, :5]), K.variable(s2s_logits[:, :5, :]), from_logits=True)
loss = K.eval(loss_)
print(loss)
np.mean(loss)

In [None]:
loss_ = K.eval(K.sparse_categorical_crossentropy(K.variable(y), K.variable(s2s_logits), from_logits=True))

In [None]:
loss_.shape

In [None]:
unmasked_loss = loss_[:, :5]
unmasked_loss

In [None]:
np.mean(unmasked_loss.squeeze())

In [None]:
np.argmax(s2s_logits, axis=-1)

In [None]:
s2s_logits[:, :4].shape

In [None]:
y[:,:4]

In [None]:
y

In [None]:
len(y[y != 0])

In [None]:
# Code for manual computation of S2S loss
# Manual score: 3.3555065393447876
# Keras score: 2.938905715942383

def masked_categorical_crossentropy(y_true, y_pred, mask_value=0):
    """Categorical cross-entropy averaged over the unmasked positions only.

    A position is treated as masked when every entry along the last axis of
    `y_true` equals `mask_value`.

    Args:
        y_true: Target tensor; the last axis is the one checked against `mask_value`.
        y_pred: Prediction tensor passed to `K.categorical_crossentropy`.
        mask_value: Value that marks a fully padded position (default 0).

    Returns:
        Scalar: the sum of the masked losses divided by the number of unmasked
        positions, or 0 when every position is masked.
    """
    # 1.0 where the target row is NOT entirely `mask_value`, 0.0 where it is
    mask = K.all(K.equal(y_true, mask_value), axis=-1)
    mask = 1 - K.cast(mask, K.floatx())

    # zero out the loss at masked positions
    loss = K.categorical_crossentropy(y_true, y_pred) * mask

    # Average over the unmasked entries. The epsilon floor only matters when
    # nothing is unmasked, where a plain division would give 0/0 = NaN.
    return K.sum(loss) / K.maximum(K.sum(mask), K.epsilon())

def test_manual_loss(fuse_model, use_keras=False, batch_size=1, steps=10):
    datagen = fuse_model._get_batch_generator(input_data=(valid_data.x, valid_data.y), batch_size=batch_size)

    total_loss = 0

    for i in range(steps):
        x, y = next(datagen)
        if use_keras:
            step_loss = fuse_model.s2s_model.test_on_batch(x, y)
            print('Step loss:', step_loss)
            # total_loss += fuse_model.s2s_model.test_on_batch(x, y)
            total_loss += step_loss
        else:
            logits = fuse_model.s2s_model.predict_on_batch(x)
            # loss = K.eval(K.sparse_categorical_crossentropy(K.variable(y), K.variable(logits), from_logits=True))
            
            # Get mask here
            mask = K.all(K.equal(K.variable(y), 0), axis=-1)
            mask = 1 - K.cast(mask, K.floatx())
            losses = K.sparse_categorical_crossentropy(K.variable(y), K.variable(logits), from_logits=True)
            losses = losses * mask
            
            print("Loss shape:", losses.shape)
            loss = K.sum(losses) / K.sum(mask)
            step_loss = K.eval(loss)
            print('Step loss:', step_loss)
            total_loss += K.eval(loss)
            
            
            # batch_loss = 0

            # lengths = []
            # for ix in range(y.shape[0]):
            #     y_i = y[ix]
            #     lengths.append(len(y_i[y_i != 0]))
            
            # print('Lengths:', lengths)
                
            # loss_mask = K.tf.sequence_mask(lengths, K.tf.to_int32(y.shape[1]))
            # print('Loss mask shape:', loss_mask.shape)
            # losses = loss * K.tf.to_float(loss_mask)
            # np_losses = K.eval(losses)
            # mean_loss = 0
            # for i in range(np_losses.shape[0]):
            #     mean_loss += np.mean(np_losses[i].squeeze())
            # total_loss += mean_loss

    return total_loss / steps

In [None]:
test_manual_loss(fuse_model, use_keras=True, batch_size=10, steps=2)

In [None]:
test_manual_loss(fuse_model, use_keras=False, batch_size=10, steps=2)

In [None]:
loss[0][:2]

In [None]:
row

In [None]:
print(logits.shape, loss.shape)

In [None]:
# Code for manual computation of LM loss
# Draws `steps` batches, runs the language model on the second input array of
# each batch, and averages the per-row mean cross-entropy.
# Later cells inspect `logits`, `loss`, `row`, `x_in` and `y` left behind by this loop.

datagen = fuse_model._get_batch_generator(input_data=(valid_data.x, valid_data.y), batch_size=1)

total_loss = 0
steps = 1

for i in range(steps):
    x, y = next(datagen)
    # work on a copy so the generator's array is not modified in place
    x_in = x[1].copy()
    # NOTE(review): 40479 is presumably a special-token id the LM should not see;
    # it is remapped to 0 here -- confirm what the id stands for.
    x_in[x_in == 40479] = 0
    logits = fuse_model.lm_model.predict_on_batch(x_in)
    # NOTE(review): `y.squeeze()` drops every size-1 axis, which with
    # batch_size=1 includes the batch axis -- confirm this is intended.
    loss = K.eval(K.sparse_categorical_crossentropy(K.variable(y.squeeze()), K.variable(logits), from_logits=True))
    
    # mean loss per row, then the mean of those row means
    row_c = 0
    batch_loss = 0
    for row in loss:
        sample_loss = np.mean(row.squeeze())
        batch_loss += sample_loss
        row_c += 1

    total_loss += (batch_loss / row_c)
    
total_loss / steps

In [None]:
print(logits.shape, y.shape, loss.shape)

In [None]:
row

In [None]:
loss.shape

In [None]:
# Same batches and preprocessing as the manual LM-loss cell, but the loss comes
# from Keras' own `test_on_batch`, for comparison.
# A fresh generator is created here, so iteration restarts for this cell.
datagen = fuse_model._get_batch_generator(input_data=(valid_data.x, valid_data.y), batch_size=1)

total_loss = 0
steps = 1

for i in range(steps):
    x, y = next(datagen)
    # copy before editing so the generator's array is left untouched
    x_in = x[1].copy()
    # NOTE(review): 40479 is presumably a special-token id remapped to 0 -- confirm.
    x_in[x_in == 40479] = 0
    
    batch_loss = fuse_model.lm_model.test_on_batch(x_in, y)
    total_loss += batch_loss
    
total_loss / steps

In [None]:
batch_loss

In [None]:
print(x_in.shape, y.shape)

In [None]:
np.argmax(logits, axis=-1)[1]

In [None]:
print(x_in.shape, y.shape)

In [None]:
x_in[1]

In [None]:
y[1]

In [None]:
print(logits.shape, loss.shape)

## Trying to build separate inference graph

In [None]:
lm = fuse_model.lm_model

In [None]:
s2s = fuse_model.s2s_model

In [None]:
del fuse_model

In [None]:
s2s

In [None]:
from keras.layers import Add
from keras.models import Model

In [None]:
s2s.inputs

In [None]:
new_lm_output = lm(s2s.inputs[1])

In [None]:
# `Lambda` is used below, but only `Add` and `Model` are imported explicitly by
# this notebook, so import it in the cell that needs it.
from keras.layers import Lambda

# Scale the language-model output by 0.1, then add it to the seq2seq output.
lm_weight_layer = Lambda(lambda x: 0.1 * x)(new_lm_output)
add_logits_layer = Add()([s2s.output, lm_weight_layer])
# The fused graph takes the seq2seq model's inputs and emits the summed output.
inf_graph = Model(inputs=s2s.inputs, outputs=add_logits_layer)
inf_graph.summary()

In [None]:
def sparse_loss(y_true, y_logits):
    """Sparse categorical cross-entropy of `y_logits` against `y_true`, with `from_logits=True`."""
    xent = K.sparse_categorical_crossentropy(y_true, y_logits, from_logits=True)
    return xent

In [None]:
inf_graph.compile(loss=sparse_loss, optimizer='adam')

In [None]:
x, y = next(datagen)
print(len(x), y.shape)

In [None]:
s2s.test_on_batch(x, y)

In [None]:
lm.test_on_batch(x[1], y)

In [None]:
inf_graph.test_on_batch(x, y)

In [None]:
s2s_out = s2s.predict_on_batch(x)

In [None]:
comb_out = inf_graph.predict_on_batch(x)

In [None]:
y

In [None]:
comb_tok_ids = np.argmax(comb_out, axis=-1)[0]

In [None]:
gpt_tok.decode(comb_tok_ids)

In [None]:
s2s_tok_ids = np.argmax(s2s_out, axis=-1)[0]
gpt_tok.decode(s2s_tok_ids)

In [None]:
# Print the candidate weights 0.1, 0.2, ... in steps of 0.1, each rounded to two decimals.
candidate_weights = np.arange(0.1, 1.1, 0.1)
for weight in candidate_weights:
    print(np.round(weight, 2))