## Steps
1. Batch data into a look-back window of $k$ sentences for each "training example" (in the HAN-NMT paper, $k = 3$)
    * This may also need to be done explicitly on the decoder side - it is unclear how easy that is to implement
2. Choose attention type for encoder-decoder comparisons
    * Standard dot-product
    * "Cross-attention": http://www.aclweb.org/anthology/P18-1103

## Model-testing

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import numpy as np

from models import *

Using TensorFlow backend.


In [None]:
class DummyArgs:
    """Minimal attribute container passed as the first argument to ``Transformer``.

    All values are class-level placeholders used for the smoke test in this
    notebook; no files are read because both file paths are empty strings.
    """

    # Name under which the run is identified.
    model_name: str = 'test'

    # Data file paths (left empty here).
    train_file: str = ''
    valid_file: str = ''

    # Number of examples in each split.
    n_train_examples: int = 100000
    n_valid_examples: int = 60000

    # Training schedule.
    n_epochs: int = 10
    batch_size: int = 128
    
dummy_args = DummyArgs()

In [None]:
import string
print(string.printable)

# Character-level token set: every character in string.printable.
# The same set object is used for input and output tokens.
i_tokens = set(string.printable)
o_tokens = i_tokens
# Enumerate the tokens in sorted order. Iterating the set directly would give
# ids that depend on string hash randomisation, i.e. a different token -> id
# mapping in every interpreter session.
vocab = {tok: idx for idx, tok in enumerate(sorted(i_tokens))}
len_limit = 100
transformer = Transformer(dummy_args, vocab=vocab, len_limit=len_limit, d_model=256,
                          d_inner_hid=512, n_head=4, d_k=64, d_v=64, layers=2, dropout=0.1, share_word_emb=True)

In [None]:
# 20 rows of 50 random integer ids each, drawn from [0, 100).
dummy_x = np.stack([np.random.randint(low=0, high=100, size=50) for _ in range(20)])
dummy_x.shape

In [None]:
transformer.output_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
transformer.output_model.fit([dummy_x, dummy_x])

In [None]:
hist = transformer.model.fit([dummy_x, dummy_x])

In [None]:
hist.history

In [None]:
hist.history['loss'][0]

In [None]:
y = transformer.output_model.predict([dummy_x, dummy_x])

In [None]:
y.shape

In [None]:
# Testing loss/perplexity output
y_loss = transformer.model.evaluate([dummy_x, dummy_x])
y_loss

## Re-munging Data into Context - Response Pairs

In [None]:
import os
import pandas as pd
import numpy as np

data_path = '/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator'
# train_df = pd.read_csv(os.path.join(data_path, 'src/train_seq.tsv'), sep='\t', encoding='utf8')
# print(train_df.shape)

# train_df.head(10)

In [None]:
df_train_orig = pd.read_csv(os.path.join(data_path, 'src/train.csv'), encoding='utf8')
print(df_train_orig.shape)
df_train_orig.head(10)

In [None]:
df_valid = pd.read_csv(os.path.join(data_path, 'src/test.csv'), encoding='utf8')
print(df_valid.shape)
df_valid.head()

In [None]:
# Keep only the rows labelled 1. Take an explicit copy: the following cells add
# and drop columns on this frame, and doing that on a boolean-mask selection
# makes pandas emit SettingWithCopyWarning.
df_train_orig = df_train_orig[df_train_orig.Label == 1].copy()

In [None]:
df_train_orig['context'] = '<SOD> ' + df_train_orig.Context
df_train_orig.head(10)

In [None]:
df_train_orig.drop('Context', axis=1, inplace=True)
print(df_train_orig.shape)
df_train_orig.head()

In [None]:
df_train_orig.drop('Label', axis=1, inplace=True)
df_train_orig.head()

In [None]:
df_train_orig['response'] = df_train_orig.Utterance + ' <EOD>'
print(df_train_orig.shape)
df_train_orig.head()

In [None]:
df_train_orig.drop('Utterance', axis=1, inplace=True)
df_train_orig.head()

In [None]:
df_train_orig.context.values[0].split('__eot__')

In [None]:
df_train_orig['response'] = df_train_orig.response.apply(lambda x: ' '.join(x.split()[:-1]) + ' __eot__ ' + ' <EOD>')
print(df_train_orig.head())
df_train_orig.response.values[:5]

In [None]:
df_train_orig['dialog'] = df_train_orig.context + ' ' + df_train_orig.response
print(df_train_orig.head(10))

df_train_orig.dialog.values[0]

In [None]:
df_train_orig.dialog.values[10].split('__eot__')

In [None]:
df_train_orig.head()

In [None]:
import sys

def create_context_windows(df, window=2, response_col='Utterance'):
    """Slice every dialog in ``df`` into windows of consecutive turns.

    Each row's ``Context`` is prefixed with ``<SOD>``, its response is suffixed
    with ``<EOD>``, and both are concatenated into one dialog string. The
    dialog is split on ``__eot__``; starting at each turn in order, up to
    ``window`` consecutive turns are joined with a space into one window
    string. For a given dialog, no further windows are produced after the
    first window whose last token is ``<EOD>``.

    A dialog that contains no ``__eot__`` at all is returned unchanged as a
    single window.

    Note:
        ``df`` is modified in place: the columns ``context``, ``response`` and
        ``dialog`` are added (or overwritten). A progress counter is written
        to stdout for every dialog.

    Args:
        df: DataFrame with a ``Context`` column and a ``response_col`` column.
        window: Maximum number of turns joined into one window.
        response_col: Name of the column that holds the response text.

    Returns:
        list[str]: All window strings, in dialog order.
    """
    # Add SOD and EOD tags
    df['context'] = '<SOD> ' + df.Context
    df['response'] = df[response_col] + ' <EOD>'
    df['dialog'] = df.context + ' ' + df.response

    batched_dialogs = []
    for d_i, d in enumerate(df.dialog.tolist()):
        sys.stdout.write('\r Processing {}...'.format(d_i))
        # `tail` is whatever follows the last '__eot__': the closing '<EOD>'
        # plus any response text that was not terminated by a turn marker.
        *d_turns, tail = d.split('__eot__')
        if not d_turns:
            # No turn separator: there is no turn to attach the tail to
            # (indexing d_turns[-1] would raise IndexError), so the whole
            # dialog is its own single window.
            batched_dialogs.append(d)
            continue

        # Restore the separator that split() removed, then re-attach the tail.
        d_turns = [turn + ' __eot__' for turn in d_turns]
        d_turns[-1] += tail

        for start in range(len(d_turns)):
            window_string = ' '.join(d_turns[start:start + window])
            batched_dialogs.append(window_string)
            # Once a window reaches the end of the dialog, later start
            # positions would only yield shorter suffixes of it.
            if window_string.split()[-1] == '<EOD>':
                break

    return batched_dialogs

In [None]:
batched_dialogs = create_context_windows(df_valid, window=3, response_col='Ground Truth Utterance')
batched_dialogs[:10]

In [None]:
len(batched_dialogs)

In [None]:
batched_dialogs[0]

In [None]:
batched_dialogs[1]

In [None]:
batched_dialogs[2]

In [None]:
batched_dialogs[:10]

In [None]:
def create_pairs_data(dialog_list: list):
    """Split windowed dialog strings into (context, response) pairs.

    Each string is split on ``__eot__``. The last turn becomes the response
    and all earlier turns become the context; every turn gets its ``__eot__``
    marker appended again. When a window closes its dialog (its last token is
    ``<EOD>``), the tag is placed after the response's ``__eot__`` so that
    ``<EOD>`` is always the final token of the response.

    Args:
        dialog_list: Window strings whose turns are separated by ``__eot__``.

    Returns:
        pandas.DataFrame: columns ``context`` and ``response``, one row per
        window that contains at least one turn. ``context`` is the empty
        string for single-turn windows.
    """
    context, response = [], []
    for d in dialog_list:
        d_turns = [t for t in d.strip().split('__eot__') if t != '']
        if not d_turns:
            # Nothing but whitespace/separators: no pair can be built.
            continue

        # The end-of-dialog test must look at the last *token*. Indexing the
        # string with [-1] yields a single character, which can never equal
        # '<EOD>'.
        last_tokens = d_turns[-1].split()
        ends_dialog = bool(last_tokens) and last_tokens[-1] == '<EOD>'
        if ends_dialog:
            # Text of the last turn with the trailing '<EOD>' removed.
            body = d_turns[-1].rstrip()[:-len('<EOD>')].rstrip()
            if not body.strip() and len(d_turns) > 1:
                # A bare '<EOD>' after the final '__eot__': the previous turn
                # is the actual response.
                d_turns.pop()
            else:
                d_turns[-1] = body

        context_turn = [c + ' __eot__' for c in d_turns[:-1]]
        response_turn = d_turns[-1] + ' __eot__'
        if ends_dialog:
            response_turn += ' <EOD>'

        context.append(' '.join(context_turn))
        response.append(response_turn.strip())

    return pd.DataFrame({'context': context, 'response': response})

In [None]:
context_pairs = create_pairs_data(dialog_list=batched_dialogs)
print(context_pairs.shape)
context_pairs.head(10)

In [None]:
context_pairs.response.tolist()[:10]

In [None]:
# For every response that contains '<EOD>', drop all '<EOD>' tokens and append
# a single one at the end; responses without the tag are kept unchanged.
response_fix = [
    ' '.join([w for w in r.strip().split() if w != '<EOD>'] + ['<EOD>']) if '<EOD>' in r else r
    for r in context_pairs.response.tolist()
]

response_fix[:10]

In [None]:
context_pairs['response_fix'] = response_fix
context_pairs.head(10)

In [None]:
# Re-order the last two tokens of dialog-closing responses so that '<EOD>'
# follows '__eot__'. This is restricted to responses that actually contain
# '<EOD>': applied to every row it would cut the last real token off all other
# responses and tag them as dialog ends.
context_pairs['response_fix'] = context_pairs.response.apply(
    lambda x: ' '.join(x.split()[:-2]) + ' __eot__ ' + ' <EOD>' if '<EOD>' in x else x)
context_pairs.head(10)

In [None]:
context_pairs.drop('response', axis=1, inplace=True)
context_pairs.rename(columns={'response_fix': 'response'}, inplace=True)

In [None]:
context_pairs.head()

In [None]:
context_pairs.loc[0].context

In [None]:
context_pairs.loc[0].response

In [None]:
context_pairs.loc[1].context

In [None]:
context_pairs.loc[1].response

In [None]:
context_pairs[context_pairs.context.isnull()].shape

In [None]:
context_pairs['context'] = context_pairs.context.apply(lambda x: x.replace('\t', ' '))
context_pairs['response'] = context_pairs.response.apply(lambda x: x.replace('\t', ' '))
context_pairs.head(10)

In [None]:
context_pairs.to_csv('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/test_seq.tsv',
                    sep='\t', encoding='utf8', index=False)

In [None]:
context_pairs.to_json('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/test_seq.jl',
                     orient='records', lines=True)

## EDA for Max-Length Params

In [None]:
train_df = pd.read_csv('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/train_seq.tsv', sep='\t',
                      encoding='utf8')
print(train_df.shape)

In [None]:
train_df.head(10)

In [None]:
# Build `response_fixed`: for every response that contains '<EOD>', drop all
# '<EOD>' tokens and append a single one at the end. This list is displayed
# below and assigned to `train_df` in the next cell, so it has to be computed
# here - with the loop commented out a fresh run fails with NameError.
response_fixed = []

for r in train_df.response.tolist():
    if '<EOD>' in r.strip():
        r_split = [w for w in r.strip().split() if w != '<EOD>']
        r_split.append('<EOD>')
        response_fixed.append(' '.join(r_split))
    else:
        response_fixed.append(r)

response_fixed[:30]

In [None]:
train_df['response_fixed'] = response_fixed
train_df.head(10)

In [None]:
pd.read_csv('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/valid_seq.tsv', sep='\t').shape

In [None]:
# Keep the context and the repaired response, exposing the latter as
# 'response'. The rename is chained (not in place) so `train_df` is a fresh
# DataFrame; renaming and later adding columns on the bare column selection
# can trigger pandas' SettingWithCopyWarning.
train_df = train_df[['context', 'response_fixed']].rename(columns={'response_fixed': 'response'})
train_df.head(10)

In [None]:
train_df.shape

In [None]:
train_df.to_csv('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/train_seq.tsv',
               sep='\t', encoding='utf8', index=False)

In [None]:
train_df.to_json('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/train_seq.jl', orient='records', lines=True)

In [None]:
# Longest context and longest response, counted in whitespace-separated tokens.
max_context_len = max(len(text.strip().split()) for text in train_df.context.tolist())
max_response_len = max(len(text.strip().split()) for text in train_df.response.tolist())

print(max_context_len)
print(max_response_len)

In [None]:
train_df['context_len'] = train_df.context.apply(lambda x: len(x.strip().split()))
train_df['response_len'] = train_df.response.apply(lambda x: len(x.strip().split()))
train_df.head()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

train_df.context_len.plot(kind='hist', bins=25, figsize=(8, 6))

In [None]:
train_df.response_len.plot(kind='hist', bins=25, figsize=(8, 6))

In [None]:
train_df[train_df.response_len > 700].head(1).response.values

In [None]:
train_df.response.values[:100]

In [None]:
train_df[train_df.context_len < 300].shape[0] / train_df.shape[0]

In [None]:
from collections import Counter

# Whitespace-token frequencies over all contexts first, then all responses.
counter = Counter()
for column in (train_df.context, train_df.response):
    for text in column.tolist():
        counter.update(text.strip().split())

print('Total unique vocab items:', len(counter))

In [None]:
counter['operating']

In [None]:
# Write one "<token>\t<count>" line per vocabulary item, most frequent first.
with open('/data/users/kyle.shaffer/ubuntu-ranking-dataset-creator/data/vocab.txt', encoding='utf8', mode='w') as outfile:
    outfile.writelines('{}\t{}\n'.format(w, c) for w, c in counter.most_common())

In [None]:
counter['<EOD>']

In [None]:
# Two-column frame: each counted token ('word') and its frequency ('freq').
vocab_df = pd.DataFrame({
    'word': list(counter.keys()),
    'freq': list(counter.values()),
})
vocab_df.head()

In [None]:
print(vocab_df.shape)
vocab_df[vocab_df.freq > 2].shape