In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
movie_vocab_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie_vocab.txt'
polar_vocab_path = '/data/users/kyle.shaffer/dialog_data/polar_vocab.txt'

# Movie vocab: tab-separated rows of (word, count, dialog id).
# NOTE(review): with pandas' default NA handling, tokens such as 'null' or
# 'nan' are parsed as missing values rather than words -- presumably why null
# words have to be dropped further down. Confirm whether
# keep_default_na=False is wanted here.
movie_vocab = pd.read_csv(movie_vocab_path, sep='\t', encoding='utf8', names=['word', 'w_count', 'dialog_id'])

# Polar vocab: tab-separated rows of (word, count), parsed by hand so that
# every token stays a plain string. Read with the same explicit encoding as
# the movie vocab instead of the platform default.
polar_voc = []

with open(polar_vocab_path, mode='r', encoding='utf8') as infile:
    for line in infile:
        w, w_c = line.strip().split('\t')
        polar_voc.append([w, int(w_c)])

# Passing the column names up front also keeps an empty file from failing
# on a later column assignment.
polar_vocab = pd.DataFrame(polar_voc, columns=['word', 'w_count'])
del polar_voc
# Keep only words seen at least 10 times.
polar_vocab = polar_vocab[polar_vocab.w_count >= 10]

print(movie_vocab.shape)
print(polar_vocab.shape)

print('Number of overlapping words:', len(set(movie_vocab.word.values) & set(polar_vocab.word.values)))

In [None]:
# Shape of the movie vocab restricted to words seen at least 10 times.
movie_vocab.loc[movie_vocab['w_count'] >= 10].shape

In [None]:
# Movie words that do not appear in the (count >= 10) polar vocab.
# Build the lookup set once; constructing it inside the comprehension
# rebuilt the whole set for every movie word.
polar_words = set(polar_vocab.word.values)
non_overlap_words = [w for w in movie_vocab.word.values if w not in polar_words]
print(len(non_overlap_words))

In [None]:
# Rows of the movie vocab whose word is missing from the polar vocab.
non_overlap_lookup = set(non_overlap_words)
new_movie_word_df = movie_vocab.loc[movie_vocab['word'].isin(non_overlap_lookup)]
print(new_movie_word_df.shape)
new_movie_word_df.head(10)

In [None]:
# Larger preview (first 40 rows) of the movie-only words.
new_movie_word_df.iloc[:40]

In [None]:
# Sanity check: a capitalised (not fully upper-case) word is not `isupper`.
str.isupper('This')

In [None]:
# `string` is only imported by a later cell, so on a fresh kernel this cell
# raised NameError; import it here so the notebook runs top to bottom.
import string

# Scratch check: is there at least one non-punctuation character in `s`?
s = '---a'
punct = set(string.punctuation)
has_non_punct = not all(c in punct for c in s)
has_non_punct

In [None]:
import string

def filter_vocab(input_vocab):
    """Return the words of ``input_vocab`` that pass all vocabulary filters.

    A word is dropped when any of the following holds:
      * ``word.isupper()`` is true,
      * it is one of the hard-coded cut tokens,
      * every character is ASCII punctuation (this also drops the empty string).

    Order of the surviving words is preserved.

    Raises:
        AssertionError: if any item of ``input_vocab`` is not a ``str``.
    """
    non_string_total = sum(1 for item in input_vocab if not isinstance(item, str))
    assert non_string_total == 0, '{} non-string items found...'.format(non_string_total)

    punctuation_chars = set(string.punctuation)
    excluded_tokens = {'<SOD>', '<EOD>', '``', "''", 'alL'}

    kept = []
    for word in input_vocab:
        if word.isupper():
            continue
        if word in excluded_tokens:
            continue
        if all(ch in punctuation_chars for ch in word):
            continue
        kept.append(word)

    return kept

In [None]:
# Drop rows whose word is missing, reporting the shape before and after.
print(new_movie_word_df.shape)
new_movie_word_df = new_movie_word_df.dropna(subset=['word'])
print('New shape:', new_movie_word_df.shape)
new_movie_word_df.tail(20)

In [None]:
# Run the vocabulary filters over the movie-only words.
candidate_words = new_movie_word_df['word'].tolist()
new_movie_words = filter_vocab(candidate_words)
print(len(new_movie_words))

In [None]:
# Keep only the rows whose word survived the filters.
kept_words = set(new_movie_words)
keep_mask = new_movie_word_df['word'].isin(kept_words)
movie_vocab_filter = new_movie_word_df.loc[keep_mask]
print(movie_vocab_filter.shape)
movie_vocab_filter.head(10)

## Munging Work: Detokenize and Retokenize the Movie Corpus

In [None]:
from nltk.tokenize import TweetTokenizer

# Tokenizer used to retokenize the corpus; try it on a sample sentence.
tok = TweetTokenizer()
sample_sentence = "I'm sorry to hear about your friends."
tok.tokenize(sample_sentence)

In [None]:
from sacremoses import MosesDetokenizer

# Detokenizer used to undo the corpus' existing tokenization; try it on a
# whitespace-tokenized sample.
detok = MosesDetokenizer(lang='en')
sample_tokens = "I 'm sorry to hear about your friends .".split()
detok.detokenize(sample_tokens)

In [None]:
cornell_train_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie_dialog_no_context_train.txt'
cornell_valid_path = '/data/users/kyle.shaffer/dialog_data/cornell_movie_dialog_no_context_valid.txt'

# Open both corpus files for reading (train first, then valid). The handles
# are left open here; nothing in this cell closes them.
cornell_train_file, cornell_valid_file = (
    open(path, mode='r') for path in (cornell_train_path, cornell_valid_path)
)

In [None]:
from tqdm import tqdm


def retokenize_file(src_path, dst_path):
    """Detokenize and retokenize every utterance of a dialog file.

    Each input line is ``left<TAB>right<TAB>conv_id``. Both utterances are
    detokenized with ``detok`` and retokenized with ``tok`` (both defined in
    earlier cells); tokens are re-joined with single spaces and the
    conversation id is copied through unchanged.

    The source is opened by path, so the cell can be re-run safely. Iterating
    a handle opened in another cell produced an empty output file, or raised
    on a closed file, the second time around.
    """
    with open(src_path, mode='r') as infile, open(dst_path, mode='w') as outfile:
        for line in tqdm(infile):
            left, right, conv_id = line.strip().split('\t')
            left_string = detok.detokenize(left.strip().split())
            right_string = detok.detokenize(right.strip().split())

            left_new = ' '.join(tok.tokenize(left_string))
            right_new = ' '.join(tok.tokenize(right_string))

            outfile.write('\t'.join((left_new, right_new, conv_id)))
            outfile.write('\n')


# The vocabulary count further down reads BOTH *_retok.txt files, so the
# training split has to be produced here as well, not just validation.
retokenize_file(cornell_valid_path,
                '/data/users/kyle.shaffer/dialog_data/cornell_movie_dialog_no_context_valid_retok.txt')
retokenize_file(cornell_train_path,
                '/data/users/kyle.shaffer/dialog_data/cornell_movie_dialog_no_context_train_retok.txt')

# Release the handles opened in the previous cell; they are no longer needed.
cornell_valid_file.close()
cornell_train_file.close()

In [None]:
from collections import Counter

# Token frequencies over both retokenized splits (valid first, then train).
vocab_cnt = Counter()

retok_files = [
    ('/data/users/kyle.shaffer/dialog_data/cornell_movie_dialog_no_context_valid_retok.txt',
     'Done with valid set...'),
    ('/data/users/kyle.shaffer/dialog_data/cornell_movie_dialog_no_context_train_retok.txt',
     'Done with training set...'),
]

for retok_path, done_message in retok_files:
    with open(retok_path, mode='r') as infile:
        for line in infile:
            # Third field is the conversation id; it is not counted.
            left, right, _ = line.strip().split('\t')
            for utterance in (left, right):
                vocab_cnt.update(utterance.split())

    print(done_message)

In [None]:
# Number of distinct tokens counted across both splits.
distinct_token_total = len(vocab_cnt)
distinct_token_total

In [None]:
# One row per distinct token: the token and its frequency.
vocab_items = list(vocab_cnt.items())
movie_vocab_df = pd.DataFrame(vocab_items, columns=['word', 'w_count'])
print(movie_vocab_df.shape)
movie_vocab_df.head()

In [None]:
# Preview of tokens seen at least 10 times.
at_least_10 = movie_vocab_df['w_count'] >= 10
movie_vocab_df.loc[at_least_10].head(10)

In [None]:
# Preview of tokens seen at least 6 times.
at_least_6 = movie_vocab_df['w_count'] >= 6
movie_vocab_df.loc[at_least_6].head(10)

In [None]:
# Number of distinct lower-cased tokens among those seen at least 6 times.
frequent_words = movie_vocab_df.loc[movie_vocab_df['w_count'] >= 6, 'word']
frequent_words.str.lower().nunique()

In [None]:
# Share of all token occurrences covered by tokens seen at least 6 times.
covered_occurrences = movie_vocab_df.loc[movie_vocab_df['w_count'] >= 6, 'w_count'].sum()
covered_occurrences / movie_vocab_df['w_count'].sum()

In [None]:
# Retokenized movie tokens (count >= 6) that are neither in the polar vocab
# nor dialog boundary markers.
# NOTE: `polar_vocab` must still be the DataFrame built when the vocab files
# were loaded; a later cell rebinds the name to a dict, after which this cell
# can no longer be re-run.
skips = {'<SOD>', '<EOD>'}
# Build the lookup set once instead of once per candidate word.
polar_words = set(polar_vocab.word.values)
frequent_movie_words = movie_vocab_df[movie_vocab_df.w_count >= 6].word.tolist()
non_overlap_words = [w for w in frequent_movie_words
                     if (w not in polar_words) and (w not in skips)]
print(len(non_overlap_words))

In [None]:
# Add a lower-cased copy of each token, then select the rows whose
# lower-cased form is listed in `non_overlap_words`.
# NOTE(review): `non_overlap_words` holds tokens in their original casing,
# while `word_lower` is lower-cased, so an entry that is not already
# lower-case can never match here -- confirm this is intended.
movie_vocab_df['word_lower'] = movie_vocab_df['word'].str.lower()
add_word_mask = movie_vocab_df['word_lower'].isin(non_overlap_words)
add_word_df = movie_vocab_df.loc[add_word_mask]
add_word_df.shape

In [None]:
# Keep one row per lower-cased token and drop rows where it is missing.
add_word_df = (
    add_word_df
    .drop_duplicates(subset='word_lower')
    .dropna(subset=['word_lower'])
)
add_word_df.shape

## Combining Vocabs

In [None]:
# Reloading polar vocab and assigning ID's
def get_vocab(vocab_file, min_freq:int=3):
    """Build a ``word -> integer ID`` mapping from a ``word<TAB>count`` file.

    Words are numbered 1, 2, 3, ... in file order. Lines whose word is a
    special token, or whose count is below ``min_freq``, are skipped. The
    special tokens are then appended: ``<PAD>`` always gets ID 0, each of the
    others gets the current maximum ID plus one.

    Note that the second column is always interpreted as a frequency: IDs are
    assigned by position in the file, never read from it.

    Args:
        vocab_file: path to the tab-separated vocabulary file.
        min_freq: minimum count a word needs in order to be kept.

    Returns:
        dict mapping each kept word and each special token to its ID.

    Raises:
        ValueError: if a non-blank line does not have exactly two
            tab-separated fields or its count is not an integer.
    """
    special_toks = ['<UNK>', '<PAD>', '<s>', '</s>']
    c = {}
    word_idx = 1
    with open(vocab_file, 'r') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            w, count = line.split('\t')
            count = int(count)
            # Strip before the special-token test so that a special token
            # followed by stray whitespace is not numbered as a regular word.
            w = w.strip()
            if (w in special_toks) or (count < min_freq):
                continue
            c[w] = word_idx
            word_idx += 1
    for st in special_toks:
        if st in c:
            continue
        if st == '<PAD>':
            c[st] = 0
        else:
            # default=0 keeps an empty vocabulary from crashing max().
            c[st] = max(c.values(), default=0) + 1
    print('VOCAB SIZE = {}'.format(len(c)))
    return c

# From here on `polar_vocab` is a word -> ID dict (it was a DataFrame in the
# cells above).
polar_vocab = get_vocab(vocab_file=polar_vocab_path, min_freq=10)
largest_polar_id = max(polar_vocab.values())
print(largest_polar_id)

In [None]:
# Append the movie-only words to the polar vocab, numbering them from the
# current maximum ID + 1.
start_add_id = max(polar_vocab.values()) + 1
print(start_add_id)

for add_word in add_word_df.word_lower.tolist():
    # Never renumber a word that already has an ID. Without this guard,
    # re-running the cell (or a word colliding with an existing entry)
    # overwrote the old ID and left gaps in the ID range.
    if add_word in polar_vocab:
        continue
    polar_vocab[add_word] = start_add_id
    start_add_id += 1

print('New vocab size:', len(polar_vocab))
print('Max word ID:', max(polar_vocab.values()))

In [None]:
# Persist the combined vocab as one `word<TAB>id` line per entry.
with open('/data/users/kyle.shaffer/dialog_data/polar_movie_combined_vocab.txt', mode='w') as outfile:
    for word, word_id in polar_vocab.items():
        outfile.write('{}\t{}\n'.format(word, word_id))

# Figuring Out How to Reset the Embedding Layer

In [None]:
# The combined vocab file stores `word<TAB>id`, not `word<TAB>count`.
# Loading it through get_vocab() would treat the ID column as a frequency and
# renumber every word by file position, moving the special tokens to the end
# of the ID range. The pretrained embedding rows are indexed by the original
# IDs, so read the IDs back exactly as they were written.
vocab = {}
with open('/data/users/kyle.shaffer/dialog_data/polar_movie_combined_vocab.txt', mode='r') as infile:
    for line in infile:
        line = line.rstrip('\n')
        if not line:
            continue
        word, word_id = line.split('\t')
        vocab[word] = int(word_id)
print('VOCAB SIZE = {}'.format(len(vocab)))

In [None]:
# Set CUDA_VISIBLE_DEVICES to '-1' for this process.
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})

In [None]:
from keras.models import load_model
import keras.backend as K

# The saved model refers to a custom loss named 'sparse_loss'; supply an
# implementation so that it can be deserialized.
pretrained_model_path = '/data/users/kyle.shaffer/chat_models/lstm_polar_chatbot_epoch34_loss4.107.h5'
custom_losses = {'sparse_loss': lambda x, y: K.sparse_categorical_crossentropy(x, y, True)}

model = load_model(pretrained_model_path, custom_objects=custom_losses)
model.summary()

In [None]:
# First weight array of the layer named 'embedding_1'.
pretrained_embedding_layer = model.get_layer('embedding_1')
orig_embed = pretrained_embedding_layer.get_weights()[0]
print(orig_embed.shape)

In [None]:
# Number of embedding rows missing for the combined vocab.
pretrained_row_total = orig_embed.shape[0]
diff = len(vocab) - pretrained_row_total
print(diff)

In [None]:
# Randomly initialised rows for the newly added words.
# A dedicated seeded generator makes the initialisation reproducible across
# kernel restarts, and the rows are cast to the pretrained matrix's dtype so
# that stacking them does not upcast the whole embedding to float64.
new_embed_rng = np.random.RandomState(42)
new_embed = new_embed_rng.normal(size=(diff, orig_embed.shape[1])).astype(orig_embed.dtype)
print(new_embed.shape)

In [None]:
# Pretrained rows first, new rows appended below them.
combined_embed = np.concatenate([orig_embed, new_embed], axis=0)
print(combined_embed.shape)

In [None]:
# Both pieces now live in `combined_embed`; drop the separate names.
del new_embed, orig_embed

In [None]:
# List every layer with its name, in model order.
for layer in model.layers:
    print(layer.name, layer)

In [None]:
# Kernel and bias of the model's last layer.
orig_logits, orig_bias = model.layers[-1].get_weights()
print(orig_logits.shape)
print(orig_bias.shape)

# Number of output units missing for the combined vocab.
logits_diff = len(vocab) - orig_logits.shape[1]

# Seeded generator for reproducible initialisation; the new columns are cast
# to the pretrained dtypes so that stacking does not upcast to float64.
logits_rng = np.random.RandomState(42)
logits_add = logits_rng.normal(size=(orig_logits.shape[0], logits_diff)).astype(orig_logits.dtype)
print(logits_add.shape)
bias_add = logits_rng.normal(size=logits_diff).astype(orig_bias.dtype)

# Pretrained units first, new units appended after them.
logits_combined = np.hstack((orig_logits, logits_add))
bias_combined = np.hstack((orig_bias, bias_add))
print(logits_combined.shape)
print(bias_combined.shape)

In [None]:
# `Model` lives in keras.models (the module `load_model` is imported from),
# not in keras.layers.
from keras.models import Model
from keras.layers import Embedding, Dense
from keras.initializers import Constant

# One embedding layer, initialised from the combined matrix, applied to both
# inputs below.
new_embed_layer = Embedding(input_dim=combined_embed.shape[0], output_dim=combined_embed.shape[1],
                            embeddings_initializer=Constant(combined_embed), mask_zero=True, trainable=True)

# Re-wire the pretrained layers on top of the new embedding.
# NOTE(review): the layer indices below are positional -- check them against
# the layer listing printed earlier before trusting this wiring.
encoder_embed = new_embed_layer(model.layers[1].output)
encoder_embed = model.layers[3](encoder_embed)
encoder_outputs, _, _ = model.layers[4](encoder_embed)
encoder_outputs2, state_h, state_c = model.layers[6](encoder_outputs)
encoder_states = [state_h, state_c]

decoder_embed = new_embed_layer(model.layers[0].output)
decoder_embed = model.layers[5](decoder_embed)
decoder_outputs = model.layers[7](decoder_embed, initial_state=encoder_states)

# Reconstructing logits: pretrained kernel/bias of the last layer, extended
# with randomly initialised units for the new words.
orig_logits, orig_bias = model.layers[-1].get_weights()
print(orig_logits.shape)
print(orig_bias.shape)

logits_diff = len(vocab) - orig_logits.shape[1]
# Seeded generator for reproducible initialisation; cast to the pretrained
# dtypes so that stacking does not upcast to float64.
logits_rng = np.random.RandomState(42)
logits_add = logits_rng.normal(size=(orig_logits.shape[0], logits_diff)).astype(orig_logits.dtype)
print(logits_add.shape)
bias_add = logits_rng.normal(size=logits_diff).astype(orig_bias.dtype)

logits_combined = np.hstack((orig_logits, logits_add))
bias_combined = np.hstack((orig_bias, bias_add))
print(logits_combined.shape)
print(bias_combined.shape)

logits = Dense(units=len(vocab), activation='linear', name='logits')
logits_out = logits(decoder_outputs)

# `model` is rebound to the rebuilt network, so this cell cannot be re-run
# without reloading the pretrained model first.
model = Model(inputs=[model.layers[1].output, model.layers[0].output], outputs=logits_out)
# Weights can only be set once the layer has been built by the call above.
model.get_layer('logits').set_weights([logits_combined, bias_combined])
model.summary()