# Setup Notebook

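Install [ktext](https://github.com/hamelsmu/ktext) and [annoy](https://github.com/spotify/annoy). The install commands in the next cell are commented out; uncomment them if the packages are not already available.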

In [None]:
# Optional dependency install, left commented out. Uncomment both lines to
# install ktext and annoy from inside the notebook.
# !pip install -q ktext
# !pip install -q annoy

In [None]:
import json
from urllib.request import urlopen

from annoy import AnnoyIndex
from keras import optimizers
from keras.layers import Input, Dense, LSTM, GRU, Embedding, Lambda, BatchNormalization
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from ktext.preprocess import processor
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

# Data sets

## [English to French](http://www.manythings.org/anki/)

In [None]:
# Download the fra-eng archive and unpack it into the working directory.
# `unzip -o` overwrites files left over from a previous run without prompting.
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip -o fra-eng.zip

In [None]:
# fra.txt holds one tab-separated sentence pair per line. The first column is
# bound to target_docs and the second to source_docs; any further columns on a
# line are ignored instead of breaking the unpacking.
with open('fra.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Keep exactly one pair per distinct target sentence (the first one seen).
# De-duplicating target_docs on its own with set() broke the pairing with
# source_docs and produced a different order on every run; a dict keeps the
# two lists aligned index-by-index and preserves file order.
first_pair = {}
for line in lines:
    fields = line.strip().split('\t')
    if len(fields) < 2:
        continue  # blank or malformed line: no pair to extract
    first_pair.setdefault(fields[0], fields[1])

target_docs = list(first_pair)
source_docs = list(first_pair.values())

## [CoNaLa](https://conala-corpus.github.io/)

In [None]:
# Download the CoNaLa corpus archive and unpack it into the working directory.
# `unzip -o` overwrites files left over from a previous run without prompting.
!wget http://www.phontron.com/download/conala-corpus-v1.1.zip
!unzip -o conala-corpus-v1.1.zip

In [None]:
# conala-mined.jsonl stores one JSON object per line. Read it as UTF-8
# explicitly (the platform default may differ), stream the file instead of
# materialising it with readlines(), and skip blank lines, which json.loads
# would reject.
with open('conala-corpus/conala-mined.jsonl', 'r', encoding='utf-8') as f:
    lines = [json.loads(l) for l in f if l.strip()]

# Code snippets are the source side, natural-language intents the target side.
source_docs = [line['snippet'] for line in lines]
target_docs = [line['intent'] for line in lines]

In [None]:
# conala-train.json is a single JSON document holding a list of records.
# Decode it as UTF-8 explicitly rather than with the platform default encoding.
with open('conala-corpus/conala-train.json', 'r', encoding='utf-8') as f:
    lines = json.load(f)

# Keep only the records whose rewritten intent is present and non-empty.
test_docs = [line['rewritten_intent'] for line in lines if line['rewritten_intent']]

## GitHub issues data

In [None]:
# Load the GitHub issues dataset straight from the hosted archive into a DataFrame.
# NOTE(review): relies on pandas inferring zip compression from the URL suffix;
# confirm this holds for the pandas version in use.
issues = pd.read_csv('https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip')

In [None]:
# Issue bodies are the source side, issue titles the target side.
source_docs, target_docs = issues['body'].tolist(), issues['issue_title'].tolist()

## Python functions data

In [None]:
def _read_remote_lines(url):
    """Download ``url`` and return its lines decoded as UTF-8.

    Line endings are kept. The HTTP response is closed as soon as the body
    has been read, instead of being left open for the rest of the session.
    """
    with urlopen(url) as response:
        return [raw.decode('utf-8') for raw in response.readlines()]


# Function bodies are the source side, their docstrings the target side.
source_docs = _read_remote_lines('https://storage.googleapis.com/kubeflow-examples/code_search/data/train.function')
target_docs = _read_remote_lines('https://storage.googleapis.com/kubeflow-examples/code_search/data/train.docstring')

## Use subsets

In [None]:
# Cap both corpora at the same number of documents so experiments stay fast.
# A single constant guarantees the two slices can never drift apart.
MAX_DOCS = 10000
source_docs = source_docs[:MAX_DOCS]
target_docs = target_docs[:MAX_DOCS]

# 1: Language Model

## Input Data

In [None]:
# Fit the text processor on the target documents and vectorise them in one go.
# NOTE(review): `hueristic_pct_padding` is spelled the same way everywhere in
# this notebook, so it is presumably the keyword ktext expects; check before
# "correcting" it.
proc = processor(
    hueristic_pct_padding=.7,
    keep_n=20000,
)
vecs = proc.fit_transform(target_docs)

In [None]:
# Size of the embedding / softmax layers: the largest token id plus one.
vocab_size = 1 + max(proc.id2token)
# Padded document length chosen by the processor.
max_length = proc.padding_maxlen

In [None]:
# Expand every document into one training example per prefix: for a document
# whose first real token sits at position p, emit arr[:p+1], arr[:p+2], ...,
# arr[:len(arr)]. The last id of each prefix is the prediction target.
sequences = []
for arr in tqdm(vecs):
    if not arr.any():
        # Padding only: argmax() would report position 0 and we would train on
        # all-zero inputs with a padding target, so skip the row.
        continue
    first_token = (arr != 0).argmax()
    sequences.extend(arr[:end] for end in range(first_token + 1, len(arr) + 1))

# Left-pad the prefixes to a common width. pad_sequences already returns a
# NumPy array, so no further conversion is needed.
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

# Inputs are all columns but the last; targets stay as integer ids, which is
# what the sparse categorical cross-entropy loss used for training consumes.
X, y = sequences[:, :-1], sequences[:, -1]

In [None]:
# Next-token language model: embedding -> LSTM -> softmax over the vocabulary.
# The LSTM returns its full output sequence; only the last timestep is sliced
# off for the prediction head.
lm_input = Input(shape=(max_length - 1,))
lm_embedded = Embedding(vocab_size, 256, input_length=max_length - 1)(lm_input)
lm_states = LSTM(256, return_sequences=True)(lm_embedded)
final_state = Lambda(lambda seq: seq[:, -1, :])(lm_states)
next_token_probs = Dense(vocab_size, activation='softmax')(final_state)

model = Model(lm_input, next_token_probs)
model.summary()

In [None]:
# Targets are integer token ids, hence the sparse variant of the loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# The last 10% of the samples are held out for validation.
history = model.fit(
    X,
    y,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

## Generate sequences

In [None]:
def generate_seq(model, proc, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` tokens predicted by ``model``.

    Args:
        model: object whose ``predict(batch, verbose=0)`` returns one row of
            per-token scores for every input row.
        proc: fitted text processor exposing ``transform`` and ``id2token``.
        max_length: unused; kept so existing callers keep working.
        seed_text: text the generation starts from.
        n_words: number of tokens to append.

    Returns:
        ``seed_text`` followed by the generated tokens, separated by spaces.
        Token id 1 and any id missing from ``proc.id2token`` are rendered as
        ``'_unk_'``.
    """
    in_text = seed_text
    for _ in range(n_words):
        # Re-vectorise the text generated so far and drop the first column.
        # NOTE(review): presumably this trims the processor's padded width to
        # the model's input width (one less); confirm against the processor.
        vec = proc.transform([in_text])[:, 1:]
        index = int(np.argmax(model.predict(vec, verbose=0), axis=1)[0])
        if index == 1:
            out_word = '_unk_'
        else:
            # Ids with no vocabulary entry (e.g. 0) used to raise KeyError here.
            out_word = proc.id2token.get(index, '_unk_')
        in_text += ' ' + out_word
    return in_text

In [None]:
# Smoke test: extend the seed word 'there' with ten greedily predicted tokens.
generate_seq(model, proc, max_length, 'there', 10)

## Generate embeddings

In [None]:
# Reuse the trained language model up to its third-from-last layer. With the
# architecture built earlier in this notebook (Input, Embedding, LSTM, Lambda,
# Dense) that layer is the LSTM. It was created with return_sequences=True, so
# this model outputs one hidden state per timestep.
# The index is positional: adding or removing layers changes what it selects.
embedding_model = Model(inputs=model.inputs, outputs=model.layers[-3].output)

In [None]:
# Embed a single hand-written query to sanity-check the embedding model.
input_sequence = 'def machine learning'
vec = proc.transform([input_sequence])
# Drop the first column, as done wherever this notebook feeds the model.
vec = vec[:, 1:]
embedding_model.predict(vec)

In [None]:
# Vectorise the evaluation sentences with the processor fitted on target_docs.
# This rebinds `vecs`, which previously held the vectorised training documents.
# `test_docs` is only defined by the CoNaLa cell, so that cell must have run.
vecs = proc.transform(test_docs)

In [None]:
# Per-timestep hidden states for every test document. The first column is
# dropped, as in the other cells that feed this model.
hidden_states = embedding_model.predict(vecs[:, 1:])

In [None]:
# Collapse axis 1 of the hidden states in three different ways, giving one
# fixed-size vector per document for each pooling strategy.
mean_vecs = hidden_states.mean(axis=1)
max_vecs = hidden_states.max(axis=1)
sum_vecs = hidden_states.sum(axis=1)

## Build vector indices

In [None]:
# Index the sum-pooled document vectors for approximate nearest-neighbour search.
dimension = hidden_states.shape[-1]
# Name the metric explicitly. 'angular' is what Annoy uses when none is given,
# and newer releases warn about relying on that implicit default.
index = AnnoyIndex(dimension, metric='angular')
for item_id, vector in enumerate(sum_vecs):
    # Item ids equal positions in test_docs, so search results map back directly.
    index.add_item(item_id, vector)
index.build(10)  # forest of 10 trees

In [None]:
# Ten nearest neighbours of indexed item 1000; the distances are requested but
# not used afterwards.
nearest = index.get_nns_by_item(1000, 10, include_distances=True)
ids, _ = nearest
[test_docs[doc_id] for doc_id in ids]

In [None]:
# Pick a random test sentence. random.randint(0, len(test_docs)) includes the
# upper bound and could index one past the end; random.choice cannot.
input_sequence = random.choice(test_docs)
print(input_sequence)

# Embed the sentence the same way the indexed vectors were built: per-timestep
# hidden states, sum-pooled over axis 1.
vec = proc.transform([input_sequence])[:, 1:]
vec = np.sum(embedding_model.predict(vec), axis=1)
# `vec` has a leading batch axis of length 1. Hand Annoy the flat vector
# instead of a transposed 2-D column.
ids, _ = index.get_nns_by_vector(vec[0], 10, include_distances=True)
[test_docs[i] for i in ids]

# 2: Sequence to Sequence Model

In [None]:
# Separate processors (and therefore vocabularies) for the two sides.
source_proc = processor(
    hueristic_pct_padding=.7,
    keep_n=20000,
)
source_vecs = source_proc.fit_transform(source_docs)

# The target side also gets indicator tokens appended and is padded at the end.
target_proc = processor(
    append_indicators=True,
    hueristic_pct_padding=.7,
    keep_n=14000,
    padding='post',
)
target_vecs = target_proc.fit_transform(target_docs)

In [None]:
# Encoder side: the vectorised source documents as they are.
encoder_input_data = source_vecs
encoder_seq_len = encoder_input_data.shape[1]

# Decoder side, teacher forcing: the input is the target sequence without its
# last position and the expected output is the same sequence shifted left by one.
decoder_input_data, decoder_target_data = target_vecs[:, :-1], target_vecs[:, 1:]

# Vocabulary sizes: largest token id plus one.
num_encoder_tokens = 1 + max(source_proc.id2token)
num_decoder_tokens = 1 + max(target_proc.id2token)

## Encoder Model

In [None]:
# Hyper-parameters shared by the encoder and the decoder.
# encoder_seq_len, num_encoder_tokens and num_decoder_tokens come from the
# data-preparation cell; the former `name=name` self-assignments were no-ops
# and have been removed.
word_emb_dim = 800
hidden_state_dim = 1000

# Encoder: token ids -> embeddings -> batch norm -> GRU. Only the GRU's final
# hidden state is kept; it becomes the decoder's initial state.
encoder_inputs = Input(shape=(encoder_seq_len,), name='Encoder-Input')
body_emb = Embedding(num_encoder_tokens, word_emb_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
body_bn = BatchNormalization(name='Encoder-Batchnorm-1')(body_emb)
_, state_h = GRU(hidden_state_dim, return_state=True, name='Encoder-Last-GRU', dropout=.5)(body_bn)

# Wrap the encoder as its own named model so it can be fetched again by name
# from the trained seq2seq model.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

## Decoder Model

In [None]:
# Decoder input: variable-length sequences of target token ids.
decoder_inputs = Input(shape=(None,), name='Decoder-Input')

dec_emb = Embedding(
    num_decoder_tokens,
    word_emb_dim,
    name='Decoder-Word-Embedding',
    mask_zero=False,
)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# The GRU starts from the encoder's final state and returns both the full
# output sequence and its last state. The state is discarded here.
decoder_gru = GRU(
    hidden_state_dim,
    return_state=True,
    return_sequences=True,
    name='Decoder-GRU',
    dropout=.5,
)
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(
    num_decoder_tokens,
    activation='softmax',
    name='Final-Output-Dense',
)
decoder_outputs = decoder_dense(x)

## End to end

In [None]:
# End-to-end training model: takes the encoder and decoder inputs and produces
# the decoder's softmax outputs.
seq2seq_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
batch_size = 1100
epochs = 16

nadam = optimizers.Nadam(lr=0.00005)
seq2seq_model.compile(optimizer=nadam, loss='sparse_categorical_crossentropy')

# The integer targets get a trailing axis of length 1 before being passed to fit().
decoder_targets = np.expand_dims(decoder_target_data, -1)
history = seq2seq_model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
def extract_decoder_model(model):
    """Build a stand-alone decoder from a trained seq2seq ``model``.

    The trained decoder layers are looked up by name and re-wired so that the
    GRU's initial state comes from a new input instead of from the encoder.

    Args:
        model: seq2seq model containing the layers 'Encoder-Model',
            'Decoder-Input', 'Decoder-Word-Embedding', 'Decoder-Batchnorm-1',
            'Decoder-GRU', 'Decoder-Batchnorm-2' and 'Final-Output-Dense'.

    Returns:
        A model mapping ``[decoder token ids, GRU state]`` to
        ``[softmax output, new GRU state]``.
    """
    layer = model.get_layer

    # Width of the encoder output, which is also the size of the GRU state.
    state_size = layer('Encoder-Model').output_shape[-1]

    token_inputs = layer('Decoder-Input').input
    embedded = layer('Decoder-Word-Embedding')(token_inputs)
    normed = layer('Decoder-Batchnorm-1')(embedded)

    # At inference time the state is supplied explicitly on every call.
    state_input = Input(shape=(state_size,), name='hidden_state_input')
    gru_seq, gru_state = layer('Decoder-GRU')([normed, state_input])

    probs = layer('Final-Output-Dense')(layer('Decoder-Batchnorm-2')(gru_seq))
    return Model([token_inputs, state_input], [probs, gru_state])

In [None]:
# Fetch the encoder sub-model back out of the trained seq2seq model by name
# (this rebinds `encoder_model`), and build the matching inference decoder.
encoder_model = seq2seq_model.get_layer('Encoder-Model')
decoder_model = extract_decoder_model(seq2seq_model)
decoder_model.summary()

In [None]:
# Greedy decoding of a single source document.
max_len = target_proc.padding_maxlen
raw_input_text = source_docs[0]

# Encode the source once; the result seeds the decoder's GRU state.
raw_tokenized = source_proc.transform([raw_input_text])
encoding = encoder_model.predict(raw_tokenized)
original_encoding = encoding
# First decoder input: the id of the '_start_' token as a 1x1 batch.
state_value = np.array([[target_proc.token2id['_start_']]])

decoded_sentence = []
stop_condition = False
while not stop_condition:
    preds, st = decoder_model.predict([state_value, encoding])
    # Ignore ids 0 and 1 when choosing the most likely token, then shift the
    # winning position back into the full id range.
    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    pred_word_str = target_proc.id2token[pred_idx]

    # Stop on the end marker or once max_len words have been produced.
    stop_condition = pred_word_str == '_end_' or len(decoded_sentence) >= max_len
    if not stop_condition:
        decoded_sentence.append(pred_word_str)
        # Feed the new GRU state and the predicted token into the next step.
        encoding = st
        state_value = np.array([[pred_idx]])

' '.join(decoded_sentence)