# Data sets

## GitHub issues

In [None]:
# Download the GitHub issues archive and unpack it into the working directory.
!wget https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip
!unzip github-issues.zip

## Python functions

# 1: Language Model

## Input Data

In [None]:
import pandas as pd
from numpy import array
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from ktext.preprocess import processor

from tqdm import tqdm

In [None]:
# with open('train.docstring', 'r') as f:
#     train_doc = f.readlines()

In [None]:
# Load the issues table and keep the `body` column as a plain Python list;
# it is the text corpus for the language model and the seq2seq encoder below.
issues = pd.read_csv('github_issues.csv')
docs = list(issues.body)

In [None]:
# Fit a ktext processor on the first 1,000 issue bodies; later cells treat each
# row of `vecs` as a fixed-length array of token ids in which 0 means padding.
# NOTE(review): `hueristic_pct_padding` (spelled this way throughout the
# notebook) and `keep_n` presumably control the padded length and the
# vocabulary size — confirm against the ktext documentation.
proc = processor(hueristic_pct_padding=.7,
                 keep_n=20000)
vecs = proc.fit_transform(docs[:1000])

In [None]:
# One more than the largest token id, so every id is a valid index into the
# embedding table and the softmax output.
vocab_size = max(proc.id2token.keys()) + 1
# Sequence length used for the language-model inputs.
# NOTE(review): presumably the width the processor pads documents to — confirm.
max_length = proc.padding_maxlen

In [None]:
from tqdm import tqdm

# Expand every document into all of its prefixes, starting at the first
# non-zero token, so that each prefix yields one (context -> next token) example.
sequences = []
for vec in tqdm(vecs):
    start = (vec != 0).argmax()
    sequences.extend(vec[:stop] for stop in range(start + 1, len(vec) + 1))

# Left-pad the prefixes to a common width so they stack into one 2-D array.
sequences = array(pad_sequences(sequences, maxlen=max_length, padding='pre'))

# The last column is the token to predict; everything before it is the context.
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Lambda

# Next-token language model: embed the (max_length - 1) context tokens, run an
# LSTM over them, and classify the following token from the final timestep.
i = Input(shape=(max_length-1,))
o = Embedding(vocab_size, 128, input_length=max_length-1)(i)
# return_sequences=True keeps the per-timestep outputs available on the LSTM
# layer (a later cell reads them via `model.layers[-3]`); only the last
# timestep is fed to the softmax.
o = LSTM(50, return_sequences=True)(o)
last_timestep = Lambda(lambda x: x[:, -1, :])(o)
last_timestep = Dense(vocab_size, activation='softmax')(last_timestep)
model = Model(i, last_timestep)
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

# `y` is one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=20, batch_size=2048)

In [None]:
def generate_seq(model, proc, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` by ``n_words`` tokens predicted by ``model``.

    Args:
        model: object whose ``predict(encoded, verbose=0)`` returns one row of
            per-token scores for each input row.
        proc: fitted text processor providing ``transform`` (text -> id matrix)
            and ``id2token`` (id -> token string).
        max_length: unused; kept so existing callers keep working.
        seed_text: text to start generating from.
        n_words: number of tokens to append.

    Returns:
        ``seed_text`` followed by the generated tokens, separated by spaces.
    """
    in_text = seed_text
    for _ in range(n_words):
        # Re-encode the text generated so far and drop the first column.
        # NOTE(review): presumably this makes the width match the model input
        # of max_length - 1 — confirm against the processor's padded length.
        encoded = proc.transform([in_text])[:, 1:]
        # Greedy choice: highest-scoring token id for the single input row.
        yhat = int(np.argmax(model.predict(encoded, verbose=0), axis=1)[0])
        if yhat == 1:
            # Index 1 is rendered as the unknown-token marker.
            out_word = '_unk_'
        else:
            # Other indices missing from id2token (e.g. 0 when it is reserved
            # for padding) are also rendered as '_unk_' instead of raising
            # KeyError in the middle of generation.
            out_word = proc.id2token.get(yhat, '_unk_')
        in_text += ' ' + out_word
    return in_text

In [None]:
# Smoke-test the language model: extend the seed word 'there' by 10 generated tokens.
generate_seq(model, proc, max_length, 'there', 10)

In [None]:
# Encode a sample phrase the same way generate_seq does: transform it to token
# ids, then drop the first column.
# NOTE(review): presumably this makes the width equal the model's input length
# (max_length - 1) — confirm.
seq = 'def machine learning'
encoded = proc.transform([seq])[:,1:]

In [None]:
# Feature extractor that shares the language model's inputs but stops at an
# intermediate layer. With the stack built for `model` (Input, Embedding, LSTM,
# Lambda, Dense), layers[-3] is the LSTM, assuming `model.layers` follows
# construction order, so the output is its per-timestep hidden states.
emb_model = Model(inputs=model.inputs, outputs=model.layers[-3].output)

In [None]:
# Run the encoded sample phrase through the feature extractor; the notebook
# displays the returned array.
emb_model.predict(encoded)

# 2: Sequence to Sequence Model

In [None]:
# Issue titles are the target text the sequence-to-sequence decoder learns to produce.
target_docs = list(issues.issue_title)

In [None]:
from ktext.preprocess import processor

# Encoder-side processor: despite its name, `func_proc` is fit on issue bodies.
func_proc = processor(keep_n=20000, hueristic_pct_padding=.7)
func_vecs = func_proc.fit_transform(docs[:1000])

# Decoder-side processor: fit on issue titles, post-padded, with start/end
# indicators requested (the decoding loop relies on '_start_' and '_end_').
doc_proc = processor(
    keep_n=14000,
    padding='post',
    append_indicators=True,
    hueristic_pct_padding=.7,
)
doc_vecs = doc_proc.fit_transform(target_docs[:1000])

In [None]:
# Encoder consumes the vectorized issue bodies as-is.
encoder_input_data = func_vecs
encoder_seq_len = func_vecs.shape[1]

# Teacher forcing: the target is the decoder input shifted one step ahead,
# i.e. the input drops the last token and the target drops the first.
decoder_target_data = doc_vecs[:, 1:]
decoder_input_data = doc_vecs[:, :-1]

# Vocabulary sizes: largest token id plus one, so every id is a valid index.
num_encoder_tokens = 1 + max(func_proc.id2token)
num_decoder_tokens = 1 + max(doc_proc.id2token)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, GRU, Embedding, Lambda, BatchNormalization

# Hyper-parameters of the encoder/decoder network.
word_emb_dim=800
hidden_state_dim=1000
# No-op self-assignments: these three names must already be defined by an
# earlier cell, otherwise these lines raise NameError.
encoder_seq_len=encoder_seq_len
num_encoder_tokens=num_encoder_tokens
num_decoder_tokens=num_decoder_tokens



#### Encoder Model ####
encoder_inputs = Input(shape=(encoder_seq_len,), name='Encoder-Input')
# Word embedding for the encoder input (in this notebook: issue bodies).
x = Embedding(num_encoder_tokens, word_emb_dim, name='Body-Word-Embedding',
              mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# We do not need the encoder's output sequence, just its final hidden state.
_, state_h = GRU(hidden_state_dim, return_state=True,
                 name='Encoder-Last-GRU', dropout=.5)(x)

# Encapsulate the encoder as a separate model so we can encode without
# decoding; it is later retrieved by its name, 'Encoder-Model'.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                      name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

#### Decoder Model ####
# Decoder input for teacher forcing; variable length (shape=(None,)).
decoder_inputs = Input(shape=(None,), name='Decoder-Input')

# Word embedding for the decoder input (in this notebook: issue titles).
dec_emb = Embedding(num_decoder_tokens, word_emb_dim,
                    name='Decoder-Word-Embedding',
                    mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder GRU, using the encoder's final state
# (`seq2seq_encoder_out`) as its initial state.
decoder_gru = GRU(hidden_state_dim, return_state=True,
                  return_sequences=True, name='Decoder-GRU', dropout=.5)
decoder_gru_output, _ = decoder_gru(dec_bn,
                                    initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense softmax layer producing a distribution over decoder tokens per timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                      name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

#### Seq2Seq Model ####
# The layer names above are looked up by name when the inference decoder is
# rebuilt, so they must not be changed independently.
seq2seq_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
from keras import optimizers
import numpy as np

# Training hyper-parameters.
batch_size = 1100
epochs = 16

seq2seq_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizers.Nadam(lr=0.00005),
)

# Targets are integer token ids; a trailing axis of size 1 is added before
# they are passed to the sparse loss. 12% of the data is held out for validation.
history = seq2seq_model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, axis=-1),
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.12,
)

In [None]:
def extract_decoder_model(model):
    """Rebuild the decoder of a trained seq2seq ``model`` for step-wise inference.

    During training the decoder GRU gets its initial state from the encoder and
    is fed the ground-truth tokens (teacher forcing). At inference time each
    prediction must be fed back in together with the GRU state it produced, so
    the state becomes an explicit input and output of the returned model. All
    layers are fetched by name from ``model`` so the trained weights are reused.

    Returns:
        A Keras ``Model`` mapping ``[decoder token input, hidden state]`` to
        ``[token probabilities, updated hidden state]``.
    """
    layer = model.get_layer

    # Size of the hidden state the encoder hands over to the decoder.
    state_width = layer('Encoder-Model').output_shape[-1]

    # Token path: decoder input -> embedding -> batch norm.
    token_input = layer('Decoder-Input').input
    embedded = layer('Decoder-Word-Embedding')(token_input)
    normed = layer('Decoder-Batchnorm-1')(embedded)

    # Explicit state input: the encoder output on the first step, the GRU's own
    # previous state on every step after that.
    state_input = Input(shape=(state_width,), name='hidden_state_input')

    # The trained GRU takes (1) the embedded token from the previous step
    # (`_start_` on the first step) and (2) the state described above.
    gru_seq, gru_state = layer('Decoder-GRU')([normed, state_input])

    # Output head: batch norm followed by the softmax dense layer.
    token_probs = layer('Final-Output-Dense')(layer('Decoder-Batchnorm-2')(gru_seq))

    return Model([token_input, state_input], [token_probs, gru_state])

In [None]:
# Pull the trained encoder out of the seq2seq model and rebuild the decoder so
# it can be run one step at a time.
encoder_model = seq2seq_model.get_layer('Encoder-Model')
decoder_model = extract_decoder_model(seq2seq_model)
# Use the first issue body as the demo input. `train_func` is never defined in
# this notebook (NameError); the encoder-side processor was fit on `docs`, so
# the sample is taken from there.
raw_input_text = docs[0]

In [None]:
# Print the layer-by-layer structure of the extracted inference decoder.
decoder_model.summary()

In [None]:
# Upper bound on the number of generated tokens.
max_len = doc_proc.padding_maxlen

# Encode the raw input text into the encoder's final hidden state.
raw_tokenized = func_proc.transform([raw_input_text])
encoding = encoder_model.predict(raw_tokenized)
# Keep a reference to the encoder output before `encoding` is rebound to the
# decoder state below; it can serve as an embedding for other tasks.
original_encoding = encoding

# Despite its name, `state_value` holds the current decoder *token* input
# (shape (1, 1)); decoding starts from the '_start_' token.
state_value = np.array(doc_proc.token2id['_start_']).reshape(1, 1)

decoded_sentence = []
while True:
    step_probs, next_state = decoder_model.predict([state_value, encoding])

    # Greedy pick over all tokens except index 0 (padding) and index 1
    # (unknown); add 2 back to undo the offset introduced by the slice.
    token_id = 2 + np.argmax(step_probs[:, :, 2:])
    token = doc_proc.id2token[token_id]

    # Stop on the end marker or once the length cap has been reached.
    if token == '_end_' or len(decoded_sentence) >= max_len:
        break
    decoded_sentence.append(token)

    # Feed the prediction and the new GRU state back in for the next step.
    encoding = next_state
    state_value = np.array(token_id).reshape(1, 1)

print(' '.join(decoded_sentence))