[View in Colaboratory](https://colab.research.google.com/github/hamelsmu/kdd-2018-hands-on-tutorials/blob/master/Feature%20Extraction%20and%20Summarization%20with%20Sequence%20to%20Sequence%20Learning.ipynb)

# Setup Notebook

Install [ktext](https://github.com/hamelsmu/ktext)

In [1]:
! pip install ktext > install_logs.txt

[31mthinc 6.10.2 has requirement cytoolz<0.9,>=0.8, but you'll have cytoolz 0.9.0.1 which is incompatible.[0m
[31mthinc 6.10.2 has requirement msgpack-numpy==0.4.1, but you'll have msgpack-numpy 0.4.3.1 which is incompatible.[0m


# Data sets

## GitHub issues data

In [2]:
import pandas as pd

# Issue bodies become the source documents and issue titles the targets.
issues = pd.read_csv(
    'https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip'
)
source_docs = list(issues['body'])
target_docs = list(issues['issue_title'])

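## Python functions data

**Note:** this cell assigns the same names (`source_docs`, `target_docs`) as the GitHub issues cell above, so running both leaves only the Python functions data in memory. Run just the cell for the dataset you want to use.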

In [3]:
from urllib.request import urlopen


def _read_remote_lines(url):
    """Download `url` and return its lines decoded as UTF-8.

    Line terminators are kept, exactly as `readlines()` yields them. The
    response is opened in a `with` block so the connection is closed once the
    body has been read, even if decoding raises.
    """
    with urlopen(url) as response:
        return [line.decode('utf-8') for line in response.readlines()]


# One function per line in the first file, one docstring per line in the second.
# NOTE(review): the two files are presumably line-aligned — confirm.
source_docs = _read_remote_lines('https://storage.googleapis.com/kubeflow-examples/code_search/data/train.function')
target_docs = _read_remote_lines('https://storage.googleapis.com/kubeflow-examples/code_search/data/train.docstring')

In [4]:
# Number of documents kept from each list. Both lists are cut with the same
# bound so sources and targets stay paired. Set to None to keep everything.
N_SAMPLES = 1000
source_docs = source_docs[:N_SAMPLES]
target_docs = target_docs[:N_SAMPLES]

# 1: Language Model

## Input Data

In [5]:
from keras import optimizers
from keras.layers import Input, Dense, LSTM, GRU, Embedding, Lambda, BatchNormalization
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from ktext.preprocess import processor
import numpy as np
import pandas as pd
from tqdm import tqdm

Using TensorFlow backend.


In [6]:
# Fit a ktext processor on the source documents and encode them as rows of
# token ids (0 is treated as padding by the cells below).
# NOTE(review): `hueristic_pct_padding` is spelled this way everywhere in this
# notebook and the cell ran, so it is presumably ktext's own keyword spelling —
# confirm before "correcting" it.
proc = processor(hueristic_pct_padding=.7, keep_n=20000)
vecs = proc.fit_transform(source_docs)

 See full histogram by insepecting the `document_length_stats` attribute.


In [7]:
# Padded document length chosen by the processor.
max_length = proc.padding_maxlen
# Largest token id plus one, so every id is a valid class index.
vocab_size = 1 + max(proc.id2token)

In [8]:
# Turn each encoded document into next-token training examples: every prefix
# that ends on or after the first real token becomes one sample, and the last
# element of that prefix is the label.
sequences = []
for arr in tqdm(vecs):
    if not arr.any():
        # Row is nothing but padding. (arr != 0).argmax() would return 0 here,
        # and the loop below would emit len(arr) all-zero samples whose label
        # is the padding id.
        continue
    # Index of the first non-zero id (assumes rows are left-padded with 0).
    non_zero = (arr != 0).argmax()
    for i in range(non_zero, len(arr)):
        sequences.append(arr[:i+1])
# pad_sequences already returns a 2-D numpy array, so no np.array() wrapper is needed.
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
# Everything but the last column is the context; the last column is the target id.
X, y = sequences[:,:-1], sequences[:,-1]
# One-hot targets for the categorical_crossentropy loss used at compile time.
# This array has n_samples x vocab_size entries.
y = to_categorical(y, num_classes=vocab_size)

100%|██████████| 1000/1000 [00:00<00:00, 24795.04it/s]


In [9]:
# Next-token language model: embed the (max_length - 1) context ids, run them
# through an LSTM and classify the final timestep over the vocabulary.
i = Input(shape=(max_length-1,))
o = Embedding(vocab_size, 128, input_length=max_length-1)(i)
# return_sequences=True keeps the output of every timestep on this layer; the
# "Generate embeddings" cell reads this layer through model.layers[-3].
o = LSTM(50, return_sequences=True)(o)
# Only the last timestep is passed on to the softmax classifier.
last_timestep = Lambda(lambda x: x[:, -1, :])(o)
last_timestep = Dense(vocab_size, activation='softmax')(last_timestep)
model = Model(i, last_timestep)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 59)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 59, 128)           390016    
_________________________________________________________________
lstm_1 (LSTM)                (None, 59, 50)            35800     
_________________________________________________________________
lambda_1 (Lambda)            (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3047)              155397    
Total params: 581,213
Trainable params: 581,213
Non-trainable params: 0
_________________________________________________________________


In [10]:
# categorical_crossentropy pairs with the one-hot `y` produced by to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The returned History object is kept in `history`.
history = model.fit(X, y, epochs=20, batch_size=2048)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20

Epoch 20/20


## Generate sequences

In [13]:
def generate_seq(model, proc, max_length, seed_text, n_words):
    """Greedily extend `seed_text` by `n_words` tokens predicted by `model`.

    Args:
        model: object whose `predict(vec, verbose=0)` returns one row of
            scores per input row, indexed by token id.
        proc: fitted text processor exposing `transform` and `id2token`.
        max_length: unused; kept so existing call sites keep working.
        seed_text: text to start generating from.
        n_words: number of tokens to append.

    Returns:
        `seed_text` followed by the generated tokens, separated by spaces.
    """
    in_text = seed_text
    for _ in range(n_words):
        # Re-encode the whole running text and drop the first column of the
        # encoded row before handing it to the model.
        # NOTE(review): if the processor truncates long text from the end,
        # words generated beyond its padded length would be ignored — confirm.
        vec = proc.transform([in_text])[:,1:]
        index = int(np.argmax(model.predict(vec, verbose=0), axis=1)[0])
        # Id 1 is rendered as '_unk_'. Any other id that has no entry in
        # id2token (e.g. 0) previously raised KeyError; it is rendered the
        # same way so generation can continue.
        if index == 1 or index not in proc.id2token:
            out_word = '_unk_'
        else:
            out_word = proc.id2token[index]
        in_text += ' ' + out_word
    return in_text

In [14]:
# Generate 10 tokens greedily from the one-word seed 'there'.
generate_seq(model, proc, max_length, 'there', 10)

'there self self self self self self self self self self'

## Generate embeddings

In [15]:
input_sequence = 'def machine learning'
# Encode with the fitted processor and drop the first column, the same
# preparation generate_seq applies before calling the model.
vec = proc.transform([input_sequence])[:,1:]

In [16]:
# layers[-3] is the LSTM (layer order in the summary: input, embedding, lstm,
# lambda, dense), so this model returns the LSTM output for every timestep.
embedding_model = Model(inputs=model.inputs, outputs=model.layers[-3].output)

In [17]:
# LSTM outputs for the single encoded input: one 50-dimensional vector per timestep.
embedding_model.predict(vec)

array([[[ 0.11232524, -0.19589685, -0.08292498, ...,  0.11231813,
         -0.04093925,  0.2250973 ],
        [ 0.33013782, -0.50921947, -0.33424243, ...,  0.3724169 ,
         -0.22250906,  0.5329745 ],
        [ 0.710988  , -0.83309007, -0.7124429 , ...,  0.7527665 ,
         -0.6078541 ,  0.82568073],
        ...,
        [ 1.        , -1.        , -0.7387216 , ...,  1.        ,
         -1.        ,  0.36398363],
        [ 0.8879844 , -0.92908955, -0.699541  , ...,  0.9367341 ,
         -0.97065234,  0.451724  ],
        [ 0.9398087 , -0.9845134 , -0.74498785, ...,  0.975736  ,
         -1.        ,  0.48387393]]], dtype=float32)

# 2: Sequence to Sequence Model

In [18]:
from ktext.preprocess import processor
# Separate processors for the two sides, each fitted on its own corpus.
source_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
source_vecs = source_proc.fit_transform(source_docs)

# The decoding cell looks up '_start_' and '_end_' in this processor's
# vocabulary; append_indicators=True presumably adds them — TODO confirm
# against the ktext docs.
# NOTE(review): padding='post' presumably puts padding after the tokens — confirm.
target_proc = processor(append_indicators=True, hueristic_pct_padding=.7, keep_n=14000, padding ='post')
target_vecs = target_proc.fit_transform(target_docs)

 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.


In [19]:
encoder_input_data = source_vecs
# Width of the encoded source rows; used as the encoder input length.
encoder_seq_len = encoder_input_data.shape[1]

# Teacher forcing: the decoder input is every column but the last, and the
# target is every column but the first, i.e. the same rows shifted left by one.
decoder_input_data = target_vecs[:, :-1]
decoder_target_data = target_vecs[:, 1:]

# Vocabulary sizes: largest token id plus one for each processor.
num_encoder_tokens = max(source_proc.id2token.keys()) + 1
num_decoder_tokens = max(target_proc.id2token.keys()) + 1

## Encoder Model

In [20]:
# Sizes shared by the encoder and decoder graphs.
word_emb_dim = 800
hidden_state_dim = 1000

# Token ids -> embeddings -> batch norm -> GRU. Only the GRU's returned state
# is kept; its first output is discarded.
encoder_inputs = Input(shape=(encoder_seq_len,), name='Encoder-Input')
x = Embedding(
    num_encoder_tokens,
    word_emb_dim,
    name='Body-Word-Embedding',
    mask_zero=False,
)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
_, state_h = GRU(
    hidden_state_dim,
    return_state=True,
    name='Encoder-Last-GRU',
    dropout=.5,
)(x)

# Wrap the encoder as a named Model so it can be fetched with get_layer later,
# then apply it to produce the tensor that seeds the decoder GRU.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

## Decoder Model

In [21]:
# shape=(None,) leaves the decoder sequence length unspecified.
decoder_inputs = Input(shape=(None,), name='Decoder-Input')
dec_emb = Embedding(num_decoder_tokens, word_emb_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
# return_sequences gives an output per timestep; return_state also exposes the
# final state, which is not needed for training and is discarded here.
decoder_gru = GRU(hidden_state_dim, return_state=True, return_sequences=True, name='Decoder-GRU', dropout=.5)
# The decoder GRU starts from the encoder's output state.
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)
# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

## End to end

In [22]:
# Full training graph: [source ids, shifted target ids] -> per-timestep token probabilities.
seq2seq_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [23]:
# NOTE(review): the fit log reports 880 training samples, which is fewer than
# batch_size, so each epoch is a single batch.
batch_size = 1100
epochs = 16

# The sparse loss is used with integer token ids as targets, so no one-hot
# encoding is built; expand_dims adds a trailing length-1 axis to the targets.
seq2seq_model.compile(optimizer=optimizers.Nadam(lr=0.00005), loss='sparse_categorical_crossentropy')
history = seq2seq_model.fit([encoder_input_data, decoder_input_data],
                            np.expand_dims(decoder_target_data, -1),
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_split=0.12)

Train on 880 samples, validate on 120 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [24]:
def extract_decoder_model(model):
    """Rebuild the decoder of a seq2seq `model` as a standalone model.

    The decoder layers are looked up by name and re-wired so that the GRU's
    initial state comes from a new input instead of from the encoder.

    Returns:
        A Model mapping [decoder token ids, GRU state] to
        [dense layer output, GRU state output].
    """
    def layer(name):
        # Shorthand for fetching a layer of the original model by name.
        return model.get_layer(name)

    # Last dimension of the encoder output; used as the width of the state input.
    state_size = layer('Encoder-Model').output_shape[-1]

    token_input = layer('Decoder-Input').input
    embedded = layer('Decoder-Word-Embedding')(token_input)
    normed = layer('Decoder-Batchnorm-1')(embedded)

    state_input = Input(shape=(state_size,), name='hidden_state_input')
    gru_seq, gru_state = layer('Decoder-GRU')([normed, state_input])

    normed_seq = layer('Decoder-Batchnorm-2')(gru_seq)
    probs = layer('Final-Output-Dense')(normed_seq)
    return Model([token_input, state_input], [probs, gru_state])

In [25]:
# Take the encoder from the seq2seq model by name, and rebuild the decoder so
# that it accepts the GRU state as an explicit input.
encoder_model = seq2seq_model.get_layer('Encoder-Model')
decoder_model = extract_decoder_model(seq2seq_model)
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 800)    1719200     Decoder-Input[0][0]              
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 800)    3200        Decoder-Word-Embedding[1][0]     
__________________________________________________________________________________________________
hidden_state_input (InputLayer) (None, 1000)         0                                            
__________________________________________________________________________________________________
Decoder-GR

In [27]:
# Greedy decoding of the first source document.
raw_input_text = source_docs[0]
max_len = target_proc.padding_maxlen

# Encode the document once; the result seeds the decoder state.
raw_tokenized = source_proc.transform([raw_input_text])
original_encoding = encoder_model.predict(raw_tokenized)
encoding = original_encoding
# The first decoder input is the '_start_' token id, shaped (1, 1).
state_value = np.array(target_proc.token2id['_start_']).reshape(1, 1)

decoded_sentence = []
stop_condition = False
while not stop_condition:
    preds, st = decoder_model.predict([state_value, encoding])
    # Ids 0 and 1 are excluded from the argmax; the offset restores the real id.
    pred_idx = 2 + np.argmax(preds[:, :, 2:])
    pred_word_str = target_proc.id2token[pred_idx]

    # Stop on the end marker or once max_len words have been collected.
    stop_condition = pred_word_str == '_end_' or len(decoded_sentence) >= max_len
    if not stop_condition:
        decoded_sentence.append(pred_word_str)
        # Feed the new GRU state and the chosen token back in for the next step.
        encoding, state_value = st, np.array(pred_idx).reshape(1, 1)

' '.join(decoded_sentence)

'a icon the cancelled parameter client infra channelfilter dependency numbers happened used made sentences client traversal into emit internal builds'