In [1]:
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np

from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
import json


Using TensorFlow backend.


In [2]:
###change data set into [('old', 'new'), (...]
json_filename = '/Users/yihe/Desktop/Stanford/yi_can_summarize/data/cnn/data.json'
sizes_of_text, sizes_of_summary= [] , []
with open(json_filename, 'r') as f:
    data = json.load(f)
all_data = []
for key in data.keys():
    if len(data[key])>1:
        all_data.append((data[key][0].strip(), data[key][1].strip()))
        
        size_of_text = len(data[key][0].strip().split(' '))
        size_of_summary = len(data[key][1].strip().split(' '))
        sizes_of_text.append(size_of_text)
        sizes_of_summary.append(size_of_summary)


In [3]:
Tx, Ty = int(np.percentile(sizes_of_text, 75)), int(np.percentile(sizes_of_summary, 75))
print Tx, Ty

73 14


In [4]:
m = len(all_data)
print m

1001


In [5]:
## change text into {'word':indx}
def create_vocab_list(vocab_text):
    vocab_list={}

    vocab_filename = "/Users/yihe/Desktop/Stanford/yi_can_summarize/data/cnn/vocab.txt"
    with open(vocab_text, 'r') as vocab_f:
        i=0
        for line in vocab_f:
            vocab_list[line.split(' ')[0]]=i
            i+=1
    
    return vocab_list

human_vocab_file = "/Users/yihe/Desktop/Stanford/yi_can_summarize/data/cnn/text_vocab.txt"
machine_vocab_file = "/Users/yihe/Desktop/Stanford/yi_can_summarize/data/cnn/highlight_vocab.txt"

human_vocab, machine_vocab = create_vocab_list(human_vocab_file), create_vocab_list(machine_vocab_file)
inv_machine = dict(enumerate(sorted(machine_vocab)))

In [6]:
print "number of vocab in human vocab {}".format(len(human_vocab.keys()))
print "number of vocab in machine vocab {}".format(len(machine_vocab.keys()))

machine_vocab['<UNK>']

number of vocab in human vocab 10471
number of vocab in machine vocab 4423


4421

In [7]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    
    X, Y = zip(*dataset)
    
    X = np.array([string_to_int(i, Tx, human_vocab) for i in X])
    Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
    
    Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab.keys())), X)))
    Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab.keys())), Y)))
    
    return X, np.array(Y), Xoh, Yoh

def string_to_int(string, length, vocab):
    """
    Converts all strings in the vocabulary into a list of integers representing the positions of the
    input string's characters in the "vocab"
    
    Arguments:
    string -- input string, e.g. 'Wed 10 Jul 2007'
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary used to index every character of your "string"
    
    Returns:
    rep -- list of integers (or '<unk>') (size = length) representing the position of the string's character in the vocabulary
    """
    
    #make lower to standardize
    string = string.split(' ')

    if len(string) > length:
        string = string[:length]

    rep = list(map(lambda x: vocab.get(x,vocab['<UNK>']), string))

    if len(string) < length:
        rep += [vocab['<PAD>']] * (length - len(string))

    #print (rep)
    return rep
   

In [8]:
X, Y, Xoh, Yoh = preprocess_data(all_data, human_vocab, machine_vocab, Tx, Ty)

In [9]:
print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)

('X.shape:', (1001, 73))
('Y.shape:', (1001, 14))
('Xoh.shape:', (1001, 73, 10471))
('Yoh.shape:', (1001, 14, 4423))


In [43]:
Xoh[0][0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [10]:
#for softmax
import keras.backend as K
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')

In [11]:
# Defined shared layers as global variables
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights') # We are using a custom softmax(axis = 1) loaded in this notebook
dotor = Dot(axes = 1)

In [12]:
# GRADED FUNCTION: one_step_attention

def one_step_attention(a, s_prev):
    """
    Performs one step of attention: Outputs a context vector computed as a dot product of the attention weights
    "alphas" and the hidden states "a" of the Bi-LSTM.
    
    Arguments:
    a -- hidden state output of the Bi-LSTM, numpy-array of shape (m, Tx, 2*n_a)
    s_prev -- previous hidden state of the (post-attention) LSTM, numpy-array of shape (m, n_s)
    
    Returns:
    context -- context vector, input of the next (post-attetion) LSTM cell
    """
    
    ### START CODE HERE ###
    # Use repeator to repeat s_prev to be of shape (m, Tx, n_s) so that you can concatenate it with all hidden states "a" (≈ 1 line)
    s_prev = repeator(s_prev)
    # Use concatenator to concatenate a and s_prev on the last axis (≈ 1 line)
    concat = concatenator([a,s_prev])
    # Use densor1 to propagate concat through a small fully-connected neural network to compute the "intermediate energies" variable e. (≈1 lines)
    e = densor1(concat)
    # Use densor2 to propagate e through a small fully-connected neural network to compute the "energies" variable energies. (≈1 lines)
    energies = densor2(e)
    # Use "activator" on "energies" to compute the attention weights "alphas" (≈ 1 line)
    alphas =  activator(energies)
    # Use dotor together with "alphas" and "a" to compute the context vector to be given to the next (post-attention) LSTM-cell (≈ 1 line)
    context = dotor([ alphas,a])
    ### END CODE HERE ###
    
    return context

In [13]:
n_a = 32
n_s = 64
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_vocab), activation=softmax)

In [14]:
Input(shape=(n_s,), name='s0')

<tf.Tensor 's0:0' shape=(?, 64) dtype=float32>

In [15]:
# GRADED FUNCTION: model

def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    """
    Arguments:
    Tx -- length of the input sequence
    Ty -- length of the output sequence
    n_a -- hidden state size of the Bi-LSTM
    n_s -- hidden state size of the post-attention LSTM
    human_vocab_size -- size of the python dictionary "human_vocab"
    machine_vocab_size -- size of the python dictionary "machine_vocab"

    Returns:
    model -- Keras model instance
    """
    
    # Define the inputs of your model with a shape (Tx,)
    # Define s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    
    # Initialize empty list of outputs
    outputs = []
    
    ### START CODE HERE ###
    
    # Step 1: Define your pre-attention Bi-LSTM. Remember to use return_sequences=True. (≈ 1 line)
    a = Bidirectional(LSTM(n_a, return_sequences = True), input_shape = (m, Tx, n_a*2))(X)
    
    # Step 2: Iterate for Ty steps
    for t in range(Ty):
    
        # Step 2.A: Perform one step of the attention mechanism to get back the context vector at step t (≈ 1 line)
        context = one_step_attention(a, s)
        
        # Step 2.B: Apply the post-attention LSTM cell to the "context" vector.
        # Don't forget to pass: initial_state = [hidden state, cell state] (≈ 1 line)
        s, _, c =  post_activation_LSTM_cell(context,initial_state = [s, c])
        
        # Step 2.C: Apply Dense layer to the hidden state output of the post-attention LSTM (≈ 1 line)
        out = output_layer(s)
        
        # Step 2.D: Append "out" to the "outputs" list (≈ 1 line)
        outputs.append(out)
    
    # Step 3: Create model instance taking three inputs and returning the list of outputs. (≈ 1 line)
    model = Model([X, s0, c0], outputs = outputs)
    
    ### END CODE HERE ###
    
    return model

In [16]:
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))

In [17]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 73, 10471)    0                                            
__________________________________________________________________________________________________
s0 (InputLayer)                 (None, 64)           0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 73, 64)       2689024     input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 73, 64)       0           s0[0][0]                         
                                                                 lstm_1[0][0]                     
          

In [18]:
### START CODE HERE ### (≈2 lines)
opt = Adam(lr = 0.005, beta_1 = 0.9, beta_2 = 0.999,decay = 0.01)  
model.compile(loss = 'categorical_crossentropy', optimizer = opt,metrics = ['accuracy']) 
### END CODE HERE ###

In [44]:
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(Yoh.swapaxes(0,1))

s0.shape
c0.shape

(1001, 64)

In [20]:
model.fit([Xoh, s0, c0], outputs, epochs=1, batch_size=100)

Epoch 1/1


<keras.callbacks.History at 0x126431c90>

In [21]:
model.save

(1001, 73, 10471)
(1001, 64)
(1001, 64)


In [37]:
EXAMPLES = ['it cost more than to develop and doesn t come with French fries on the side',
            'the response to the increasing natural disaster in pakistan is like so much else when it comes to American relations with that country too little and too tentative',
            'seven Republican presidential contenders faced off Monday in one of the first debates of the primary season, offering policy ideas and criticism of president']

for example in EXAMPLES:   
    source = [string_to_int(example, Tx, human_vocab)]
    source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))
    prediction = model.predict([source, s0, c0])
    prediction = np.argmax(prediction, axis = -1)
    print prediction
    output = [inv_machine[int(i)] for i in prediction]

    print "source: {}".format(example)
    print"output: {}".format(output)


[[4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]]
source: it cost more than to develop and doesn t come with French fries on the side
output: ['zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg']
[[4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]]
source: the response to the increasing natural disaster in pakistan is like so much else when it comes to American relations with that country too little and too tentative
output: ['zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg', 'zuckerberg']
[[4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [4422]
 [44

In [27]:
example ='it cost more than to develop and doesn t come with french fries on the side'
source = [string_to_int(example, Tx, human_vocab)]
print len(source)

source_rdy = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))
print source_rdy.shape

1
(1, 73, 10471)


In [28]:
model.predict([source_rdy, s0, c0])

[array([[0.00079054, 0.00090726, 0.00099326, ..., 0.00013752, 0.00014384,
         0.00124767]], dtype=float32),
 array([[2.0880259e-03, 2.7436044e-03, 3.2198075e-03, ..., 7.8820987e-05,
         8.6639528e-05, 5.0747734e-03]], dtype=float32),
 array([[3.2454950e-03, 4.5919311e-03, 5.6358930e-03, ..., 5.4559827e-05,
         6.1446604e-05, 1.0028267e-02]], dtype=float32),
 array([[3.6280635e-03, 5.2757971e-03, 6.5363408e-03, ..., 4.8720085e-05,
         5.5423032e-05, 1.2002281e-02]], dtype=float32),
 array([[3.7113139e-03, 5.4281582e-03, 6.7351214e-03, ..., 4.7513604e-05,
         5.4331780e-05, 1.2443182e-02]], dtype=float32),
 array([[3.7292594e-03, 5.4611787e-03, 6.7801578e-03, ..., 4.7258414e-05,
         5.4105876e-05, 1.2538862e-02]], dtype=float32),
 array([[3.7335069e-03, 5.4689110e-03, 6.7911176e-03, ..., 4.7198140e-05,
         5.4060267e-05, 1.2559942e-02]], dtype=float32),
 array([[3.7347209e-03, 5.4711783e-03, 6.7943223e-03, ..., 4.7180019e-05,
         5.4051245e-05, 1.2