In [1]:
from tensorflow import keras 
from tensorflow.keras.layers import Input, LSTM, Attention, Embedding, Dense, Concatenate, TimeDistributed   #Layers required to implement the model


In [2]:

import numpy as np   #Package for scientific computing and dealing with arrays
import pandas as pd  #Package providing fast, flexible and expressive data structures
import re            #re stands for RegularExpression providing full support for Perl-like Regular Expressions in Python
from bs4 import BeautifulSoup   #Package for pulling data out of HTML and XML files
from keras.preprocessing.sequence import pad_sequences  #For Padding the seqences to same length
from nltk.corpus import stopwords   #For removing filler words
from tensorflow.keras.layers import Input, LSTM, Attention, Embedding, Dense, Concatenate, TimeDistributed   #Layers required to implement the model
from tensorflow.keras.models import Model  #Helps in grouping the layers into an object with training and inference features
from tensorflow.keras.callbacks import EarlyStopping  #Allows training the model on large no. of training epochs & stop once the performance stops improving on validation dataset
from os import listdir
import string

In [3]:

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# split a document into news story and highlights
def split_story(doc):
	# find first highlight
	index = doc.find('@highlight')
	# split into story and highlights
	story, highlights = doc[:index], doc[index:].split('@highlight')
	# strip extra white space around each highlight
	highlights = [h.strip() for h in highlights if len(h) > 0]
	return story, highlights

# load all stories in a directory
def load_stories(directory):
	stories = list()
	for name in listdir(directory):
		filename = directory + '/' + name
		# load document
		doc = load_doc(filename)
		# split into story and highlights
		story, highlights = split_story(doc)
		# store
		stories.append({'story':story, 'highlights':highlights})
	return stories

# clean a list of lines
def clean_lines(lines):
	cleaned = list()
	# prepare a translation table to remove punctuation
	table = str.maketrans('', '', string.punctuation)
	for line in lines:
		# strip source cnn office if it exists
		index = line.find('(CNN) -- ')
		if index > -1:
			line = line[index+len('(CNN)'):]
		# tokenize on white space
		line = line.split()
		# convert to lower case
		line = [word.lower() for word in line]
		# remove punctuation from each token
		line = [w.translate(table) for w in line]
		# remove tokens with numbers in them
		line = [word for word in line if word.isalpha()]
		# store as string
		cleaned.append(' '.join(line))
	# remove empty strings
	cleaned = [c for c in cleaned if len(c) > 0]
	return cleaned

# load stories
directory = 'cnn/'
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

# clean stories
for example in stories:
	example['story'] = clean_lines(example['story'].split('\n'))
	example['highlights'] = clean_lines(example['highlights'])

Loaded Stories 92578


In [None]:
# save to file
from pickle import dump
dump(stories, open('cnn_dataset.pkl', 'wb'))

In [4]:
from pickle import load

# load from file
stories = load(open('cnn_dataset.pkl', 'rb'))
print('Loaded Stories %d' % len(stories))

Loaded Stories 92578


In [5]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import RepeatVector


In [7]:
(stories[0]['story'])

['it s official us president barack obama wants lawmakers to weigh in on whether to use military force in syria',
 'obama sent a letter to the heads of the house and senate on saturday night hours after announcing that he believes military action against syrian targets is the right step to take over the alleged use of chemical weapons',
 'the proposed legislation from obama asks congress to approve the use of military force to deter disrupt prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction',
 'it s a step that is set to turn an international crisis into a fierce domestic political battle',
 'there are key questions looming over the debate what did un weapons inspectors find in syria what happens if congress votes no and how will the syrian government react',
 'in a televised address from the white house rose garden earlier saturday the president said he would take his case to congress not because he has to but because he wants to

In [8]:
#Preprocessing

#This the dictionary used for expanding contractions
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [9]:
#Text Cleaning
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english')) 
def text_cleaner(text,num):
    newString = text.lower()  #converts all uppercase characters in the string into lowercase characters and returns it
    newString = BeautifulSoup(newString, "lxml").text #parses the string into an lxml.html 
    newString = re.sub(r'\([^)]*\)', '', newString) #used to replace a string that matches a regular expression instead of perfect match
    newString = re.sub('"','', newString)           
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")]) #for expanding contractions using the contraction_mapping dictionary    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    if(num==0): 
      tokens = [w for w in newString.split() if not w in stop_words]  #converting the strings into tokens
    else :
      tokens = newString.split()
    long_words=[]
    for i in tokens:
        if len(i)>1:                  #removing short words
            long_words.append(i)   
    return (" ".join(long_words)).strip()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
ss  = edict()
ss["idan"] = 1
ss["idan"]+1
ss

{'idan': 1}

In [3]:
from easydict import EasyDict as edict

In [26]:
sen = "ackage stopwords is already up-to-date!"
print(sen.strip("!"))


ackage stopwords is already up-to-date


In [26]:
#Calling the function
x_vocab = edict()
y_vocab = edict()

article_word_count = []
abstract_word_count = []

cleaned_text = []
highlights = []

max_ar_length = 0
max_high_lengh = 0

for file in stories:
    t = file['story']
    story =""
    highligh= ""
    h = file['highlights']
    for line in t:
        story= story+line
        for word in line: 
            if word not in x_vocab.keys():
                x_vocab[word] =1
            else: 
                x_vocab[word]+=1

    if len(story.split())> max_ar_length: max_ar_length =len(story.split())
    article_word_count.append(len(story.split()))
    cleaned_text.append(text_cleaner(story,0))
    for line in h:
        highligh= highligh+line
        if word not in y_vocab.keys():
            y_vocab[word] =1
        else: 
            y_vocab[word]+=1

    if len(highligh.split())> max_high_lengh: max_high_lengh =len(highligh.split())
    abstract_word_count.append(len(highligh.split()))
    highlights.append(highligh)

In [27]:
# print summer: 
print(f" Max length sequence story: {max_ar_length}")
print(f" Max length sequence story: {max_high_lengh}")



 Max length sequence story: 2050
 Max length sequence story: 103


In [147]:
type(len(set(cleaned_text[1].split())))

int

In [198]:
X_tokenizer = Tokenizer(len(set(cleaned_text[0].split()))+1)
X_tokenizer.fit_on_texts(cleaned_text[0].split())


In [57]:
Y_tokenizer = Tokenizer(90) 
Y_tokenizer.fit_on_texts(highlights[0])
Y_voc = Y_tokenizer.num_words

In [28]:
def word_to_idx(data: list) -> dict:
    """
    Function that maps the data and return a dictionary of words corresponding to their index
    it gets a list
    return:
        dict 1 idx to word
        dict 2 word to idx
    """
    total_letters = [letters for sublist in data for subsublist in sublist for letters in subsublist]
    unique_letters =set(total_letters)
    total_words = [word.replace(',','') for sublist in data for subsublist in sublist for word in subsublist.split()]
    unique_words =list(set(total_words))

    w_2_i = {unique_words[i]:i for i in range(len(unique_words))}
    i_2_w= {i: unique_words[i] for i in range(len(unique_words))}
    print(w_2_i)
    input()
    return (w_2_i, i_2_w)

In [199]:
X_train= X_tokenizer.texts_to_sequences(cleaned_text[0].split()) 
y_train  = Y_tokenizer.texts_to_sequences(highlights[0])

In [164]:
print(X_tokenizer.num_words)

249


In [165]:
X_voc = X_tokenizer.num_words

In [166]:
type(highlights[1])

str

In [201]:
y = np.array([x for y in y_train for x in y])
x =np.array([x for y in X_train for x in y])

In [202]:
x = x.reshape(1,-1)
y  =y.reshape(1,-1)


In [213]:
Y_voc

90

In [218]:
X_voc

249

In [222]:

from keras import backend as K 
K.clear_session()  #Resets all state generated by Keras

latent_dim = 100
embedding_dim = 100

# Encoder
encoder_inputs = Input(shape=(x.shape[1],))

# TODO: understand how embedding works here; is this our own trained embedding? maybe we should just use word2vec
#embedding layer
enc_emb =  Embedding(1000, embedding_dim,trainable=True, mask_zero = True)(encoder_inputs)

#encoder lstm
# TODO: why are we not using an activation function? default is none, only for recurrent activation the default is sigmoid
# TODO: why do we need the encoder_outputs? only for attention probably
encoder_lstm = LSTM(latent_dim,return_sequences=True,return_state=True) #add dropout
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

#Setting up the Decoder using 'encoder_states' as initial state
# TODO: figure out why the shape is None? because we also use it later for our decoding when we only give one
decoder_inputs = Input(shape=(y.shape[1],))

#Embedding layer
dec_emb_layer = Embedding(Y_voc+1, embedding_dim,trainable=True, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
# TODO: is the LSTM bidirectional? why do we even need those two states?
# TODO: why do the graphs say (None, 256) where 256 is the latent_dim (the length of the state vectors); shouldn't it be clear that it is (1, 256)?
decoder_outputs ,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# #Attention layer; removed for now
# attn_layer = AttentionLayer(name='attention_layer')
# attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# #Concating Attention input and Decoder LSTM output
# decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#Dense layer
# TODO: figure out what TimeDistributed does
decoder_dense =  TimeDistributed(Dense(units = Y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

#Defining the model 
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

#Visualize the Model
from tensorflow.keras.utils import plot_model

plot_model(model, to_file='training_model_plot.png', show_shapes=True, show_layer_names=True)

# TODO: understand this
#Adding Metrics
model.compile(optimizer='rmsprop' , loss='sparse_categorical_crossentropy' , metrics=['accuracy'])

#Adding Callback
# TODO: how exactly does this work? is there an internal mapping of the string "val_loss" to running a test on the specified validation_data?
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

# Commented out IPython magic to ensure Python compatibility.
#Training the Model
# %tensorflow_version 1.x
# indexing is clear, removing the eos token for the decoder inpu
# ts and removing the sos token for the decoder outputs
# TODO: think about how exactly this is working with calculating the loss etc., maybe that's the problem
# TODO: why is y 3-dimensional
# TODO: fit only on one example and check if model actually works
history = model.fit(x = [x,y], y = y.reshape(1,-1,1) ,epochs=200,callbacks=[es],batch_size = 1)




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 875)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 228)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 875, 100)     100000      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 228, 100)     9100        ['input_2[0][0]']                
                                                                                              

In [227]:
predict = model.predict([x,y])

In [229]:
for i in predict:
    print(np.argmax

(1, 228, 90)

In [230]:

encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

dec_emb2= dec_emb_layer(decoder_inputs) 

#Setting the initial states to the states from the previous time step for better prediction
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# TODO: remove this
# #Attention inference
# attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
# decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

#Adding Dense softmax layer to generate proability distribution over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2) 

#Final Decoder model
# TODO: how is this all linked to what we trained before? where does the parameter sharing happen?
# TODO: why are we appending the lists instead of just writing it in one list
decoder_model = Model(
    [decoder_inputs] + [decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

#Function defining the implementation of inference process
def decode_sequence(input_seq):
    #Encoding the input as state vectors
    # TODO: plotting to see if this is actually the trained encoder and it is, but why?
    plot_model(encoder_model, to_file='encoder_model_plot.png', show_shapes=True, show_layer_names=True)
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    
    #Generating empty target sequence of length 1
    target_seq = np.zeros((1,1))
    
    #Populating the first word of target sequence with the start word
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_h, e_c])

        #Sampling a token
        # TODO: understand the indexing: why -1 instead of 0 as well? should the output tokens be of shape (1, 1, num_words)?
        # TODO: I added the plus one because I think the neurons in the dense layer start at 0 but our dictionary starts at 1, despite a one-on-one mapping from
        # dictionary index numbers to neurons if I understand correctly
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        print(sampled_token)
        input()
        sampled_token = target_index_word[sampled_token_index+1]
        
        if(sampled_token!='eostok'):
            decoded_sentence += ' '+sampled_token

        #Exit condition: either hit max length or find stop word
        # TODO: it used to be >= (max_abstract_len - 1); why? doesn't make sense to me
        if (sampled_token == 'eostok'  or len(decoded_sentence.split()) == MAX_ABSTRACT_LEN):
            stop_condition = True

        #Updating the target sequence (of length 1)
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        #Updating internal states
        e_h, e_c = h, c

    return decoded_sentence

#Functions to convert an integer sequence to a word sequence for summary as well as reviews 
def seq2summary(input_seq):
    newString=''
    for i in input_seq:
        if((i!=0 and i!=target_word_index['sostok']) and i!=target_word_index['eostok']):
            newString=newString+target_index_word[i]+' '
    return newString

def seq2text(input_seq):
    newString=''
    for i in input_seq:
        if(i!=0):
            newString=newString+source_index_word[i]+' '
    return newString

#Summaries generated by the model

# TODO: change this, will crash when there is less than 20
#for i in range(0,20):
#print("Article:",seq2text(X_train[1]))
print("Original summary:",seq2summary(y))
print("article to be decoded:")
#print(X_train[i].reshape(1,MAX_ARTICLE_LEN))
print("Predicted summary:",decode_sequence(x))
print("\n")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [32]:
df['abstract'] = df['abstract'].apply(lambda x : 'sostok '+ x + ' eostok')

#Splitting the Dataset twice to get 80% training data, 10% of validation data and 10% of test data
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val=train_test_split(np.array(df['article']),np.array(df['abstract']),test_size=0.2,random_state=0,shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size = 0.5, random_state = 0, shuffle = True)


NameError: name 'df' is not defined

In [33]:
vocab_size =200
src_txt_length =200
sum_txt_length = 200
# encoder input model
inputs = Input(shape=(src_txt_length,))
encoder1 = Embedding(vocab_size, 128)(inputs)
encoder2 = LSTM(128)(encoder1)
encoder3 = RepeatVector(sum_txt_length)(encoder2)
# decoder output model
decoder1 = LSTM(128, return_sequences=True)(encoder3)
outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder1)
# tie it together
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [35]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [37]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

NameError: name 'MAX_LENGTH' is not defined