#       Seq2Seq: Text Summarization with Keras
####


## Process
1. Preprocessing
2. Word2vec
3. Building Seq2Seq Architecture
4. Training with  BBC article&summary Dataset
5. Generate Summary with my_summarizer

## Step 1. Import Data

In [16]:
import numpy as np
import os
import pandas as pd
import re

In [None]:
pwd

'/content'

In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
news_category = ["business", "entertainment", "politics", "sport", "tech"]

row_doc = "/content/gdrive/My Drive/Colab Notebooks/NLP/Abstract Summarizer Bi-lstm/BBC News Summary/News Articles/"
summary_doc = "/content/gdrive/My Drive/Colab Notebooks/NLP/Abstract Summarizer Bi-lstm/BBC News Summary/Summaries/"
# row_doc = "C:\\Users\\hinish's laptop\\Desktop\\abstarct_summarizer_bi_lstm\\BBC News Summary\\News Articles\\"
# summary_doc = "C:\\Users\\hinish's laptop\\Desktop\\abstarct_summarizer_bi_lstm\\BBC News Summary\\Summaries\\"
#![](http://abigailsee.com/img/pointer-gen.png)

data={"articles":[], "summaries":[]}

In [12]:
import os
directories = {"news": row_doc, "summary": summary_doc}
row_dict = {}
sum_dict = {}

for path in directories.values():
    if path == row_doc:
        file_dict = row_dict
    else:
        file_dict = sum_dict
    dire = path
    for cat in news_category:
        category = cat
        files = os.listdir(dire + category)
        file_dict[cat] = files

In [13]:
row_data = {}
for cat in row_dict.keys():
    cat_dict = {}
    # row_data_frame[cat] = []
    for i in range(0, len(row_dict[cat])):
        filename = row_dict[cat][i]
        path = row_doc + cat + "/" + filename
        with open(path, "rb") as f:
            text = f.read()
            cat_dict[filename[:3]] = text
    row_data[cat] = cat_dict

In [None]:
sum_data = {}
for cat in sum_dict.keys():
    cat_dict = {}
    # row_data_frame[cat] = []
    for i in range(0, len(sum_dict[cat])):
        filename = sum_dict[cat][i]
        path = summary_doc + cat + "/" + filename
        with open(path, "rb") as f:
            text = f.read()
            cat_dict[filename[:3]] = text
    sum_data[cat] = cat_dict

In [None]:
news_business = pd.DataFrame.from_dict(row_data["business"], orient="index", columns=["row_article"])
news_business.head(10)

In [17]:
#  news_category = ["business", "entertainment", "politics", "sport", "tech"]
news_entertainment = pd.DataFrame.from_dict(row_data["entertainment"], orient="index", columns=["row_article"])
news_politics = pd.DataFrame.from_dict(row_data["politics"], orient="index", columns=["row_article"])
news_sport = pd.DataFrame.from_dict(row_data["sport"], orient="index", columns=["row_article"])
news_tech = pd.DataFrame.from_dict(row_data["tech"], orient="index", columns=["row_article"])

In [18]:
# summary data
summary_business = pd.DataFrame.from_dict(sum_data["business"], orient="index", columns=["summary"])
summary_entertainment = pd.DataFrame.from_dict(sum_data["entertainment"], orient="index", columns=["summary"])
summary_politics = pd.DataFrame.from_dict(sum_data["politics"], orient="index", columns=["summary"])
summary_sport = pd.DataFrame.from_dict(sum_data["sport"], orient="index", columns=["summary"])
summary_tech = pd.DataFrame.from_dict(sum_data["tech"], orient="index", columns=["summary"])

In [19]:
summary_business.head()

Unnamed: 0,summary
47,"b'""With this new order and new pricing in plac..."
10,"b""A US government claim accusing the country's..."
17,"b""On Tuesday, the company's administrator, tur..."
15,"b'In addition, if a flight is cancelled or del..."
43,b'Women will be employed in Saudi Arabia\'s fo...


In [20]:
business = news_business.join(summary_business, how='inner')
entertainment = news_entertainment.join(summary_entertainment, how='inner')
politics = news_politics.join(summary_politics, how='inner')
sport = news_sport.join(summary_sport, how='inner')
tech = news_tech.join(summary_tech, how='inner')

In [21]:
business = news_business.join(summary_business, how='inner')

In [22]:
business.head()

Unnamed: 0,row_article,summary
12,b'Indonesians face fuel price rise\n\nIndonesi...,"b""Indonesia's government has confirmed it is c..."
4,b'High fuel prices hit BA\'s profits\n\nBritis...,"b'Rod Eddington, BA\'s chief executive, said t..."
8,"b'India calls for fair trade rules\n\nIndia, w...","b""At a conference on developing enterprise hos..."
10,"b""Court rejects $280bn tobacco case\n\nA US go...","b""A US government claim accusing the country's..."
5,"b""Pernod takeover talk lifts Domecq\n\nShares ...","b""Pernod has reduced the debt it took on to fu..."


In [23]:
print("row", len(business.iloc[0,0]))
print("sum", len(business.iloc[0,1]))

row 1847
sum 834


In [24]:
list_df = [business, entertainment, politics, sport, tech]
length = 0
for df in list_df:
    length += len(df)

In [25]:
print("length of all data: ", length)

length of all data:  2225


In [26]:
bbc_df = pd.concat([business, entertainment, politics, sport, tech], ignore_index=True)
len(bbc_df)

2225

## Step 2. Preprocessing Text Data
1. Clean Text
2. Tokenize
3. Vocabrary
4. Padding
5. One-Hot Encoding
6. Reshape to (MAX_LEN, One-Hot Encoding DIM)

### 2-1. Clean Text

In [27]:
def cleantext(text):
    text = str(text)
    text=text.split()
    words=[]
    for t in text:
        if t.isalpha():
            words.append(t)
    text=" ".join(words)
    text=text.lower()
    text=re.sub(r"what's","what is ",text)
    text=re.sub(r"it's","it is ",text)
    text=re.sub(r"\'ve"," have ",text)
    text=re.sub(r"i'm","i am ",text)
    text=re.sub(r"\'re"," are ",text)
    text=re.sub(r"n't"," not ",text)
    text=re.sub(r"\'d"," would ",text)
    text=re.sub(r"\'s","s",text)
    text=re.sub(r"\'ll"," will ",text)
    text=re.sub(r"can't"," cannot ",text)
    text=re.sub(r" e g "," eg ",text)
    text=re.sub(r"e-mail","email",text)
    text=re.sub(r"9\\/11"," 911 ",text)
    text=re.sub(r" u.s"," american ",text)
    text=re.sub(r" u.n"," united nations ",text)
    text=re.sub(r"\n"," ",text)
    text=re.sub(r":"," ",text)
    text=re.sub(r"-"," ",text)
    text=re.sub(r"\_"," ",text)
    text=re.sub(r"\d+"," ",text)
    text=re.sub(r"[$#@%&*!~?%{}()]"," ",text)

    return text

In [28]:
for col in bbc_df.columns:
    bbc_df[col] = bbc_df[col].apply(lambda x: cleantext(x))

In [29]:
bbc_df.head()

Unnamed: 0,row_article,summary
0,face fuel price government has confirmed it is...,government has confirmed it is considering rai...
1,fuel prices hit airways has blamed high fuel p...,chief said the results were in a third quarter...
2,calls for fair trade which attends the meeting...,a conference on developing enterprise hosted b...
3,rejects tobacco us government claim accusing t...,us government claim accusing the biggest tobac...
4,takeover talk lifts in uk drinks and food firm...,has reduced the debt it took on to fund the se...


In [30]:
df.head()

Unnamed: 0,row_article,summary
3,"b""Microsoft seeking spyware trojan\n\nMicrosof...","b""Microsoft is investigating a trojan program ..."
4,b'Digital guru floats sub-$100 PC\n\nNicholas ...,"b'He said one laptop per child could be "" very..."
6,b'Wi-fi web reaches farmers in Peru\n\nA netwo...,b'The Agricultural Information Project for Far...
2,b'China net cafe culture crackdown\n\nChinese ...,"b""Laws on net cafe opening hours and who can u..."
5,b'Technology gets the creative bug\n\nThe hi-t...,"b'""We are hoping to understand the creative in..."


In [31]:
len_list =[]
for article in df.row_article:
    words = article.split()
    length = len(words)
    len_list.append(length)
max(len_list)

2969

### 2-2. Tokenizer
1. Tokenize and One-Hot : Tokenizer
2. Vocabraly: article and summary 15000 words
3. Padding: pad_sequences 1000 max_len
4. Reshape: manual max_len * one-hot matrix

In [33]:
import numpy as np
import os
import pandas as pd
import re

In [35]:
bbc_art_sum = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/NLP/Abstract Summarizer Bi-lstm/00_Extra Materials/cleaned_bbc_news.csv")
bbc_art_sum.drop("Unnamed: 0", axis=1, inplace=True)
bbc_art_sum.head()

Unnamed: 0,row_article,summary
0,continues rapid economy has expanded by a brea...,overall investment in fixed assets was still u...
1,deccan seals deccan has ordered airbus planes ...,government has given its backing to cheaper an...
2,job growth continues in us created fewer jobs ...,creation was one of last main concerns for the...
3,owner buys rival for retail giant federated de...,retail giant federated department stores is to...
4,secures giant japan is to supply japan airline...,chose the after carefully considering both it ...


In [36]:
articles = list(bbc_art_sum.row_article)
summaries = list(bbc_art_sum.summary)

### 2-2-1. Tokenize: text_to_word_sequence

In [37]:
from keras.preprocessing.text import Tokenizer
VOCAB_SIZE = 14999
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(articles)
article_sequences = tokenizer.texts_to_sequences(articles)
art_word_index = tokenizer.word_index
len(art_word_index)

23914

In [38]:
print(article_sequences[0][:20])
print(article_sequences[1][:20])
print(article_sequences[2][:20])

[1411, 2338, 248, 16, 3994, 22, 5, 6483, 165, 1359, 50, 966, 4, 120, 967, 176, 118, 505, 38, 2339]
[5211, 8881, 5211, 16, 2233, 3001, 3441, 6, 5, 217, 18, 60, 1270, 7874, 6, 1, 827, 5211, 11, 108]
[478, 196, 1411, 6, 54, 736, 2283, 498, 50, 164, 6, 24, 349, 17, 9, 1, 3322, 6, 5213, 11]


### 2-2-2. Vocabraly: article and summary 15000 words

In [39]:
art_word_index_1500 = {}
counter = 0
for word in art_word_index.keys():
    if art_word_index[word] == 0:
        print("found 0!")
        break
    if art_word_index[word] > VOCAB_SIZE:
        continue
    else:
        art_word_index_1500[word] = art_word_index[word]
        counter += 1

In [40]:
counter

14999

In [41]:
tokenizer.fit_on_texts(summaries)
summary_sequences = tokenizer.texts_to_sequences(summaries)
sum_word_index = tokenizer.word_index
len(sum_word_index)

23929

In [42]:
sum_word_index_1500 = {}
counter = 0
for word in sum_word_index.keys():
    if sum_word_index[word] == 0:
        print("found 0!")
        break
    if sum_word_index[word] > VOCAB_SIZE:
        continue
    else:
        sum_word_index_1500[word] = sum_word_index[word]
        counter += 1

In [43]:
counter

14999

### 2-2-3. Padding: pad_sequences 1000 max_len

In [44]:
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 1000
pad_art_sequences = pad_sequences(article_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

In [45]:
print(len(article_sequences[1]), len(pad_art_sequences[1]))

243 1000


In [46]:
pad_sum_sequences = pad_sequences(summary_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

In [47]:
print(len(summary_sequences[1]), len(pad_sum_sequences[1]))

90 1000


In [48]:
pad_art_sequences.shape

(2225, 1000)

In [49]:
pad_art_sequences

array([[1411, 2338,  248, ...,    0,    0,    0],
       [5211, 8881, 5211, ...,    0,    0,    0],
       [ 478,  196, 1411, ...,    0,    0,    0],
       ...,
       [ 421, 1337, 2012, ...,    0,    0,    0],
       [2164,  267, 1109, ...,    0,    0,    0],
       [   7,  284,    8, ...,    0,    0,    0]], dtype=int32)

### 2-2-4. Reshape: manual max_len * one-hot matrix

In [51]:
# unused
"""
encoder_inputs = np.zeros((2225, 1000), dtype='float32')
encoder_inputs.shape

decoder_inputs = np.zeros((2225, 1000), dtype='float32')
decoder_inputs.shape

for i, seqs in enumerate(pad_art_sequences):
    for j, seq in enumerate(seqs):
        encoder_inputs[i, j] = seq

for i, seqs in enumerate(pad_sum_sequences):
    for j, seq in enumerate(seqs):
        decoder_inputs[i, j] = seq
"""

"\nencoder_inputs = np.zeros((2225, 1000), dtype='float32')\nencoder_inputs.shape\n\ndecoder_inputs = np.zeros((2225, 1000), dtype='float32')\ndecoder_inputs.shape\n\nfor i, seqs in enumerate(pad_art_sequences):\n    for j, seq in enumerate(seqs):\n        encoder_inputs[i, j] = seq\n\nfor i, seqs in enumerate(pad_sum_sequences):\n    for j, seq in enumerate(seqs):\n        decoder_inputs[i, j] = seq\n"

In [52]:
decoder_outputs = np.zeros((2225, 1000, 15000), dtype='float32')
decoder_outputs.shape

(2225, 1000, 15000)

In [53]:
for i, seqs in enumerate(pad_sum_sequences):
    for j, seq in enumerate(seqs):
        decoder_outputs[i, j, seq] = 1.

In [54]:
decoder_outputs.shape

(2225, 1000, 15000)

### 2-2-5. Pre-trained word2vec and word2vec Matrix

In [None]:
embeddings_index = {}
with open('glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
def embedding_matrix_creater(embedding_dimention, word_index):
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimention))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
art_embedding_matrix = embedding_matrix_creater(200, word_index=art_word_index_1500)
art_embedding_matrix.shape

(15000, 200)

In [None]:
sum_embedding_matrix = embedding_matrix_creater(200, word_index=sum_word_index_1500)
sum_embedding_matrix.shape

(15000, 200)

In [None]:
encoder_embedding_layer = Embedding(input_dim = 15000,
                                    output_dim = 200,
                                    input_length = MAX_LEN,
                                    weights = [art_embedding_matrix],
                                    trainable = False)

In [None]:
decoder_embedding_layer = Embedding(input_dim = 15000,
                                    output_dim = 200,
                                    input_length = MAX_LEN,
                                    weights = [sum_embedding_matrix],
                                    trainable = False)

In [None]:
sum_embedding_matrix.shape

(15000, 200)

## Step 3. Building Encoder-Decoder Model

In [60]:
from numpy.random import seed
seed(1)

from sklearn.model_selection import train_test_split
import logging

!pip install chart_studio
import chart_studio.plotly as py
#Import the rest of the modules
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import pandas as pd
import pydot


import keras
from keras import backend as k
k.set_learning_phase(1)
from keras.preprocessing.text import Tokenizer
from keras import initializers
from keras.optimizers import RMSprop
from keras.models import Sequential,Model
from keras.layers import Dense,LSTM,Dropout,Input,Activation,Add,concatenate, Embedding, RepeatVector
# from keras.layers.advanced_activations import LeakyReLU,PReLU
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam




In [61]:
from keras.layers import TimeDistributed

In [62]:
# Hyperparams

MAX_LEN = 1000
VOCAB_SIZE =15000
EMBEDDING_DIM = 200
HIDDEN_UNITS = 200
VOCAB_SIZE = VOCAB_SIZE + 1

LEARNING_RATE = 0.002
BATCH_SIZE = 32
EPOCHS = 5

### Model 1. Simple LSTM Encoder-Decoder-seq2seq

In [None]:
"""
Simple LSTM Encoder-Decoder-seq2seq
"""
# encoder
encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_LSTM = LSTM(HIDDEN_UNITS)(encoder_embedding)
# decoder
decoder_inputs = Input(shape=(MAX_LEN, ))
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(200)(decoder_embedding)
# merge
merge_layer = concatenate([encoder_LSTM, decoder_LSTM])
decoder_outputs = Dense(units=VOCAB_SIZE+1, activation="softmax")(merge_layer) # SUM_VOCAB_SIZE, sum_embedding_matrix.shape[1]

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

In [None]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

### Model 2. Bidirectional LSTM Encoder-Decoder-seq2seq

In [None]:
"""
Bidirectional LSTM: Others Inspired Encoder-Decoder-seq2seq
"""
encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_LSTM = LSTM(HIDDEN_UNITS, return_state=True)
encoder_LSTM_R = LSTM(HIDDEN_UNITS, return_state=True, go_backwards=True)
encoder_outputs_R, state_h_R, state_c_R = encoder_LSTM_R(encoder_embedding)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)

final_h = Add()([state_h, state_h_R])
final_c = Add()([state_c, state_c_R])
encoder_states = [final_h, final_c]

"""
decoder
"""
decoder_inputs = Input(shape=(MAX_LEN,))
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(VOCAB_SIZE, activation='linear')
decoder_outputs = decoder_dense(decoder_outputs)

model= Model(inputs=[encoder_inputs,decoder_inputs], outputs=decoder_outputs)

### Model 3. Chatbot Inspired Encoder-Decoder-seq2seq

In [None]:
"""
Chatbot Inspired Encoder-Decoder-seq2seq
"""
encoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_LSTM = LSTM(HIDDEN_UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)

decoder_inputs = Input(shape=(MAX_LEN, ), dtype='int32',)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(HIDDEN_UNITS, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])

# dense_layer = Dense(VOCAB_SIZE, activation='softmax')
outputs = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

In [None]:
rmsprop = RMSprop(lr=0.01, clipnorm=1.)
model.compile(loss='mse', optimizer=rmsprop, metrics=["accuracy"])

In [None]:
# model 2
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 200)    3000000     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 200), (None, 320800      embedding_1[0][0]                
__________________________________________________________________________________________________
lstm_2 (LS

## Step 4. Training your model and Validate it

In [None]:
import numpy as np
num_samples = len(pad_sum_sequences)
decoder_output_data = np.zeros((num_samples, MAX_LEN, VOCAB_SIZE), dtype="int32")

In [None]:
# 3d Tensor of output
for i, seqs in enumerate(pad_sum_sequences):
    for j, seq in enumerate(seqs):
        if j > 0:
            decoder_output_data[i][j][seq] = 1

In [None]:
art_train, art_test, sum_train, sum_test = train_test_split(pad_art_sequences, pad_sum_sequences, test_size=0.2)

In [None]:
train_num = art_train.shape[0]
train_num

1780

In [None]:
target_train = decoder_output_data[:train_num]
target_test = decoder_output_data[train_num:]

In [None]:
history = model.fit([art_train, sum_train],
                     target_train,
                     epochs=EPOCHS,
                     batch_size=BATCH_SIZE,
                     validation_data=([art_test, sum_test], target_test))

Train on 1780 samples, validate on 445 samples
Epoch 1/5

#### Visualization

In [None]:
# Accuracy
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10, 6))
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss function
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# loading the model
with open('text_summary.json',"w").write(model.to_json())

# loading the weights
model.load_weights('text_summary.h5')
print("Saved Model!")