In [1]:
!pip install datasets -qq

In [2]:
import numpy as np
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
from tensorflow.keras.models import Model
import os
import re
import glob
import string
from collections import Counter
from sklearn.model_selection import train_test_split

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
class Seq2SeqAttention(Model):
    def __init__(self, num_encoder_tokens, num_decoder_tokens, latent_dim):
        super(Seq2SeqAttention, self).__init__()
        self.encoder_embed = Embedding(num_encoder_tokens, latent_dim)
        self.encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
        self.decoder_embed = Embedding(num_decoder_tokens, latent_dim)
        self.decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
        self.attention_layer = Attention()
        self.concat_layer = Concatenate(axis=-1)
        self.decoder_dense = Dense(num_decoder_tokens, activation='softmax')

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        encoder_embed = self.encoder_embed(encoder_inputs)
        encoder_outputs, state_h, state_c = self.encoder_lstm(encoder_embed)
        encoder_states = [state_h, state_c]

        decoder_embed = self.decoder_embed(decoder_inputs)
        decoder_outputs, _, _ = self.decoder_lstm(decoder_embed, initial_state=encoder_states)

        attention_result = self.attention_layer([encoder_outputs, decoder_outputs])

        decoder_concat_input = self.concat_layer([decoder_outputs, attention_result])

        decoder_outputs = self.decoder_dense(decoder_concat_input)
        return decoder_outputs


In [5]:
# dataset = load_dataset("xsum", "3.0.0")
# train_dataset = dataset["train"]
# val_dataset = dataset["validation"]
# test_dataset = dataset["test"]
train_dataset, dev_dataset, test_dataset = load_dataset("xsum",split=['train[:10000]', 'validation[:1000]', 'test[:1000]'])



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
train_dataset, dev_dataset, test_dataset

(Dataset({
     features: ['document', 'summary', 'id'],
     num_rows: 10000
 }), Dataset({
     features: ['document', 'summary', 'id'],
     num_rows: 1000
 }), Dataset({
     features: ['document', 'summary', 'id'],
     num_rows: 1000
 }))

In [7]:
documents = train_dataset[0]["document"]
summary = train_dataset[0]["summary"]

In [8]:
train_stories = [
    {
        'document': clean_text(example['document']),
        'summary': clean_text(example['summary'])
    } 
    for example in train_dataset
]

val_stories = [
    {
        'document': clean_text(example['document']), 
        'summary': clean_text(example['summary'])
    } 
    for example in dev_dataset
]

test_stories = [
    {
        'document': clean_text(example['document']), 
        'summary': clean_text(example['summary'])
    } 
    for example in test_dataset
]

In [9]:
print(f'Training stories: {len(train_stories)}')
print(f'Validation stories: {len(val_stories)}')
print(f'Test stories: {len(test_stories)}')

Training stories: 10000
Validation stories: 1000
Test stories: 1000


In [10]:
batch_size = 64
epochs = 100
latent_dim = 256
num_encoder_tokens = 1000  # Set this to the number of unique tokens in your input language
num_decoder_tokens = 1000  # Set this to the number of unique tokens in your target language
# Set your vocabulary size and maximum sequence length
vocab_size = 10000
max_seq_length = 500

In [11]:
# Initialize tokenizers for documents and summary
document_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
summary_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>', filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

In [12]:
# Fit the tokenizers on the training data
document_tokenizer.fit_on_texts([story['document'] for story in train_stories])
summary_tokenizer.fit_on_texts(['<start> ' + story['summary'] + ' <end>' for story in train_stories])

In [13]:
# Convert the text to sequences
document_sequences = document_tokenizer.texts_to_sequences([story['document'] for story in train_stories + val_stories + test_stories])
summary_sequences = summary_tokenizer.texts_to_sequences(['<start> ' + story['summary'] + ' <end>' for story in train_stories + val_stories + test_stories])

In [14]:
mean_doc_len = np.mean([len(doc) for doc in document_sequences])
mean_sum_len = np.mean([len(sum) for sum in summary_sequences])

In [15]:
print(f"Average length of doccument sequences is: {mean_doc_len:.2f} words")
print(f"Average length of summary sequences is: {mean_sum_len:.2f} words")

Average length of doccument sequences is: 378.25 words
Average length of summary sequences is: 23.45 words


In [16]:
# Pad the sequences
encoder_input_data = pad_sequences(document_sequences, maxlen=max_seq_length, padding='post', truncating='post')
decoder_input_data = pad_sequences(summary_sequences, maxlen=50, padding='post', truncating='post')

In [17]:
encoder_input_data.shape

(12000, 500)

In [18]:
decoder_input_data.shape

(12000, 50)

In [19]:
decoder_input_data[0]

array([   4, 3296,   54, 1178,   25, 1853,  251,    2,  119,  962,   10,
       1519,   10, 2680,   17, 1656,  555,   18, 1441, 2965,    3,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [20]:
# Shift the decoder target data by one timestep
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, 0:-1] = decoder_input_data[:, 1:]

In [21]:
decoder_target_data.shape

(12000, 50)

In [22]:
decoder_target_data[0]

array([3296,   54, 1178,   25, 1853,  251,    2,  119,  962,   10, 1519,
         10, 2680,   17, 1656,  555,   18, 1441, 2965,    3,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [23]:
# Convert the target data to one-hot encoding
decoder_target_data = to_categorical(decoder_target_data, num_classes=vocab_size)

In [24]:
decoder_target_data.shape

(12000, 50, 10000)

In [25]:
model = Seq2SeqAttention(num_encoder_tokens, num_decoder_tokens, latent_dim)

# Prepare the input and output tensors
encoder_inputs = Input(shape=(None,))
decoder_inputs = Input(shape=(None,))
outputs = model([encoder_inputs, decoder_inputs])

# Create the final model with the input and output tensors
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [26]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 seq2_seq_attention (Seq2SeqAtt  (None, None, 1000)  2075624     ['input_1[0][0]',                
 ention)                                                          'input_2[0][0]']                
                                                                                                  
Total params: 2,075,624
Trainable params: 2,075,624
Non-trainable params: 0
__________________

In [27]:
model.get_layer("seq2_seq_attention").summary()

Model: "seq2_seq_attention"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  256000    
                                                                 
 lstm (LSTM)                 multiple                  525312    
                                                                 
 embedding_1 (Embedding)     multiple                  256000    
                                                                 
 lstm_1 (LSTM)               multiple                  525312    
                                                                 
 attention (Attention)       multiple                  0         
                                                                 
 concatenate (Concatenate)   multiple                  0         
                                                                 
 dense (Dense)               multiple           

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=2,
          epochs=epochs,
          validation_split=0.2)

In [None]:
class Seq2SeqInference:
    def __init__(self, model, document_tokenizer, summary_tokenizer, max_seq_length):
        self.model = model
        self.document_tokenizer = document_tokenizer
        self.summary_tokenizer = summary_tokenizer
        self.max_seq_length = max_seq_length

    def generate_summary(self, input_text):
        # Tokenize and pad the input text
        input_seq = self.document_tokenizer.texts_to_sequences([input_text])
        input_seq = pad_sequences(input_seq, maxlen=self.max_seq_length, padding='post', truncating='post')
        
        # Initialize the decoder input with the <start> token
        start_token = self.summary_tokenizer.word_index['<start>']
        end_token = self.summary_tokenizer.word_index['<end>']
        target_seq = np.zeros((1, self.max_seq_length))
        target_seq[0, 0] = start_token

        decoded_sentence = []
        for i in range(1, self.max_seq_length):
            # Get the model prediction
            output_tokens = self.model.predict([input_seq, target_seq])[0, i - 1, :]

            # Sample a token
            sampled_token_index = np.argmax(output_tokens)
            
            # Check if the sampled token is the <end> token or not
            if sampled_token_index == end_token:
                break

            # Add the sampled token to the decoded sentence
            decoded_sentence.append(self.summary_tokenizer.index_word[sampled_token_index])

            # Update the target sequence (of length 1)
            target_seq[0, i] = sampled_token_index

        return ' '.join(decoded_sentence)


In [None]:
inference = Seq2SeqInference(model, document_tokenizer, summary_tokenizer,, max_seq_length)

In [None]:
input_text = "Some example news article text."
generated_summary = inference.generate_summary(input_text)
print("Generated summary:", generated_summary)
