# Initialize Notebook

In [1]:
import keras
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import layers as L
from tensorflow.keras import models as M
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm



In [2]:
print(tf.__version__)
print(keras.__version__)

2.10.0
2.10.0


# Load Training File

In [3]:
train = pd.read_csv('D:/AnacondaLibScript/CS5246 Text Mining/Project/CNN_train/train.csv')

## Use only Article size of length less than 2000 and highlights size of length less than 500.

This is too reduce the memory usage

In [4]:
TEXT_SIZE = 1700
SUMM_SIZE = 500

train = train[train['article'].apply(lambda x: len(x)<TEXT_SIZE)]

train = train[train['highlights'].apply(lambda x: len(x)<SUMM_SIZE)]
len(train)

22782

In [5]:
train = train.reset_index().drop(['index','id'], axis=1)

In [6]:
train.head(10)

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,"Kabul, Afghanistan (CNN) -- China's top securi...",China's top security official visited Afghanis...
2,"(CNN) -- Virgin, a leading branded venture cap...",The Virgin Group was founded by Richard Branso...
3,By . Chris Pleasance . Police are hunting for ...,Two men filmed taking iPad from canoe rental o...
4,Baghdad (CNN) -- Radical Iraqi cleric Muqtada ...,Muqtada al-Sadr has been in Iran since 2007 .\...
5,"PUBLISHED: . 07:04 EST, 9 January 2014 . | . U...","Zhu Sanni, 23, had been left alone at home for..."
6,"Kabul, Afghanistan (CNN) -- Thousands of bottl...",Official: Bottles are almost exclusively from ...
7,(CNN) -- Tour de France race director Christia...,The 2013 Tour de France will start from the Fr...
8,(CNN) -- Hundreds filed by a casket on Sunday ...,Wes Leonard collapsed after scoring a winning ...
9,Earlier this season I picked Thierry Henry as ...,Sportsmail columnist Martin Keown was honoured...


To maintain consistent input shapes for the model, sequences are padded with special tokens like <PAD> so that they all have the same length. Additionally, special tokens such as <START> and <END> are added at the beginning and end of the target sequences to clearly define their boundaries. After this preprocessing step, the data is ready to be used for training and inference in the seq2seq model.

In [7]:
X, y = np.array(train.iloc[:, 0:1]), np.array(train.iloc[:,1:2])
X, y = X.reshape(X.shape[0]), y.reshape(y.shape[0])

START = '<start>'
END = '<end>'
PAD = '<PAD>'

y = [f"{START} {text} {END}" for text in y]

In [8]:
size = -20
X_valid, y_valid = X[size:], y[size:]
X, y = X[:size], y[:size]

In [9]:
len(X), len(y)

(22762, 22762)

## Set tokenizer and print vocablulary size

In [10]:
source_token, target_token = Tokenizer(), Tokenizer()
source_token.fit_on_texts(X)
target_token.fit_on_texts(y)
start_id = target_token.word_index.get(START.strip('<>'))
end_id = target_token.word_index.get(END.strip('<>'))
pad_id = 0
in_vocab_size, out_vocab_size = len(source_token.word_index) + 1, len(target_token.word_index) + 1
in_vocab_size, out_vocab_size

(104377, 47213)

## Convert text to sequences, padding and finalizing Encoder Input (encoder_inputs), Decoder Input (decoder_inputs) and Target.

In [11]:
encoder_inputs = source_token.texts_to_sequences(X)
targets = target_token.texts_to_sequences(y)

In [12]:
find_len = lambda x : max([len(seq) for seq in x])+1
input_seq_len, output_seq_len = find_len(encoder_inputs), find_len(targets)
input_seq_len, output_seq_len

(331, 95)

In [13]:
encoder_inputs =np.array(pad_sequences(encoder_inputs, padding='post', truncating='post', maxlen = input_seq_len))

In [14]:
targets = pad_sequences(targets, padding='post', truncating='post', maxlen = output_seq_len)

In [15]:
decoder_inputs = np.array(targets[:, :-1])
targets =  np.array(targets[:, 1:])

In [16]:
in_vocab_size, out_vocab_size, input_seq_len, output_seq_len

(104377, 47213, 331, 95)

## Prepare Attention Mechanism, Encoder and Decoder

In [17]:
class BahdanauAttention(L.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = L.Dense(units)
        self.W2 = L.Dense(units)
        self.V = L.Dense(1)

    def call(self, query, values):
        query = tf.expand_dims(query, axis = 1)                
        score = self.V(tf.nn.tanh(self.W1(query) + self.W2(values)))  
        attention_weight = tf.nn.softmax(score, axis = 1)     
        context = attention_weight*values                      
        context_vector = tf.reduce_sum(context, axis = 1)     
        return context_vector, attention_weight

class Encoder(L.Layer):
    def __init__(self, in_vocab, embedding_dim, hidden_units):
        super(Encoder, self).__init__()
        self.embed = L.Embedding(in_vocab, embedding_dim)     
        self.lstm = L.LSTM(hidden_units, return_sequences=True,return_state = True) 

    def call(self, inputs):
        x = self.embed(inputs)                             
        enc_out, hidden_state, cell_state = self.lstm(x)    
        return enc_out, hidden_state, cell_state

class Decoder(L.Layer):
    def __init__(self, out_vocab, embedding_dim, hidden_units):
        super(Decoder, self).__init__()
        self.embed = L.Embedding(out_vocab, embedding_dim)  
        self.lstm = L.LSTM(hidden_units, return_sequences = True, return_state = True) 
        self.dense = L.Dense(out_vocab, activation='softmax')
        self.attention = BahdanauAttention(64)
    
    def call(self, inputs, hidden_state, cell_state, enc_output):
        x = self.embed(inputs)                               
        states = [hidden_state, cell_state] 
        context, attention_weights = self.attention(query = hidden_state, values = enc_output)
        dec_out, hidden_state, cell_state = self.lstm(x, initial_state=states)  
        dec_out = tf.squeeze(dec_out, axis=1)                 
      
        inputs = tf.concat([context, dec_out], axis=-1)        
        out = self.dense(inputs)                              
        return out, hidden_state, cell_state 

## Prepare Seq2Seq model

In [18]:
class Seq2Seq(M.Model):

    def __init__(self, in_vocab, out_vocab, embedding_dim, hidden_units, end_token):
        super(Seq2Seq, self).__init__()

        self.in_vocab = in_vocab
        self.out_vocab = out_vocab
        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        
        self.encoder = Encoder(in_vocab, embedding_dim, hidden_units)
        self.decoder = Decoder(out_vocab, embedding_dim, hidden_units)
        self.end_token = end_token
    
    @tf.function
    def train_step(self, inputs):
        (enc_inputs, dec_inputs), targets = inputs        
        
        with tf.GradientTape() as tape:
            enc_out, hidden_state, cell_state = self.encoder(enc_inputs)          
            seq_len = dec_inputs.shape[1]
            dec_out = tf.TensorArray(tf.float32, seq_len)
            mask = tf.TensorArray(tf.bool, size=seq_len)
            for timestep in tf.range(seq_len):
                timestep_input = dec_inputs[:, timestep:timestep+1]      
                timestep_output, hidden_state, cell_state = self.decoder(timestep_input, hidden_state, cell_state, enc_out)   
                dec_out = dec_out.write(timestep, timestep_output)
                is_end = tf.equal(targets[:, timestep], self.end_token) 
                mask = mask.write(timestep, tf.logical_not(is_end))
            dec_out = tf.transpose(dec_out.stack(), [1, 0, 2])
            sequence_mask = tf.transpose(mask.stack(), [1, 0])
            loss = self.compiled_loss(targets, dec_out, sample_weight=tf.cast(sequence_mask, tf.float32))   
        variables = self.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        self.compiled_metrics.update_state(targets, dec_out) 
        return {m.name : m.result() for m in self.metrics}

    @tf.function
    def call(self, inputs, training=False):
        enc_inputs, dec_inputs = inputs                       
        enc_out, hidden_state, cell_state = self.encoder(enc_inputs)   
        seq_len = tf.shape(dec_inputs)[1]
        dec_out = tf.TensorArray(tf.float32, seq_len)  
        for timestep in tf.range(seq_len):
            timestep_input = dec_inputs[:, timestep:timestep+1]       
            timestep_output, hidden_state, cell_state = self.decoder(timestep_input, hidden_state, cell_state, enc_out)  
            dec_out = dec_out.write(timestep, timestep_output)
        return tf.transpose(dec_out.stack(), [1, 0, 2])
    
    def generate(self, enc_inputs, max_len, start, end):
        enc_out, hidden_state, cell_state = self.encoder(enc_inputs)
        dec_in = tf.expand_dims([start], 0)             
        result = []
        for _ in range(max_len): 
            prediction_logits, hidden_state, cell_state = self.decoder(dec_in, hidden_state, cell_state, enc_out) 
            prediction = tf.argmax(prediction_logits, axis=-1)        
            if prediction == end:
                break
            result.append(prediction.numpy())
            dec_in = tf.expand_dims(prediction, 0) 
        return result

    
    def get_config(self):
        config = super(Seq2Seq, self).get_config()
        config.update({
              'in_vocab': self.in_vocab,
              'out_vocab': self.out_vocab,
              'embedding_dim': self.embedding_dim,
              'hidden_units': self.hidden_units
          })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(
            in_vocab=config['in_vocab'],
            out_vocab=config['out_vocab'],
            embedding_dim=config['embedding_dim'],
            hidden_units=config['hidden_units']
        )

In [19]:
model = Seq2Seq(in_vocab=in_vocab_size, out_vocab=out_vocab_size, embedding_dim=512, hidden_units=512, end_token=end_id)

In [20]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## Training Model

In [21]:
model.fit((enc_inputs, dec_inputs), targets, batch_size=32, epochs=40, validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x236387488b0>

In [22]:
model.save('D:/AnacondaLibScript/CS5246 Text Mining/Project/0504_40Epoch')



INFO:tensorflow:Assets written to: D:/AnacondaLibScript/CS5246 Text Mining/Project/0504_2k_40Epoch\assets


INFO:tensorflow:Assets written to: D:/AnacondaLibScript/CS5246 Text Mining/Project/0504_2k_40Epoch\assets


In [22]:
model= tf.keras.models.load_model('D:/AnacondaLibScript/CS5246 Text Mining/Project/0504_40Epoch')

## Model Inference

In [23]:
word_dict = {v : k for k,v in target_token.word_index.items()}

In [24]:
def generate_summary(model, enc_inputs, max_len, start, end):
    enc_out, hidden_state, cell_state = model.encoder(enc_inputs)
    dec_in = tf.expand_dims([start], 0)
    dec_in = tf.cast(dec_in, tf.int32)
    result = []
    for _ in range(max_len): 
        prediction_logits, hidden_state, cell_state = model.decoder(dec_in, hidden_state, cell_state, enc_out)
        prediction = tf.argmax(prediction_logits, axis=-1)
        if prediction == end:
            break
        result.append(prediction.numpy())
        dec_in = tf.expand_dims(prediction, 0)
        dec_in = tf.cast(dec_in, tf.int32)
    return result

In [25]:
def summarize(ind, model=model, source_tokenizer=source_token, target_tokenizer=target_token, source_max=input_seq_len, target_max=output_seq_len):
    text = source_tokenizer.texts_to_sequences([X[ind]])
    text = pad_sequences(text, maxlen=source_max, padding='post')
    text = tf.cast(text, tf.int32)
    model_output = generate_summary(model, text, output_seq_len, start_id, end_id)
    output_text = []
    for token_id in model_output:
        token_id = token_id[0] 
        if token_id == end_id:
            break
        word = word_dict.get(token_id, '')
        if word:
            output_text.append(word)
    print("Input Text")
    print(X[ind])
    print('\nInference')
    print(' '.join(output_text))
    print('\nExpected Highlights')
    print(y[ind][7:-5])

In [30]:
for i in range(3200,3206):  
    summarize(i, model) 
    print("-" * 50)

Input Text
(CNN) -- A passenger who landed at Tokyo's Narita airport over the weekend has ended up with a surprise souvenir courtesy of customs officials -- a package of cannabis. Sniffer dogs failed to find the cannabis after it had been slipped into a passenger's bag. A customs official hid the package in a suitcase belonging to a passenger arriving from Hong Kong as part of an exercise for sniffer dogs on Sunday, Reuters.com reported. However, staff then lost track of the drugs and suitcase during the exercise, a spokeswoman for Tokyo customs said. Customs regulations specify that a training suitcase be used for such exercises, but the official had used passengers' suitcases for similar purposes in the past, domestic media reported. Tokyo customs has asked anyone who finds the package to return it.

Inference
customs official slips cannabis into passenger's bag to test sniffer dogs cannabis slips through the net with officials forced to ask for its return cannabis hidden in bag of u

## Metric Evaluation

In [26]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
from tqdm.notebook import tqdm

In [31]:
test_dataset = pd.read_csv('D:/AnacondaLibScript/CS5246 Text Mining/Project/CNN_test/test.csv')
test_dataset = test_dataset[test_dataset['article'].apply(lambda x: len(x)<TEXT_SIZE)]

test_dataset = test_dataset[test_dataset['highlights'].apply(lambda x: len(x)<SUMM_SIZE)]

len(test_dataset)

11490

In [32]:
X_test, y_test = np.array(test_dataset.iloc[:, 0:1]), np.array(test_dataset.iloc[:,1:2])
X_test, y_test = X_test.reshape(X_test.shape[0]), y_test.reshape(y_test.shape[0])

START = '<start>'
END = '<end>'
PAD = '<PAD>'

y_test = [f"{START} {text} {END}" for text in y_test]

In [33]:
def evaluate_metrics(model, test_df, source_texts, target_texts, 
                    source_tokenizer, target_tokenizer, 
                    word_dict, start_id, end_id,
                    source_max_len, target_max_len,
                    num_samples=20):
    """
    Calculate BLEU and ROUGE scores for model-generated summaries
    
    Args:
        model: Your trained Seq2Seq model
        test_df: Pandas DataFrame containing test data
        source_texts: List of source texts (X_test)
        target_texts: List of target texts (y_test)
        source_tokenizer: Tokenizer for source texts
        target_tokenizer: Tokenizer for target texts
        word_dict: Dictionary mapping word IDs to words
        start_id: ID of start token
        end_id: ID of end token
        source_max_len: Maximum length of source sequences
        target_max_len: Maximum length of target sequences
        num_samples: Number of samples to evaluate
    """
    # Initialize metrics
    smooth = SmoothingFunction().method1
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    all_bleu, all_rouge1, all_rouge2, all_rougeL = [], [], [], []
    
    # Select samples (or use all if num_samples is None)
    if num_samples is None:
        num_samples = len(test_df)
    sample_indices = range(min(num_samples, len(test_df)))
    
    for i in tqdm(sample_indices, desc="Evaluating"):
        # Get reference and generated summaries
        reference = target_texts[i][7:-5]  # Remove <start> and <end> tags
        
        # Generate model prediction
        text = source_tokenizer.texts_to_sequences([source_texts[i]])
        text = pad_sequences(text, maxlen=source_max_len, padding='post')
        text = tf.cast(text, tf.int32)
        
        model_output = generate_summary(model, text, target_max_len, start_id, end_id)
        
        # Convert model output to text
        generated = []
        for token_id in model_output:
            token_id = token_id[0]
            if token_id == end_id:
                break
            word = word_dict.get(token_id, '')
            if word:
                generated.append(word)
        generated = ' '.join(generated)
        
        # Tokenize for BLEU (split into words)
        ref_tokens = [reference.split()]
        gen_tokens = generated.split()
        
        # Calculate BLEU (using sentence_bleu since we're comparing one sentence at a time)
        bleu = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=smooth)
        all_bleu.append(bleu)
        
        # Calculate ROUGE
        scores = scorer.score(reference, generated)
        all_rouge1.append(scores['rouge1'].fmeasure)
        all_rouge2.append(scores['rouge2'].fmeasure)
        all_rougeL.append(scores['rougeL'].fmeasure)
    
    return {
        'BLEU': np.mean(all_bleu),
        'ROUGE-1': np.mean(all_rouge1),
        'ROUGE-2': np.mean(all_rouge2),
        'ROUGE-L': np.mean(all_rougeL),
        'num_samples': len(all_bleu)
    }

In [30]:
metrics = evaluate_metrics(
    model=model,
    test_df=test_dataset,  # or your test dataframe
    source_texts=X_test,
    target_texts=y_test,
    source_tokenizer=source_token,
    target_tokenizer=target_token,
    word_dict=word_dict,
    start_id=start_id,
    end_id=end_id,
    source_max_len=input_seq_len,
    target_max_len=output_seq_len,
    num_samples=None  # evaluate on 20 samples or set to None for all
)
print(f"1700 40")
print(f"BLEU: {metrics['BLEU']:.4f}")
print(f"ROUGE-1: {metrics['ROUGE-1']:.4f}")
print(f"ROUGE-2: {metrics['ROUGE-2']:.4f}")
print(f"ROUGE-L: {metrics['ROUGE-L']:.4f}")
print(f"Evaluated on {metrics['num_samples']} samples")

Evaluating:   0%|          | 0/1088 [00:00<?, ?it/s]

1700 40
BLEU: 0.0034
ROUGE-1: 0.2372
ROUGE-2: 0.0347
ROUGE-L: 0.1362
Evaluated on 1088 samples
