In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization

In [None]:
# Read the article/summary corpus, keep only the two text columns, and drop
# every row in which either of them is missing.
df = (
    pd.read_csv("article_highlights.csv")
    .loc[:, ['article', 'highlights']]
    .dropna()
)

In [None]:
def preprocess_text(text):
    """Normalise a raw string before it is handed to the tokenizer.

    The text is lower-cased, then a fixed sequence of regex substitutions is
    applied in order: parenthesised spans are removed, double quotes are
    removed, every character that is not a letter, a digit or one of
    ``? . ! ,`` becomes a space, and runs of whitespace collapse to a single
    space. The result is not stripped, so one leading or trailing space may
    remain.
    """
    substitutions = (
        (r'\([^)]*\)', ''),
        ('"', ''),
        (r'[^a-zA-Z?.!,\d]', ' '),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Run the same cleaning routine over both text columns (article first).
for column in ('article', 'highlights'):
    df[column] = df[column].apply(preprocess_text)

In [None]:
# Source side: learn the article vocabulary and convert every article into a
# list of integer word ids.
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(df['article'])
X_train = text_tokenizer.texts_to_sequences(df['article'])

# Pad at the end to one position more than the longest article.
article_token_counts = pd.Series(X_train).map(len)
max_article_length = article_token_counts.max() + 1
X_train_padded = pad_sequences(X_train, padding='post', maxlen=max_article_length)

In [None]:
# Wrap every summary in explicit start/end markers so the decoder has a token
# to begin from and a token that signals completion. Summaries that already
# start with the marker are left as they are, so re-running this cell on the
# same DataFrame does not wrap them a second time.
already_wrapped = df['highlights'].str.startswith("startofseq ")
df['highlights'] = df['highlights'].where(
    already_wrapped,
    "startofseq " + df['highlights'] + " endofseq",
)

# Target side: learn the summary vocabulary (markers included) and convert
# every summary into a list of integer word ids.
summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(df['highlights'])
Y_train = summary_tokenizer.texts_to_sequences(df['highlights'])

# Pad at the end to one position more than the longest summary.
max_highlights_length = pd.Series(Y_train).map(len).max()+1
Y_train_padded = pad_sequences(Y_train, maxlen=max_highlights_length, padding='post')
print(Y_train_padded.shape)

(8165, 57)


In [None]:
# Embedding / softmax sizes: one slot per known word plus one extra slot for
# id 0, the value pad_sequences fills padded positions with.
text_vocab_size = 1 + len(text_tokenizer.word_index)
summary_vocab_size = 1 + len(summary_tokenizer.word_index)

In [None]:
from tensorflow.keras import layers
class PositionalEncoding(layers.Layer):
    """Adds a fixed sinusoidal position signal to a batch of embeddings.

    A table with ``seq_len`` rows is computed once at construction time. Its
    columns are all sine terms followed by all cosine terms (concatenated,
    not interleaved). At call time the table is sliced to the input's
    sequence length and feature width and added to the input.

    Args:
        seq_len: Number of positions to precompute. Inputs are expected to be
            no longer than this, because the table is only sliced, never
            extended.
        d_model: Embedding width used to derive the frequencies.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, seq_len, d_model, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        # Plain ints so the values survive JSON serialisation in get_config
        # (seq_len may arrive as a numpy integer).
        self.seq_len = int(seq_len)
        self.d_model = int(d_model)
        # Pass an incoming mask (e.g. from Embedding(mask_zero=True)) through
        # unchanged instead of dropping it; adding positions does not alter
        # which timesteps are padding.
        self.supports_masking = True
        self.pos_encoding = self.positional_encoding(self.seq_len, self.d_model)

    def positional_encoding(self, seq_len, d_model):
        """Return the (seq_len, 2 * ceil(d_model / 2)) sine/cosine table."""
        position = tf.range(seq_len, dtype=tf.float32)[:, tf.newaxis]
        div_term = tf.exp(tf.range(0, d_model, 2, dtype=tf.float32) * -(tf.math.log(10000.0) / d_model))
        sin_vals = tf.math.sin(position * div_term)
        cos_vals = tf.math.cos(position * div_term)
        # For odd d_model this has one column too many; call() trims it.
        pos_encoding = tf.concat([sin_vals, cos_vals], axis=-1)
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        """Add the position table to ``inputs`` (indexed as batch, time, feature)."""
        sequence_length = tf.shape(inputs)[1]
        d_model = tf.shape(inputs)[2]

        # Trim the table to the actual sequence length and feature width.
        pos_encoding_resized = self.pos_encoding[:sequence_length, :d_model]
        # Leading axis of size 1 broadcasts the table over the batch.
        return inputs + tf.expand_dims(pos_encoding_resized, axis=0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(PositionalEncoding, self).get_config()
        config.update({"seq_len": self.seq_len, "d_model": self.d_model})
        return config


In [None]:
# Model parameters
d_model = 25     # embedding width
num_heads = 8    # number of attention heads
dff = 256        # hidden width of the position-wise feed-forward layers
from tensorflow.keras import layers

# ---- Encoder ----
# Input holds integer word ids, one row per article: (batch, article tokens).
encoder_inputs = Input(shape=(X_train_padded.shape[1],))
encoder_embedding = Embedding(text_vocab_size, d_model, mask_zero=True)(encoder_inputs)
pos_encoding_enc = PositionalEncoding(max_article_length, d_model)
encoder_inputs_with_pos = pos_encoding_enc(encoder_embedding)

# Self-attention sub-layer with dropout, residual connection and layer norm.
encoder_self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
attention = encoder_self_attention(encoder_inputs_with_pos, encoder_inputs_with_pos)
attention = layers.Dropout(0.2)(attention)
attention = layers.LayerNormalization(epsilon=1e-6)(encoder_inputs_with_pos + attention)

# Position-wise feed-forward sub-layer with residual connection and layer norm.
encoder_outputs = layers.Dense(dff, activation='relu')(attention)
encoder_outputs = layers.Dense(d_model)(encoder_outputs)
encoder_outputs = layers.LayerNormalization(epsilon=1e-6)(attention + encoder_outputs)



In [None]:
# ---- Decoder ----
# Input holds integer word ids of the (shifted) summary; length is left open
# so the same graph can be used for step-by-step generation.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(summary_vocab_size, d_model, mask_zero=True)(decoder_inputs)
pos_encoding_dec = PositionalEncoding(max_highlights_length, d_model)
decoder_inputs_with_pos = pos_encoding_dec(decoder_embedding)

# Masked self-attention. The causal mask is essential: the training target at
# position t is the decoder input at position t + 1, so without the mask the
# model can simply copy the answer from the next position and learns nothing
# that is usable at inference time, where future tokens do not exist.
attention1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(
    decoder_inputs_with_pos, decoder_inputs_with_pos, use_causal_mask=True
)
attention1 = layers.Dropout(0.2)(attention1)
attention1 = layers.LayerNormalization(epsilon=1e-6)(decoder_inputs_with_pos + attention1)

# Cross-attention: decoder positions query the encoder output.
attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(attention1, encoder_outputs)
attention2 = layers.Dropout(0.2)(attention2)
attention2 = layers.LayerNormalization(epsilon=1e-6)(attention1 + attention2)

# Position-wise feed-forward sub-layer with residual connection and layer norm.
decoder_outputs = layers.Dense(dff, activation='relu')(attention2)
decoder_outputs = layers.Dense(d_model)(decoder_outputs)
decoder_outputs = layers.Dropout(0.2)(decoder_outputs)
decoder_outputs = layers.LayerNormalization(epsilon=1e-6)(attention2 + decoder_outputs)



In [None]:
# Project every decoder position onto a probability distribution over the
# summary vocabulary.
vocab_projection = layers.Dense(summary_vocab_size, activation='softmax')
final_outputs = vocab_projection(decoder_outputs)

# Two-input model: article ids for the encoder, shifted summary ids for the decoder.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=final_outputs,
)
# Targets are integer word ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Teacher forcing: the decoder is fed each summary without its last position
# and is trained to predict the same summary without its first position,
# i.e. the next token at every step.
decoder_input_data, decoder_output_data = (
    Y_train_padded[:, :-1],
    Y_train_padded[:, 1:],
)

In [None]:
# Fit the model on the full data set.
#   x: article ids for the encoder and shifted summary ids for the decoder
#   y: next-token ids the decoder must predict
history = model.fit(
    x=[X_train_padded, decoder_input_data],
    y=decoder_output_data,
    epochs=100,
)

Epoch 1/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 220ms/step - accuracy: 0.5374 - loss: 4.2454
Epoch 2/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 216ms/step - accuracy: 0.8638 - loss: 1.0028
Epoch 3/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 221ms/step - accuracy: 0.9652 - loss: 0.3078
Epoch 4/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 217ms/step - accuracy: 0.9873 - loss: 0.1250
Epoch 5/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 219ms/step - accuracy: 0.9936 - loss: 0.0668
Epoch 6/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 219ms/step - accuracy: 0.9961 - loss: 0.0432
Epoch 7/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 222ms/step - accuracy: 0.9982 - loss: 0.0252
Epoch 8/100
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 227ms/step - accuracy: 0.9990 - loss: 0.0163
Epoch 9/

In [None]:
import numpy as np
import tensorflow as tf

def generate_summary(input_text, tokenizer, max_length, model, start_token, end_token,
                     input_tokenizer=None):
    """Greedily decode a summary for ``input_text``.

    Args:
        input_text: Raw article text.
        tokenizer: Tokenizer holding the summary vocabulary; used to turn the
            generated ids back into words.
        max_length: Maximum number of decoding steps. The decoder input grows
            by one id per step, so keep this within the number of positions
            the decoder's positional encoding was built for.
        model: Model that takes ``[encoder_ids, decoder_ids]`` and returns
            per-position probabilities over the summary vocabulary.
        start_token: Id (in the summary vocabulary) that starts decoding.
        end_token: Id (in the summary vocabulary) that stops decoding.
        input_tokenizer: Tokenizer holding the article vocabulary, used to
            encode ``input_text``. Defaults to the notebook-level
            ``text_tokenizer``, because encoding the article with the summary
            tokenizer would feed the encoder ids from the wrong vocabulary.

    Returns:
        The generated summary as a string, without the start marker.
    """
    if input_tokenizer is None:
        input_tokenizer = text_tokenizer

    # Clean the article exactly as the training articles were cleaned, then
    # encode it with the article vocabulary and pad to the encoder's length.
    cleaned_text = preprocess_text(input_text)
    input_seq = input_tokenizer.texts_to_sequences([cleaned_text])
    input_seq = pad_sequences(input_seq, maxlen=max_article_length, padding='post')

    # Start decoding with the start token and extend one id at a time.
    output_seq = [start_token]

    for _ in range(max_length):
        predictions = model.predict([input_seq, np.array([output_seq])], verbose=0)

        # Greedy choice: most probable word at the last decoded position.
        # Cast to a plain int so the id list stays homogeneous.
        next_token = int(np.argmax(predictions[0, -1, :]))

        # Stop if end token is generated
        if next_token == end_token:
            break

        output_seq.append(next_token)

    # Drop the leading start marker before converting ids back to text.
    summary = tokenizer.sequences_to_texts([output_seq[1:]])[0]

    return summary


In [None]:
# Summarise one sample article with the trained model and show the result.
sample_article = "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? "
sample_summary = generate_summary(
    sample_article,
    tokenizer=summary_tokenizer,
    max_length=50,
    model=model,
    start_token=summary_tokenizer.word_index["startofseq"],
    end_token=summary_tokenizer.word_index["endofseq"],
)
print(sample_summary)


startofseq jenna jenna are jayden eynaud off her baby bump jayden eynaud was edited out of the show at the 77th ee british academy film awards full season
