In [2]:
import pandas as pd
import numpy as np
import nltk
import json
from tensorflow import keras
import tensorflow as tf

In [3]:
def read_jsonl(filename):
    data = []
    with open(filename, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

filename = 'train.jsonl'
jsonl_data = read_jsonl(filename)

df = pd.DataFrame(jsonl_data)

In [5]:
df

Unnamed: 0,id,text,summary
0,gigaword-train-0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,gigaword-train-1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,gigaword-train-2,australian shares closed down #.# percent mond...,australian stocks close down #.# percent
3,gigaword-train-3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,gigaword-train-4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy
...,...,...,...
999995,gigaword-train-999995,after proclaiming a special relationship with ...,indian leader vajpayee to meet with bush to di...
999996,gigaword-train-999996,a group of people expelled by the british from...,former residents of indian ocean island demand...
999997,gigaword-train-999997,a mix of profit-taking and cautiousness guided...,stocks lower in early trading
999998,gigaword-train-999998,"hungary 's air carrier , malev , has grounded ...",hungarian air carrier grounds flights to bosnia


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, LSTM, Dense

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
word_index = tokenizer.word_index

summary_sequences = tokenizer.texts_to_sequences(df['summary'])

In [8]:
maxlen = 15
data_pad = pad_sequences(sequences, maxlen=maxlen)

summary_pad = pad_sequences(summary_sequences, maxlen=maxlen)

embedding_dim = 50
vocab_size = len(word_index) + 1

In [9]:
vocab_size

78846

In [10]:
model_rnn = Sequential([
    Embedding(maxlen, embedding_dim),
    SimpleRNN(32, return_sequences=True),
    Dense(vocab_size, activation='softmax')
])
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [11]:
len(word_index)

78845

In [14]:
model_rnn.fit(data_pad, summary_pad, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1b614b425f0>

In [None]:
model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    LSTM(32, return_sequences=True),
    Dense(vocab_size, activation='softmax')
])

In [None]:
model_lstm.fit(data_pad, summary_pad, epochs=20, batch_size=32)

In [None]:
model_gru = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    GRU(32, return_sequences=True),
    Dense(vocab_size, activation='softmax')
])

In [None]:
model_gru.fit(data_pad, summary_pad, epochs=20, batch_size=32a)

In [None]:
predicted_rnn = model_rnn.predict(data_pad)
predicted_gru = model_gru.predict(data_pad)
predicted_lstm = model_lstm.predict(data_pad)

In [None]:
decoded_rnn = []
decoded_gru = []
decoded_lstm = []
for i in range(len(predicted_rnn)):
    decoded_rnn.append(' '.join([key for key, value in word_index.items() if np.argmax(predicted_rnn[i]) == value]))
    decoded_gru.append(' '.join([key for key, value in word_index.items() if np.argmax(predicted_gru[i]) == value]))
    decoded_lstm.append(' '.join([key for key, value in word_index.items() if np.argmax(predicted_lstm[i]) == value]))

In [None]:
print("RNN Summary:", decoded_rnn)
print("GRU Summary:", decoded_gru)
print("LSTM Summary:", decoded_lstm)

In [None]:
references = [[text.split()] for text in data['summary']]
hypotheses_rnn = [text.split() for text in decoded_rnn]
hypotheses_gru = [text.split() for text in decoded_gru]
hypotheses_lstm = [text.split() for text in decoded_lstm]

bleu_rnn = corpus_bleu(references, hypotheses_rnn)