In [2]:
import numpy as np
import pandas as pd

import re
import string
import csv
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Concatenate, TimeDistributed, Bidirectional, GRU
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils.vis_utils import plot_model
! pip install rouge
from rouge import Rouge
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the training split (train.csv) into a DataFrame.
train_csv_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv'
train_df = pd.read_csv(train_csv_path)

In [4]:
# Load the validation split (validation.csv) into a DataFrame.
val_csv_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv'
val_df = pd.read_csv(val_csv_path)

In [5]:
# Load the test split (test.csv) into a DataFrame.
test_csv_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv'
test_df = pd.read_csv(test_csv_path)

In [6]:
# Report (rows, columns) for each of the three splits, one split per line.
print(f"train: {train_df.shape} \nvalidation: {val_df.shape} \ntest: {test_df.shape}")

train: (287113, 3) 
validation: (13368, 3) 
test: (11490, 3)


In [7]:
# Preview the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


## Checking Null Values

In [8]:
# Count missing values per column of the training split.
null_counts = train_df.isna().sum()
null_counts

id            0
article       0
highlights    0
dtype: int64

## Checking Duplicates

In [9]:
# Number of training rows whose (article, highlights) pair repeats an earlier row.
duplicate_mask = train_df.duplicated(subset=['article', 'highlights'])
duplicate_mask.sum()

3098

In [10]:
# Keep only the first occurrence of each (article, highlights) pair.
dedup_columns = ['article', 'highlights']
train_df = train_df.drop_duplicates(subset=dedup_columns, keep='first')

In [11]:
# Drop the 'id' column and renumber rows 0..n-1 (the de-duplication above left gaps
# in the index). errors='ignore' keeps this cell idempotent: re-running it after
# 'id' is already gone is a no-op instead of a KeyError.
train_df = (
    train_df
    .drop(columns=['id'], errors='ignore')
    .reset_index(drop=True)
)

In [12]:
# Drop the 'id' column and reset the index to 0..n-1.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' is already
# gone is a no-op instead of a KeyError.
val_df = (
    val_df
    .drop(columns=['id'], errors='ignore')
    .reset_index(drop=True)
)

In [13]:
# Drop the 'id' column and reset the index to 0..n-1.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' is already
# gone is a no-op instead of a KeyError.
test_df = (
    test_df
    .drop(columns=['id'], errors='ignore')
    .reset_index(drop=True)
)

# Applying Extractive Summarization
### Goal: produce shorter inputs for the seq2seq model by keeping only the most important sentences.

In [14]:
# Preview the first five rows of the cleaned training split.
train_df.head(n=5)

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [24]:
# (rows, columns) of the test split after dropping 'id'.
n_test_rows, n_test_cols = test_df.shape
(n_test_rows, n_test_cols)

(11490, 2)

In [25]:
# Evaluation frame for the extractive summariser: the FULL test split (every row,
# not a 500-row sample), with 'article'/'highlights' exposed under the names
# 'text'/'summary' used by the cells below. Selecting the columns directly avoids
# a hard-coded row count that would silently go stale if the dataset changed.
small_df = test_df[['article', 'highlights']].rename(
    columns={'article': 'text', 'highlights': 'summary'}
)

In [26]:
# Preview the first five (text, summary) pairs.
small_df.head(n=5)

Unnamed: 0,text,summary
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [27]:
# Show the full text of the article stored under index label 0.
small_df.loc[0, 'text']

"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for sp

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
nltk.download('punkt')

from collections import Counter  # only this cell needs it


def _score_sentences(sentences, word_freq):
    """Map each sentence to the summed frequency of the words found in it.

    A word counts for a sentence when its lowercase form occurs anywhere in the
    lowercased sentence (substring test). Sentences in which no word occurs are
    left out of the result. If the same sentence string appears more than once,
    its score is accumulated once per occurrence.

    NOTE(review): the substring test also matches inside longer words, and
    `word_freq` may contain punctuation tokens. Both are kept as-is so that the
    ROUGE figures reported further down stay reproducible; consider switching
    to whole-token matching in a follow-up.
    """
    scores = {}
    for sentence in sentences:
        lowered = sentence.lower()  # computed once per sentence, not once per word
        score = sum(freq for word, freq in word_freq.items() if word in lowered)
        if score:  # every frequency is >= 1, so 0 means "no word matched"
            scores[sentence] = scores.get(sentence, 0) + score
    return scores


def summarize_extractive(text, stop_words, threshold=1.2):
    """Return a frequency-based extractive summary of `text`.

    Steps:
      1. Count lowercase tokens of `text` that are not in `stop_words`.
      2. Score every sentence with `_score_sentences`.
      3. Keep, in document order, the sentences whose score is strictly greater
         than `threshold` times the (integer-truncated) mean sentence score.

    Parameters
    ----------
    text : str
        Document to summarise.
    stop_words : collection of str
        Lowercase tokens to ignore when counting word frequencies.
    threshold : float, default 1.2
        Multiplier applied to the mean sentence score.

    Returns
    -------
    str
        The selected sentences, each preceded by a single space; '' when no
        sentence could be scored or none exceeds the threshold.
    """
    word_freq = Counter(
        token
        for token in (raw.lower() for raw in word_tokenize(text))
        if token not in stop_words
    )
    sentences = sent_tokenize(text)
    sentence_scores = _score_sentences(sentences, word_freq)

    # Nothing scorable (e.g. empty text): return an empty summary instead of
    # dividing by zero when computing the mean below.
    if not sentence_scores:
        return ''

    average = int(sum(sentence_scores.values()) / len(sentence_scores))
    cutoff = threshold * average
    return ''.join(
        " " + sentence
        for sentence in sentences
        if sentence in sentence_scores and sentence_scores[sentence] > cutoff
    )


# The stop-word set does not depend on the article, so build it once.
stopWords = set(stopwords.words("english"))

# One extractive summary per row of small_df, in row order.
nltk_generated_summaries = [
    summarize_extractive(article, stopWords) for article in small_df['text']
]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Pair every source article with the extractive summary generated for it;
# the list of summaries is in the same row order as small_df.
df_extractive_generated = pd.DataFrame({
    'article': small_df['text'],
    'summary_generated': nltk_generated_summaries,
})
df_extractive_generated.head()

Unnamed: 0,article,summary_generated
0,Ever noticed how plane seats appear to be gett...,"This week, a U.S consumer advisory group set ..."
1,A drunk teenage boy had to be rescued by secur...,"Rahul Kumar, 17, clambered over the enclosure..."
2,Dougie Freedman is on the verge of agreeing a ...,Freedman has stabilised Forest since he repla...
3,Liverpool target Neto is also wanted by PSG an...,Liverpool target Neto is also wanted by PSG a...
4,Bruce Jenner will break his silence in a two-h...,"The former Olympian and reality TV star, 65, ..."


In [32]:
# Persist (article, reference summary, generated summary) triples for inspection.
# newline='' is what the csv module requires of the file object: the writer emits
# its own line terminators and fields may contain embedded newlines. The explicit
# encoding keeps the output independent of the platform default.
with open('/kaggle/working/result_extractive_nltk.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Article', 'Original Summary', 'Model Output'])
    # Iterate the columns in lockstep instead of indexing with a hard-coded
    # row count, so the export always covers exactly the rows that exist.
    writer.writerows(
        zip(
            df_extractive_generated['article'],
            small_df['summary'],
            df_extractive_generated['summary_generated'],
        )
    )

In [33]:
# Average ROUGE-1 / ROUGE-2 / ROUGE-L of the generated summaries against the
# reference highlights. ignore_empty=True leaves out pairs where either side is
# an empty string, so articles with an empty generated summary do not contribute.
rouge = Rouge()
rouge_scores = rouge.get_scores(
    nltk_generated_summaries,
    small_df['summary'],
    avg=True,
    ignore_empty=True,
)
rouge_scores

{'rouge-1': {'r': 0.5701920292168526,
  'p': 0.1857392295771334,
  'f': 0.26429063736310904},
 'rouge-2': {'r': 0.24318744319483168,
  'p': 0.06681269810396065,
  'f': 0.09730580076169715},
 'rouge-l': {'r': 0.5331721041073377,
  'p': 0.17291010331051787,
  'f': 0.2463826396902989}}