In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import heapq
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
!pip install spacy



In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [15]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def F_first_K_Sents(tokenized_sents, f):
    # Select the first 'f' sentences
    return tokenized_sents[:f]

def SelectImpSentences(similarity_matrix, N, tokenized_sents):
    # Calculate scores for each sentence
    scores = [sum(similarity_matrix[i]) for i in range(len(similarity_matrix))]

    # Select the top 'N' sentences based on scores
    selected_indices = heapq.nlargest(N, range(len(scores)), key=scores.__getitem__)
    selected_sentences = [tokenized_sents[i] for i in selected_indices]
    return selected_sentences

def extractiveApproach(dataset, f, N):
    output_dataset = []

    for input_text in dataset:
        important_sentences = []

        tokenized_sents = nltk.sent_tokenize(input_text)
        tokenized_sents = [nltk.word_tokenize(sent) for sent in tokenized_sents]
        tokenized_sents = [remove_stopwords(tokens) for tokens in tokenized_sents]

        # Select the first 'f' sentences
        first_k_sents = F_first_K_Sents(tokenized_sents, f)

        # Calculate sentence embeddings and similarity matrix
        sentence_embeddings = [np.mean([token.vector for token in nlp(' '.join(tokens))], axis=0) for tokens in first_k_sents]

        similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)

        # Calculate scores for each sentence and select the top 'N' sentences
        selected_sentences = SelectImpSentences(similarity_matrix, N, first_k_sents)

        summary = ' '.join([' '.join(sent) for sent in selected_sentences])
        output_dataset.append(summary)

    return output_dataset

In [19]:
dataset = [
    "The Painted Table was more than fifty feet long, perhaps half that wide at its widest point, but less than four feet across at its narrowest.",
    "The pale sword came shivering through the air.",
    "They were on the far side when they heard the howl, a long rising wail that moved through the trees like a cold wind.",
    "In extractive text summarization, important sentences are selected from the original text to form the summary.",
    "In this example, we'll use an extractive approach to summarize these sentences.",
]

f = 3  # Number of sentences to consider
N = 2  # Number of sentences to select

summarized_dataset = extractiveApproach(dataset, f, N)
for i, summary in enumerate(summarized_dataset):
    print(f"Summary {i + 1}: {summary}")


Summary 1: Painted Table fifty feet long , perhaps half wide widest point , less four feet across narrowest .
Summary 2: pale sword came shivering air .
Summary 3: far side heard howl , long rising wail moved trees like cold wind .
Summary 4: extractive text summarization , important sentences selected original text form summary .
Summary 5: example , 'll use extractive approach summarize sentences .


In [20]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/NLP/Data/test.csv")
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [21]:
df = df[:1000]

In [23]:
summarized_dataset = extractiveApproach(df['highlights'], f, N)

df["Summary"] = summarized_dataset

df.head()

Unnamed: 0,id,article,highlights,Summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,Safety tests conducted planes leg room airline...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,Drunk teenage boy climbed lion enclosure zoo w...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Fiorentina goalkeeper Neto linked Liverpool Ar...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",interview also one Diane Sawyer 's first appea...


In [25]:
#drop "id" and "article" column
df.drop(columns = ["id", "article"])

Unnamed: 0,highlights,Summary
0,Experts question if packed out planes are put...,Safety tests conducted planes leg room airline...
1,Drunk teenage boy climbed into lion enclosure ...,Drunk teenage boy climbed lion enclosure zoo w...
2,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,Fiorentina goalkeeper Neto has been linked wit...,Fiorentina goalkeeper Neto linked Liverpool Ar...
4,"Tell-all interview with the reality TV star, 6...",interview also one Diane Sawyer 's first appea...
...,...,...
995,Transport for London used actors in the uncomf...,Transport London used actors uncomfortable cam...
996,WARNING: GRAPHIC CONTENT .\nThe week-long fest...,"week-long festival marks trial , crucifixion r..."
997,Floyd Mayweather and Manny Pacquiao fight in L...,Singer Mariah Carey paid visit Mayweather 's b...
998,ComRes survey for ITV shows Ukip falling behin...,ComRes survey ITV shows Ukip falling behind ke...


In [26]:
df.to_csv('summarized-data.csv', index = False)

In [27]:
#calculating the rouge score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def calculate_rouge_scores(generated_summaries, reference_summaries):
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    for generated, reference in zip(generated_summaries, reference_summaries):
        # Tokenize the generated and reference summaries
        gen_tokens = word_tokenize(generated)
        ref_tokens = word_tokenize(reference)

        # Calculate ROUGE-N scores
        rouge_1_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(1, 0, 0), smoothing_function=SmoothingFunction().method1))
        rouge_2_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(0.5, 0.5, 0), smoothing_function=SmoothingFunction().method1))

        # Calculate ROUGE-L score
        rouge_l_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(0, 1, 0), smoothing_function=SmoothingFunction().method1))

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_rouge_1, avg_rouge_2, avg_rouge_l

# Example usage
generated_summaries = df['Summary']
reference_summaries = df['highlights']

rouge_1, rouge_2, rouge_l = calculate_rouge_scores(generated_summaries, reference_summaries)
print("ROUGE-1 Score:", rouge_1)
print("ROUGE-2 Score:", rouge_2)
print("ROUGE-L Score:", rouge_l)

ROUGE-1 Score: 0.24218934480398008
ROUGE-2 Score: 0.19478248355097652
ROUGE-L Score: 0.15766847984180746
