In [13]:
import requests

# going to use URL for frankenstein
book_url = "https://www.gutenberg.org/cache/epub/84/pg84.txt"
response = requests.get(book_url)
book_text = response.text

print(book_text[:500])

﻿The Project Gutenberg eBook of Frankenstein; Or, The Modern Prometheus
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are locate


In [14]:
def chunk_text(text, chunk_size=1000):  # here we estbalish size of each chunk
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

text_chunks = chunk_text(book_text)

print(f"Number of chunks after increasing chunk size: {len(text_chunks)}")
print(text_chunks[2])  # Printing an example chunk

Number of chunks after increasing chunk size: 447
h delight. Do you understand this
feeling? This breeze, which has travelled from the regions towards
which I am advancing, gives me a foretaste of those icy climes.
Inspirited by this wind of promise, my daydreams become more fervent
and vivid. I try in vain to be persuaded that the pole is the seat of
frost and desolation; it ever presents itself to my imagination as the
region of beauty and delight. There, Margaret, the sun is for ever
visible, its broad disk just skirting the horizon and diffusing a
perpetual splendour. There—for with your leave, my sister, I will put
some trust in preceding navigators—there snow and frost are banished;
and, sailing over a calm sea, we may be wafted to a land surpassing in
wonders and in beauty every region hitherto discovered on the habitable
globe. Its productions and features may be without example, as the
phenomena of the heavenly bodies undoubtedly are in those undiscovered
solitud

In [15]:
import warnings
from transformers import pipeline

warnings.filterwarnings("ignore", category=UserWarning)

# pre-trained model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [18]:
summaries = []
max_chunk_length = 1024 

for chunk in text_chunks[:30]: #using 30 chunks, computationally difficult to go through 400+
    
    if len(chunk) > 0:  # Ensure the chunk is not empty
        
        # If the chunk is longer than max_chunk_length, split it into smaller parts
        while len(chunk) > max_chunk_length:
            
            #  im using period as a split point that doesn't cut off sentences 
            
            split_point = chunk.rfind('.', 0, max_chunk_length)  # Find the last period within limit
            
            if split_point == -1:  # split at max_chunk_length (assuming there was no period)
                split_point = max_chunk_length

            try:
                #processing it here 
                summary = summarizer(chunk[:split_point], max_length=150, min_length=40, do_sample=False)
                summaries.append(summary[0]['summary_text'])
            except Exception as e:
                print(f"Error summarizing chunk: {e}") #was for debugging
            
            # Continuing with the remainder of the chunk
            chunk = chunk[split_point:]

        # Process remaining part of the chunk
        if len(chunk) > 0:
            try:
                summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
                summaries.append(summary[0]['summary_text'])
            except Exception as e:
                print(f"Error summarizing chunk: {e}")

print(f"Number of summaries generated: {len(summaries)}")

Number of summaries generated: 30


In [19]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(text_chunks[i], summaries[i]) for i in range(len(summaries))]

avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

print(f"Average ROUGE-1 Score: {avg_rouge1}")
print(f"Average ROUGE-2 Score: {avg_rouge2}")
print(f"Average ROUGE-L Score: {avg_rougeL}")

Average ROUGE-1 Score: 0.3952110314132218
Average ROUGE-2 Score: 0.34309992566916475
Average ROUGE-L Score: 0.3583509549091607
