In [26]:
#importing Natural language toolkit
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
import nltk
nltk.download('stopwords')
import requests
from bs4 import BeautifulSoup
import re
from nltk import sent_tokenize
from nltk.corpus import stopwords  # Import NLTK stopwords
from collections import defaultdict

# Define the URL to scrape
url = "https://en.wikipedia.org/wiki/Alexander_the_Great"

# Send a request to the URL and get the HTML content
response = requests.get(url)
html_content = response.content

# Create BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Initialize a dictionary to store headings and corresponding paragraphs
headings_paragraphs = defaultdict(list)

def clean_text(text):
    # Remove numbers, spaces, symbols, and brackets
    cleaned_text = re.sub(r'\d+', '', text)  # Remove numbers
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra spaces
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove symbols and brackets
    return cleaned_text.strip()

# Retrieve headings and corresponding paragraphs
current_heading = None  # Initialize current_heading
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
    if element.name.startswith('h'):
        current_heading = element.get_text()
    elif current_heading and element.name == 'p':
        cleaned_paragraph = clean_text(element.get_text())
        if cleaned_paragraph:
            headings_paragraphs[current_heading].append(cleaned_paragraph)

# Function to remove stopwords from a list of words
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word.lower() not in stop_words]

# Function to summarize paragraphs
def summarize_paragraphs(paragraphs, num_sentences=2):
    summaries = []
    for paragraph in paragraphs:
        sentences = sent_tokenize(paragraph)
        sentences = [remove_stopwords(nltk.word_tokenize(sentence)) for sentence in sentences]  # Remove stopwords
        sentences = [' '.join(sentence) for sentence in sentences]  # Recreate sentences
        summary = ' '.join(sentences[:num_sentences])
        summaries.append(summary)
    return summaries

# Generate and print summaries while retaining headings
for heading, paragraphs in headings_paragraphs.items():
    print("\033[1m" + heading + "\033[0m")  # Highlighting the heading
    summaries = summarize_paragraphs(paragraphs)
    for summary in summaries:
        print(summary)
    print()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1mAlexander the Great[0m
Alexander III Macedon Ancient Greek Ἀλέξανδρος romanized Alexandros July BC June BC commonly known Alexander Greata king ancient Greek kingdom Macedona succeeded father Philip II throne BC age spent ruling years conducting lengthy military campaign throughout Western Asia Egypt age created one largest empires history stretching Greece northwestern India undefeated battle widely considered one historys greatest successful military commanders
age Alexander tutored Aristotle BC shortly assumption kingship Macedon campaigned Balkans reasserted control Thrace parts Illyria marching city Thebes subsequently destroyed battle Alexander led League Corinth used authority launch panHellenic project envisaged father assuming leadership Greeks conquest Persia
BC invaded Achaemenid Persian Empire began series campaigns lasted years Following conquest Asia Minor Alexander broke power Achaemenid Persia series decisive battles including Issus Gaugamela subsequently overthrew

In [28]:
def generate_summarization_prompts(headings_paragraphs):
    prompts = []
    for heading, paragraphs in headings_paragraphs.items():
        for paragraph in paragraphs:
            prompt = f"Summarize the content under the heading: '{heading}'. Content: {paragraph}"
            prompts.append(prompt)
    return prompts

# Generate and print summarization prompts
prompts = generate_summarization_prompts(headings_paragraphs)
for i, prompt in enumerate(prompts, start=1):
    print(f"Prompt {i}: {prompt}\n")


Prompt 1: Summarize the content under the heading: 'Alexander the Great'. Content: Alexander III of Macedon Ancient Greek Ἀλέξανδρος romanized Alexandros  July BC   June BC commonly known as Alexander the Greata was a king of the ancient Greek kingdom of Macedona He succeeded his father Philip II to the throne in BC at the age of  and spent most of his ruling years conducting a lengthy military campaign throughout Western Asia and Egypt By the age of  he had created one of the largest empires in history stretching from Greece to northwestern India He was undefeated in battle and is widely considered to be one of historys greatest and most successful military commanders

Prompt 2: Summarize the content under the heading: 'Alexander the Great'. Content: Until the age of  Alexander was tutored by Aristotle In BC shortly after his assumption of kingship over Macedon he campaigned in the Balkans and reasserted control over Thrace and parts of Illyria before marching on the city of Thebes wh