In [1]:
!pip install transformers
!pip install torch
!pip install gradio
!pip install beautifulsoup4
!pip install requests


Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.114.1-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m607.4 kB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from

In [5]:
import re # Regular expressions, used to split the article text into sentences
from functools import lru_cache # Cache the summarization pipeline so the model is loaded only once

import gradio as gr # import Gradio library for creating web-based user interfaces
import requests # To make HTTP requests to retrieve web content.
import torch # import PyTorch library, which is commonly used for Deep Learning tasks
from bs4 import BeautifulSoup # import BeautifulSoup for parsing HTML & XML documnts
from transformers import pipeline # import pipeline to use pre-trained models

In [7]:


# Upper bound, in whitespace-separated words, for each chunk handed to the model.
_MAX_CHUNK_WORDS = 500

# Seconds to wait for the remote server before giving up on the download.
_REQUEST_TIMEOUT_SECONDS = 30


@lru_cache(maxsize=1)
def _load_summarizer():
    """Build the summarization pipeline once and reuse it for every request."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


def _split_into_chunks(article, max_chunk=_MAX_CHUNK_WORDS):
    """Pack the sentences of ``article`` into strings of at most ``max_chunk`` words."""
    # Split only where sentence-ending punctuation is followed by whitespace, so
    # tokens such as "3.14" are left intact.
    sentences = re.split(r'(?<=[.!?])\s+', article.strip())

    chunks = []
    current_words = []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue

        # Close the running chunk when this sentence would push it past the limit.
        if current_words and len(current_words) + len(words) > max_chunk:
            chunks.append(' '.join(current_words))
            current_words = []

        # A single sentence longer than the limit is cut into limit-sized pieces.
        while len(words) > max_chunk:
            chunks.append(' '.join(words[:max_chunk]))
            words = words[max_chunk:]

        current_words.extend(words)

    if current_words:
        chunks.append(' '.join(current_words))
    return chunks


def summarize_article(url, min_len, max_len):
    """Download a web page and return a summary of its headings and paragraphs.

    The text of every ``<h1>``, ``<h2>`` and ``<p>`` element is joined, split into
    chunks of at most ``_MAX_CHUNK_WORDS`` words on sentence boundaries, and each
    chunk is summarized separately; the per-chunk summaries are joined with spaces.

    Args:
        url: Address of the article to summarize.
        min_len: Minimum length passed to the summarizer as ``min_length``
            (converted with ``int``; capped at ``max_len`` if it is larger).
        max_len: Maximum length passed to the summarizer as ``max_length``
            (converted with ``int``).

    Returns:
        The combined summary text, or a string starting with ``"Error: "`` when the
        URL is blank, the download fails, the page has no text in the selected
        elements, or summarization raises.
    """
    try:
        if not url or not url.strip():
            return "Error: please enter an article URL"

        # UI sliders may deliver numeric values that are not plain ints.
        max_len = int(max_len)
        # The two sliders overlap, so the minimum can be set above the maximum.
        min_len = min(int(min_len), max_len)

        # Fetch the page; fail fast on a stalled server or an HTTP error status
        # instead of hanging or summarizing an error page.
        response = requests.get(url.strip(), timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()

        # Collect the text of the heading and paragraph elements.
        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.find_all(['h1', 'h2', 'p'])
        article = ' '.join(element.text for element in elements)

        chunks = _split_into_chunks(article)
        if not chunks:
            return "Error: no article text found at the given URL"

        # truncation=True asks the pipeline to clip an over-long chunk rather than fail.
        results = _load_summarizer()(
            chunks,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )

        # Stitch the per-chunk summaries into one string.
        return ' '.join(result['summary_text'] for result in results)

    # Report any failure (request, parsing, model) as text for the UI.
    except Exception as e:
        return f"Error: {str(e)}"


# Assemble the Gradio UI: one URL box and two length sliders feed summarize_article,
# and its return value is shown in a single text box.
url_box = gr.Textbox(label="Enter the article URL")
min_length_slider = gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length")
max_length_slider = gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
summary_box = gr.Textbox(label="Summary")

interface = gr.Interface(
    fn=summarize_article,
    inputs=[url_box, min_length_slider, max_length_slider],
    outputs=summary_box,
)

# Start the web app.
interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://af08b884a1a1ebb80b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


