In [4]:
import re
import heapq
from bs4 import BeautifulSoup
import urllib.request
from stop_words import get_stop_words

In [5]:
stop_words = get_stop_words('en')

In [6]:
def get_article_text(url, tag='p'):
    try:
        scraped_data = urllib.request.urlopen(url)
        article = scraped_data.read()
        parsed_article = BeautifulSoup(article, 'lxml')
        paragraphs = parsed_article.find_all(tag)
        article_text = " ".join([p.text for p in paragraphs])
        return article_text
    except Exception as e:
        print(f"Error fetching or parsing the article: {e}")
        return None

In [7]:
def clean_text(text):
    text = re.sub(r'[[0-9]*]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text.lower()

In [8]:
def calculate_word_frequencies(text):
    word_frequencies = {}
    
    for word in text.split():
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1
    
    max_frequency = max(word_frequencies.values())
    
    for word in word_frequencies:
        word_frequencies[word] /= max_frequency
    
    return word_frequencies

In [9]:
def generate_summary(sentences, sentence_scores, num_sentences):
    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary

In [10]:
def print_comparison(original_text, summary):
    print("\nOriginal Text:\n", original_text[:1000])  # Print the first 500 characters for brevity
    print("\nSummary:\n", summary)


In [11]:
def main():
    url = 'https://en.wikipedia.org/wiki/Reinforcement_learning'
    tag = 'p'  # Customize the HTML tag for content extraction
    article_text = get_article_text(url, tag)
    
    if article_text:
        article_text = clean_text(article_text)

        sentence_list = article_text.split()

        word_frequencies = calculate_word_frequencies(article_text)

        sentence_scores = {}
        for i, sent in enumerate(sentence_list):
            if len(sent.split(' ')) < 30:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies.get(sent, 0)

        num_sentences_in_summary = 7  # Customize the length of the summary
        summary = generate_summary(sentence_list, sentence_scores, num_sentences_in_summary)

        # Print the original and summarized text for comparison
        print_comparison(article_text, summary)

In [12]:
if __name__ == "__main__":
    main()


Original Text:
 reinforcement learning  rl  is an interdisciplinary area of machine learning and optimal control concerned with how an intelligent agent ought to take actions in a dynamic environment in order to maximize the cumulative reward  reinforcement learning is one of three basic machine learning paradigms  alongside supervised learning and unsupervised learning  reinforcement learning differs from supervised learning in not needing labelled input output pairs to be presented  and in not needing sub optimal actions to be explicitly corrected  instead the focus is on finding a balance between exploration  of uncharted territory  and exploitation  of current knowledge   the environment is typically stated in the form of a markov decision process  mdp   because many reinforcement learning algorithms for this context use dynamic programming techniques  the main difference between the classical dynamic programming methods and reinforcement learning algorithms is that the latter do 

  text = re.sub(r'[[0-9]*]', ' ', text)
