##Extraction-Based Summarizer Scraped Wikipedia articles using Beautiful Soup

In [1]:
import bs4 as bs
import urllib.request
import re
import nltk
nltk.download('punkt')
import sys
import csv

#persian cuisine
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Iranian_cuisine')
article = scraped_data.read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Removing Square Brackets and Extra Spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
#any whitespace character \s+
article_text = re.sub(r'\s+', ' ', article_text)

In [3]:
# Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
#any whitespace character \s+
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

##Convert paragraphs to sentences

In [4]:
sentence_list = nltk.sent_tokenize(article_text)

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

###Loop to calculate the word frequencies. <br>Tokenize the sentences<br>if word is not a stopword and is in the word list, the count is added

In [6]:
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1


## Keys() method<br>The keys() method returns a view object. The view object contains the keys of the dictionary, as a list.

In [7]:
shoe = {
  "brand": "Nike",
  "series": "Air Max",
  "price": 100
}

var = shoe.keys()

print(var)


dict_keys(['brand', 'series', 'price'])


###When an item is added in the dictionary, the view object also gets updated:

In [9]:
shoe["color"] = "red"

print(var)

dict_keys(['brand', 'series', 'price', 'color'])


##Find weighted frequency of occurence

In [10]:
maximum_frequncy = max(word_frequencies.values())

for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [11]:
shoe = {
  "brand": "Nike",
  "series": "Air Max",
  "price": 100
}

var = shoe.values()

print(var)

dict_values(['Nike', 'Air Max', 100])


###Replace words with weighted frequency in sentences

In [12]:
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

###Heap queue <br>heap queue algorithm, also known as the priority queue algorithm<br>It makes it possible to view the data (words/scores) -  our heap, as a regular Python list<br>heapq.nlargest(n, iterable, key=None)

In [13]:
import heapq
summary_sentences = heapq.nlargest(5, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)
summary

'Typical Iranian cuisine includes a wide variety of dishes, including several forms of kebab, stew, soup, and pilaf dishes, as well as various salads, desserts, pastries, and drinks. Apart from dishes of rice with kebab or stew, there are various rice-based Iranian dishes cooked in the traditional methods of polow and dami. It is followed by six chapters on the preparation of various dishes: four on rice dishes, one on qalya, and one on āsh. Baluchi cuisine also includes several date-based dishes, as well as various types of bread. Rose water, a flavored water made by steeping rose petals in water, is also a traditional and common ingredient in many Iranian dishes.'