# Introduction

Read website content and summarize 

In [1]:
# Warnings
import warnings
warnings.filterwarnings('ignore')

# BEGIN: fix Python or Notebook SSL CERTIFICATE_VERIFY_FAILED
import os, ssl
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context
# END: fix Python or Notebook SSL CERTIFICATE_VERIFY_FAILED

## Installing pre-requsite libraries

In [2]:
!python3 -m pip install --upgrade pip 



In [3]:
!pip install -U beautifulsoup4 nltk



### Import libraries

In [4]:
import urllib as url
import bs4 as bs
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download("stopwords")

import re, string, unicodedata
from heapq import nlargest
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

from IPython.display import HTML, display

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krishnamanchikalapudi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krishnamanchikalapudi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.
    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _score_sentences(sentences, freqTable) -> dict:
    """
    score a sentence by its words
    Basic algorithm: adding the frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        word_count_in_sentence_except_stop_words = 0
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                word_count_in_sentence_except_stop_words += 1
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        if sentence[:10] in sentenceValue:
            sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] / word_count_in_sentence_except_stop_words

        '''
        Notice that a potential issue with our score algorithm is that long sentences will have an advantage over short sentences. 
        To solve this, we're dividing every sentence score by the number of words in the sentence.
        
        Note that here sentence[:10] is the first 10 character of any sentence, this is to save memory while saving keys of
        the dictionary.
        '''

    return sentenceValue


def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

### Read public url content

In [6]:
content = "https://www.scu.edu/ethics-in-technology-practice/ethical-lenses/"
article = url.request.urlopen(content)
article = article.read()
# article

## Parsing the article
* Beautiful Soup documentation at https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [7]:
parse_article = bs.BeautifulSoup(article, 'html.parser')

### Extract Pragraphs from the content

In [8]:
paragraphs = parse_article.find_all('p')
article_text = ''
for p in paragraphs:
    article_text += p.text 
# article_text

## Tokenize the article content

In [9]:
tokens = word_tokenize(article_text)
# tokens

In [10]:
stop_words = stopwords.words('english')
punctuation = string.punctuation + '\n'
# punctuation

### Find frequent words in the article

In [11]:
word_frequencies = {}
for word in tokens:    
    if word.lower() not in stop_words:
        if word.lower() not in punctuation:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    
# word_frequencies

In [12]:
max_frequency = 1

if (len(word_frequencies) > 0):
    max_frequency = max(word_frequencies.values())

print(max_frequency)

47


In [13]:
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word]/max_frequency

# print(word_frequencies)

In [14]:
sent_token = sent_tokenize(article_text)
# sent_token

In [15]:
sentence_scores = {}
for sent in sent_token:
    sentence = sent.split(" ")
    for word in sentence:        
        if word.lower() in word_frequencies.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] = word_frequencies[word.lower()]
            else:
                sentence_scores[sent] += word_frequencies[word.lower()]

# sentence_scores

In [16]:
select_length = int(len(sent_token)*0.3)
select_length

40

In [17]:
summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)

# summary

In [18]:
final_summary = [word for word in summary]

summary = ' '.join(final_summary)

# print(f"Original article words: {len(article_text)}, Summary words: {len(summary)}\n\n")
# summary = summary.replace('\u200c', '')
str_en = summary.encode("ascii", "ignore")
summary = str_en.decode()

# remove urls
# summary = re.sub(r"http\S+", "", summary)
summary = summary.replace('http', ' http')
# summary = re.sub(r'^https?:\/\/.*[\r\n]*', '', summary)

## Original content Summary

In [19]:
display(HTML(summary))

In [20]:
# 1 Create the word frequency table
freq_table = _create_frequency_table(summary)

# We already have a sentence tokenizer, so we just need  to run the sent_tokenize() method to create the array of sentences.

# 2 Tokenize the sentences
sentences = sent_tokenize(summary)

# 3 Important Algorithm: score the sentences
sentence_scores = _score_sentences(sentences, freq_table)

# 4 Find the threshold
threshold = _find_average_score(sentence_scores)

# 5 Important Algorithm: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

print(f"Original article words: {len(article_text)}, Summary words: {len(summary)}\n\n")


Original article words: 28940, Summary words: 266




## Summary of Summary

In [21]:
display(HTML(summary))