In [66]:
# !pip3 install requests
# !pip3 install beautifulsoup4
# !pip3 install nltk

In [67]:
import nltk

# Fetch the NLTK resources this notebook relies on so it runs on a fresh
# environment:
#   - 'punkt'     is used by nltk.sent_tokenize / nltk.word_tokenize
#   - 'stopwords' is used by nltk.corpus.stopwords.words('english')
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kamilorwat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:

from collections import Counter
import re
import requests
import bs4
from nltk.corpus import stopwords

Scraping and cleaning up the speech text, then asking the user for their summary preferences

In [69]:
# Download the page and flatten the text of every <p> element into one string.
url = "http://www.analytictech.com/mb021/mlk.htm"
# requests applies no timeout by default; bound the wait so the cell cannot hang.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = bs4.BeautifulSoup(page.text, "html.parser")
p_elems = [element.text for element in soup.find_all('p')]
speech = ' '.join(p_elems)

# Patch a garbled word in the page text (presumably a transcription/OCR typo).
# str.replace returns a new string, so the result has to be re-bound.
speech = speech.replace('(mowing', 'knowing')

# speech_edit is the word-level view used for frequency counting:
# collapse whitespace runs, then blank out everything that is not a letter.
speech_edit = re.sub(r'\s+', ' ', speech)
speech_edit = re.sub(r'[^a-zA-Z\s]', ' ', speech_edit)

# Keep asking until both answers are positive whole numbers.
# Validation is done by actually converting with int(): str.isdigit() accepts
# characters (e.g. superscript digits) that int() later rejects, which would
# let bad input through and fail further down the notebook.
while True:
    max_words_raw = input("How many words would you like to see in each sentence? ")
    num_sents_raw = input("How many sentences would you like to see? ")
    try:
        max_words = int(max_words_raw)
        num_sents = int(num_sents_raw)
    except ValueError:
        print("Please enter valid positive whole numbers.")
        continue
    # Zero or negative limits would produce an empty summary, so reject them too.
    if max_words > 0 and num_sents > 0:
        break
    print("Please enter valid positive whole numbers.")
    

Remove stop words (common non-indexing words) such as: the, as, is

In [70]:
def remove_stopwords(speech_edit: str) -> list[str]:
    """Return the words of ``speech_edit`` that are not English stop words.

    The text is split on whitespace. Each word is lower-cased only for the
    comparison against NLTK's English stop-word list; words that survive are
    returned in their original order and original casing.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept_words = []
    for token in speech_edit.split():
        if token.lower() in english_stops:
            continue
        kept_words.append(token)
    return kept_words

In [71]:
# Drop English stop words from the cleaned speech text.
speech_edit_no_stop = remove_stopwords(speech_edit)
# Preview the first ten remaining words as a quick sanity check.
speech_edit_no_stop[:10]

['happy',
 'join',
 'today',
 'go',
 'history',
 'greatest',
 'demonstration',
 'freedom',
 'history',
 'nation']

Calculating the frequencies of the remaining words

In [72]:
def calculate_frequencies(speech_edit_no_stop: list[str]) -> nltk.FreqDist:
    """Build a frequency distribution over the given words.

    The words are joined with single spaces, lower-cased, and re-tokenized
    with ``nltk.word_tokenize``; the resulting tokens are counted in an
    ``nltk.FreqDist``.
    """
    lowered_text = " ".join(speech_edit_no_stop).lower()
    return nltk.FreqDist(nltk.word_tokenize(lowered_text))

In [73]:
# Count how often each non-stop word occurs in the speech.
word_freq = calculate_frequencies(speech_edit_no_stop)
# Show the ten most frequent words with their counts.
word_freq.most_common(10)

[('freedom', 13),
 ('one', 12),
 ('ring', 12),
 ('dream', 11),
 ('let', 10),
 ('day', 9),
 ('negro', 8),
 ('today', 7),
 ('able', 7),
 ('every', 7)]

Applying scores to sentences and ranking them 

In [74]:
def score_sentences(speech: str, word_freq: nltk.FreqDist, max_words: int) -> dict[str, float]:
    """Score each sentence of ``speech`` by the average frequency of its tokens.

    Args:
        speech: Full text; it is split into sentences with ``nltk.sent_tokenize``.
        word_freq: Mapping from lower-cased token to its frequency. Tokens
            that are absent contribute 0.
        max_words: Maximum number of tokens (as returned by
            ``nltk.word_tokenize``) a sentence may have to be scored. Anything
            ``int()`` accepts is allowed, so the raw ``input()`` string works.

    Returns:
        A dict mapping every sentence to its score. Keys have internal
        whitespace (including line breaks from the source page) collapsed to
        single spaces so they print on one line. Sentences longer than
        ``max_words`` tokens are kept with a score of 0.0.

    Raises:
        ValueError: If ``max_words`` cannot be converted to an integer.
    """
    # Convert once instead of on every loop iteration.
    word_limit = int(max_words)
    scores: dict[str, float] = {}

    for raw_sentence in nltk.sent_tokenize(speech):
        # Tokenize the untouched sentence so scoring is unaffected by the
        # whitespace clean-up applied to the dictionary key below.
        words = nltk.word_tokenize(raw_sentence.lower())
        sentence = " ".join(raw_sentence.split())

        # Over-long sentences score 0.0; the emptiness check avoids a
        # division by zero if tokenization ever yields nothing.
        if not words or len(words) > word_limit:
            scores[sentence] = 0.0
            continue

        total = sum(word_freq.get(word, 0) for word in words)
        scores[sentence] = total / len(words)
    return scores

In [75]:
# Score every sentence of the raw speech against the word frequencies.
scores = score_sentences(speech, word_freq, max_words)

# Counter.most_common yields (sentence, score) pairs, highest score first;
# keep only as many as the user asked for.
counts = Counter(scores)
summary = counts.most_common(int(num_sents))

print("\nSummary: \n")
for sentence, sentence_score in summary:
    print(sentence)


Summary: 

From every mountainside, let freedom ring.
Let freedom ring from Lookout Mountain in Tennessee!
Let freedom ring from every hill and molehill in Mississippi.
Let freedom ring from the curvaceous slopes of California!
Let freedom ring from the snow capped Rockies of Colorado!
But one hundred years later the Negro is still not free.
From the
mighty mountains of New York, let freedom ring.
From the
prodigious hilltops of New Hampshire, let freedom ring.
And I say to you today my friends, let freedom ring.
I have a dream today.
It is a dream deeply rooted in the American
dream.
Free at
last!
Thank God almighty, we're free at last!"
We must not allow our creative protest to
degenerate into physical violence.
This is the faith that I go back to the
mount with.
