In [1]:
import urllib.request
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from heapq import nlargest
from collections import defaultdict
import unicodedata
# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# 'stopwords' corpus.
for _resource in ('punkt', 'stopwords'):
  nltk.download(_resource)

# parse HTML
def parseHTML(page):
  """Build a BeautifulSoup tree from `page` using the "html.parser" backend.

  Args:
    page: HTML markup to parse.

  Returns:
    The BeautifulSoup object constructed from `page`.
  """
  parserName = "html.parser"
  parsedTree = BeautifulSoup(page, parserName)
  return parsedTree

def getText(soup):
  """Find the <section> elements that carry the article body.

  Args:
    soup: Object exposing a BeautifulSoup-style `find_all(name, attrs)` method.

  Returns:
    Whatever `soup.find_all` returns for 'section' tags filtered on the
    class attribute 'page-content--block_editor-content js--reframe'.
  """
  # The attribute filter must be a mapping of attribute name -> value. The
  # previous code passed a set literal ({'class', '...'}), which also matched
  # any <section class="class"> by accident.
  attributeFilter = {'class': 'page-content--block_editor-content js--reframe'}
  return soup.find_all('section', attributeFilter)

def tokenizeSentences(fullText):
  """Split `fullText` into sentences via NLTK's `sent_tokenize`.

  Args:
    fullText: The text to segment.

  Returns:
    The value returned by `sent_tokenize(fullText)`.
  """
  sentenceList = sent_tokenize(fullText)
  return sentenceList

def tokenizeWords(fullText):
  """Lower-case `fullText` and split it into tokens via NLTK's `word_tokenize`.

  Args:
    fullText: The text to tokenize.

  Returns:
    The value returned by `word_tokenize` for the lower-cased text.
  """
  loweredText = fullText.lower()
  tokenList = word_tokenize(loweredText)
  return tokenList


def removeStopWords(words):
  """Drop stop words and punctuation tokens from `words`.

  The exclusion set is the union of NLTK's English stop words, every
  character in `string.punctuation`, and a few extra quote/backslash symbols.

  Args:
    words: Iterable of tokens to filter.

  Returns:
    A new list with the tokens of `words` that are not excluded, in their
    original order.
  """
  extraSymbols = ['“', '’', '”', '`', "'", '\\']
  excluded = set(stopwords.words('english') + list(punctuation) + extraSymbols)
  return [token for token in words if token not in excluded]

def concatenateText(foundSections):
  """Join the text of every section into one NFKD-normalized string.

  Args:
    foundSections: Iterable of objects with a `.text` string attribute.

  Returns:
    The sections' texts, each normalized with Unicode NFKD, concatenated in
    order with no separator. An empty iterable yields "".
  """
  # NFKD does not delete characters: it replaces compatibility characters
  # with their decomposed equivalents (e.g. a non-breaking space becomes a
  # plain space). Joining once avoids repeated string re-allocation.
  return "".join(
    unicodedata.normalize('NFKD', section.text) for section in foundSections
  )


# Extractive summary of a web page: download the page, pull out the article
# text, score each sentence by the frequencies of the non-stop words it
# contains, and show the four best-scoring sentences in document order.

# opens up HTML of a webpage; the `with` block closes the HTTP response as
# soon as its body has been read
with urllib.request.urlopen("https://www.rogerebert.com/reviews/indiana-jones-and-the-dial-of-destiny-movie-review-2023") as response:
  page = response.read().decode("utf-8")

soup = parseHTML(page)

# Gets the text from section tags
foundSections = getText(soup)

fullText = concatenateText(foundSections)


# generates an array of sentences from fullText
sentences = tokenizeSentences(fullText)

# tokenize the words
words = tokenizeWords(fullText)

filteredWords = removeStopWords(words)

# gives frequency distribution of words
freq = FreqDist(filteredWords)

# get top 10 frequent words in freq (kept in a variable so the result is
# not silently discarded)
topWords = nlargest(10, freq, key = freq.get)

# ranks the sentences based on frequency of words in the total;
# only sentences containing at least one counted word receive an entry
ranking = defaultdict(int)
for i, sentence in enumerate(sentences):
  for w in word_tokenize(sentence.lower()):
    if w in freq:
      ranking[i] += freq[w]

# pick the 4 highest-scoring sentences, then restore their original order
sentenceIndexes = nlargest(4, ranking, key = ranking.get)
summary = [sentences[j] for j in sorted(sentenceIndexes)]
summary



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['The unsettling mix of good and bad starts in the first sequence, a flashback to the final days of World War II that features Indy (Harrison Ford) and a colleague named Basil Shaw (Toby Jones) trying to reclaim some of the historical artifacts being stolen by the fleeing Nazis.',
 'Indy escapes capture from a Nazi played by Thomas Kretschmann, but the important introduction here is that of a Nazi astrophysicist named Jurgen Voller (a de-aged Mads Mikkelsen), who discovers that, while looking for something called the Lance of Longinus, the Nazis have stumbled upon half of the Antikythera, or Archimedes’ Dial.',
 'Based on a real Ancient Greek item that could reportedly predict astronomical positions for decades, the dial is given the magical Indy franchise treatment in ways that I won’t spoil other than to say it’s not as explicitly religious as items like the Ark of the Covenant of The Holy Grail other than, as Voller says, it almost makes its owner God.',
 'From here, “The Dial of De