In [None]:
import urllib.request
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from heapq import nlargest
from collections import defaultdict
import unicodedata
# Download the NLTK data packages 'punkt' and 'stopwords' at import time.
# NOTE(review): presumably 'punkt' backs sent_tokenize/word_tokenize and
# 'stopwords' backs stopwords.words('english') used below -- confirm against
# the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

def parseHTML(page):
  """Parse an HTML document with Python's built-in "html.parser".

  page: the HTML markup to parse.
  Returns the BeautifulSoup object built from that markup.
  """
  soup = BeautifulSoup(markup=page, features="html.parser")
  return soup

def getText(soup):
  """Return every <section> tag that holds the review body.

  soup: a parsed document exposing find_all (e.g. a BeautifulSoup object).
  Returns whatever soup.find_all yields for <section> tags whose class
  attribute is 'page-content--block_editor-content js--reframe'.
  """
  # The attribute filter must be a dict ({name: value}). A set literal here
  # would be read as a collection of alternative class values instead.
  reviewAttrs = {'class': 'page-content--block_editor-content js--reframe'}
  return soup.find_all('section', reviewAttrs)

def tokenizeSentences(fullText):
  """Split fullText into sentences.

  fullText: the text to split.
  Returns the result of NLTK's sent_tokenize on fullText.
  """
  sentenceList = sent_tokenize(fullText)
  return sentenceList

def tokenizeWords(fullText):
  """Split fullText into lower-cased word tokens.

  fullText: the text to tokenize.
  Returns the result of NLTK's word_tokenize on the lower-cased text.
  """
  lowered = fullText.lower()
  return word_tokenize(lowered)


def removeStopWords(words):
  """Drop English stop words and punctuation tokens from words.

  words: an iterable of tokens; they are compared as-is (no case folding).
  Returns a new list with the remaining tokens in their original order.
  """
  # Extra quote/backslash tokens filtered in addition to string.punctuation.
  extraMarks = ['“', '’', '”', '`', "'", '\\']

  # Build the exclusion set: stop words, then each punctuation character,
  # then the extra marks.
  excluded = set(stopwords.words('english'))
  excluded.update(punctuation)
  excluded.update(extraMarks)

  return [token for token in words if token not in excluded]

def concatenateText(foundSections):
  """Join the text of every section into one NFKD-normalized string.

  foundSections: an iterable of objects with a .text attribute.
  Returns the normalized section texts concatenated in order, with no
  separator; an empty iterable gives "".
  """
  # NFKD applies compatibility decomposition: it does not delete characters,
  # it rewrites them (e.g. a non-breaking space becomes a plain space and the
  # "fi" ligature becomes the two letters "f" and "i").
  # Each section is normalized on its own, then joined in a single pass
  # rather than growing a string with repeated +=.
  return "".join(
    unicodedata.normalize('NFKD', section.text) for section in foundSections
  )


# Download the HTML of a review page.
# The URL can be changed to any desired movie review URL from rogerebert.com.
url = "https://www.rogerebert.com/reviews/indiana-jones-and-the-dial-of-destiny-movie-review-2023"
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
  page = response.read().decode("utf-8")

soup = parseHTML(page)

# Collect the <section> tags selected by getText.
foundSections = getText(soup)

fullText = concatenateText(foundSections)

# List of sentences found in fullText.
sentences = tokenizeSentences(fullText)

# Lower-cased word tokens of fullText.
words = tokenizeWords(fullText)

filteredWords = removeStopWords(words)

# Frequency distribution of the remaining words.
freq = FreqDist(filteredWords)

# The 10 most frequent words, bound to a name so the result is not discarded.
topWords = nlargest(10, freq, key=freq.get)

# Score each sentence by summing the frequencies of the words it contains.
# Tokens that were filtered out are not in freq and so contribute nothing.
ranking = defaultdict(int)
for i, sentence in enumerate(sentences):
  for w in word_tokenize(sentence.lower()):
    if w in freq:
      ranking[i] += freq[w]

# Take the 4 highest-scoring sentences and list them in document order.
sentenceIndexes = nlargest(4, ranking, key=ranking.get)
# NOTE(review): left as a bare final expression, presumably so the notebook
# shows it as the cell output -- it prints nothing when run as a script.
[sentences[j] for j in sorted(sentenceIndexes)]

