# Text summarization - Frequency-based algorithm

# Preprocessing the texts

In [1]:
import re 
import nltk 
import string
from utils.visualize import visualize

# Fetch the NLTK resources used below. Older NLTK releases load the tokenizer
# models from `punkt`; recent releases (3.9+) look for `punkt_tab` instead, so
# request both. nltk.download() reports a failed download instead of raising,
# so asking for a package the installed version does not know is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# English stop words; preprocess() filters them out.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\poseidon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poseidon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Sample text to summarize: eight short sentences about artificial intelligence.
# NOTE(review): an earlier note here said the word "machine" was added at the
# end of the last sentence, but the text below does not end that way - confirm.
original_text = """Artificial intelligence is human like intelligence. 
                   It is the study of intelligent artificial agents. 
                   Science and engineering to produce intelligent machines. 
                   Solve problems and have intelligence. 
                   Related to intelligent behavior. 
                   Developing of reasoning machines. 
                   Learn from mistakes and successes. 
                   Artificial intelligence is related to reasoning in everyday situations."""

# Collapse every run of whitespace (the newlines and indentation of the literal
# above) into a single space.
original_text = re.sub(r'\s+', ' ', original_text)

original_text

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial intelligence is related to reasoning in everyday situations.'

In [3]:
def preprocess(text):
  """Lowercase `text`, tokenize it and drop stop words and punctuation.

  Args:
    text: Raw text to clean.

  Returns:
    A single string with the remaining tokens joined by one space.
  """
  stop_words = set(stopwords)  # set lookup instead of scanning the list per token
  punctuation = set(string.punctuation)

  def is_punctuation(token):
    # `token in string.punctuation` is a substring test, so multi-character
    # tokens produced by the tokenizer ("``", "''", "...", "--") would slip
    # through; check the token character by character instead.
    return all(char in punctuation for char in token)

  tokens = [
      token
      for token in nltk.word_tokenize(text.lower())
      if token not in stop_words and not is_punctuation(token)
  ]
  return ' '.join(tokens)

In [4]:
# Clean the sample text with preprocess() and display the result.
formatted_text = preprocess(original_text)
formatted_text

'artificial intelligence human like intelligence study intelligent artificial agents science engineering produce intelligent machines solve problems intelligence related intelligent behavior developing reasoning machines learn mistakes successes artificial intelligence related reasoning everyday situations'

# Word frequency

In [5]:
# Count how many times each token of the preprocessed text occurs.
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

FreqDist({'intelligence': 4, 'artificial': 3, 'intelligent': 3, 'machines': 2, 'related': 2, 'reasoning': 2, 'human': 1, 'like': 1, 'study': 1, 'agents': 1, ...})

In [6]:
# Scale every count by the largest one, so the most frequent word weighs 1.0.
highest_frequency = max(word_frequency.values())

for word in list(word_frequency):
  word_frequency[word] /= highest_frequency

# Sentence tokenization

In [7]:
# Split the original (not preprocessed) text into sentences, so the summary can
# quote them as written.
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

['Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of reasoning machines.',
 'Learn from mistakes and successes.',
 'Artificial intelligence is related to reasoning in everyday situations.']

# Generate the summary (score for sentences)

In [8]:
# A sentence's score is the sum of the normalized frequencies of its tokens.
# Tokens are lowercased to match the keys of word_frequency; tokens that were
# filtered out during preprocessing look up as 0 and add nothing.
score_sentences = {}
for sentence in sentence_list:
  for word in nltk.word_tokenize(sentence.lower()):
    score_sentences[sentence] = score_sentences.get(sentence, 0) + word_frequency[word]

In [9]:
import heapq

# The three sentences with the highest score, best first.
best_sentences = heapq.nlargest(3, score_sentences, key = score_sentences.get)

print('Top-3 sentences with the highest score')
for rank, sentence in enumerate(best_sentences, start=1):
    print(f'Sentence {rank}: {sentence}')

# Create summary
summary = ' '.join(best_sentences)

Top-3 sentences with the highest score
Sentence 1: Artificial intelligence is human like intelligence.
Sentence 2: Artificial intelligence is related to reasoning in everyday situations.
Sentence 3: It is the study of intelligent artificial agents.


# Visualizing the summary in HTML

In [10]:
# Use the public IPython.display module and import display() explicitly rather
# than relying on the name the notebook injects.
from IPython.display import HTML, display

display(HTML('<h2>Summary</h2>'))

# Rebuild the text sentence by sentence, wrapping every selected sentence in
# <mark> so the summary is highlighted in its original context.
text = ''
for sentence in sentence_list:
  if sentence in best_sentences:
    text += f' <mark>{sentence}</mark>'
  else:
    text += ' ' + sentence

display(HTML(text))

# Extracting texts from the Internet

In [11]:
# Import Goose from the package root, the documented public path.
from goose3 import Goose

url = 'https://en.wikipedia.org/wiki/Automatic_summarization'
# Goose is a context manager: leaving the block closes the extractor and
# releases the resources it holds. The extracted article stays usable afterwards.
with Goose() as g:
  article = g.extract(url)

In [12]:
# Run the extracted article body through the same preprocessing as the sample text.
formatted_article = preprocess(article.cleaned_text)

In [13]:
def summarize(text, number_of_sentences, percentage = 0):
  """Score the sentences of `text` by word frequency and pick the best ones.

  Args:
    text: Raw text to summarize.
    number_of_sentences: Maximum number of sentences to select when
      `percentage` is not positive.
    percentage: Fraction of the sentences to select (e.g. 0.5). When greater
      than 0 it takes precedence over `number_of_sentences`.

  Returns:
    A tuple `(sentence_list, best_sentences, word_frequency, score_sentences)`:
    the sentences of `text` in order, the selected sentences from highest to
    lowest score, the word frequencies normalized by the highest count, and
    the score of every sentence that contains at least one counted word.
  """
  import heapq

  original_text = text
  formatted_text = preprocess(original_text)
  sentence_list = nltk.sent_tokenize(original_text)

  word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
  if not word_frequency:
    # Empty text, or nothing but stop words/punctuation: max() below would
    # raise on an empty sequence and there is nothing to rank.
    return sentence_list, [], word_frequency, {}

  highest_frequency = max(word_frequency.values())
  for word in list(word_frequency):
    word_frequency[word] /= highest_frequency

  score_sentences = {}
  for sentence in sentence_list:
    # preprocess() lowercases the text, so the frequency keys are lowercase;
    # lowercase the sentence too, or capitalized words would never be scored.
    for word in nltk.word_tokenize(sentence.lower()):
      if word in word_frequency:
        score_sentences[sentence] = score_sentences.get(sentence, 0) + word_frequency[word]

  if percentage > 0:
    how_many = int(len(sentence_list) * percentage)
  else:
    how_many = number_of_sentences
  best_sentences = heapq.nlargest(how_many, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences

In [14]:
# Summarize the extracted article, keeping up to 100 sentences (percentage keeps
# its default of 0, so the sentence count is what applies).
sentence_list, best_sentences, word_frequency, score_sentences = summarize(article.cleaned_text, 100)

In [15]:
# Hand the title, all sentences and the selected sentences to the project helper
# from utils.visualize (presumably renders the article with the summary
# highlighted - see that module to confirm).
visualize(article.title, sentence_list, best_sentences)

# Summarizing multiple texts

In [16]:
# URLs of the articles that the loop in the next cell extracts and summarizes.
article_list = ['https://en.wikipedia.org/wiki/Automatic_summarization',
                'https://en.wikipedia.org/wiki/Natural_language_processing',
                'https://en.wikipedia.org/wiki/Lemmatisation']

In [17]:
# Share one extractor across all URLs and close it when done, instead of
# creating a new, never-closed Goose for every article.
with Goose() as g:
  for url in article_list:
    article = g.extract(url)
    # percentage=0.5 takes precedence over the sentence count (100): half of
    # each article's sentences are selected.
    sentence_list, best_sentences, _, _ = summarize(article.cleaned_text, 100, percentage=0.5)
    visualize(article.title, sentence_list, best_sentences)