In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re

In [2]:
text = """A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery.

The spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate. The messages will be "unwrapped" by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs. It is the 17th year that the gallery has invited an artist to dress their Christmas tree. Artists who have decorated the Tate tree in previous years include Tracey Emin in 2002.

The plain green Norway spruce is displayed in the gallery's foyer. Its light bulb adornments are dimmed, ordinary domestic ones joined together with string. The plates decorating the branches will be auctioned off for the children's charity ArtWorks. Wentworth worked as an assistant to sculptor Henry Moore in the late 1960s. His reputation as a sculptor grew in the 1980s, while he has been one of the most influential teachers during the last two decades. Wentworth is also known for his photography of mundane, everyday subjects such as a cigarette packet jammed under the wonky leg of a table. """

In [3]:
#lemma
def lemmatization_func(text:str) -> str:
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens] 
    lemma_word = []
    wordnet_lemmatizer = WordNetLemmatizer()
    for w in tokens:
        word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
        word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
        word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
        lemma_word.append(word3)
    preprocessed_text = ' '.join(lemma_word)

    return preprocessed_text

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stopWords = set(stopwords.words("english"))
lemma_text = lemmatization_func(text)
words = word_tokenize(text)
sentences = sent_tokenize(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# 1.Origin WordCount Method

* [easy WordCount website link](https://www.mygreatlearning.com/blog/text-summarization-in-python/)

In [None]:
freqTable = dict()
for word in words:
  word = word.lower()
  if word in stopWords:
    continue
  if word in freqTable:
    freqTable[word] += 1
  else:
    freqTable[word] = 1

sentences = sent_tokenize(text)
sentenceValue = dict()


In [None]:
for sentence in sentences:
  for word, freq in freqTable.items():
    if word in sentence.lower():
      if sentence in sentenceValue:
        sentenceValue[sentence] += freq
      else:
        sentenceValue[sentence] = freq

sumValues = 0
for sentence in sentenceValue:
  sumValues += sentenceValue[sentence]

average = int(sumValues / len(sentenceValue))

summary_origin_WordCount = ''
for sentence in sentences:
  if(sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
    summary_origin_WordCount += " " + sentence
print(summary_origin_WordCount)

 A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery. The messages will be "unwrapped" by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs.


In [None]:
candidate_origin_WordCount = summary_origin_WordCount

# 2.New WordCount Method




In [None]:
def create_freq_table(text_string):
  words = word_tokenize(text_string)  
  ps = PorterStemmer()
  freq_table = {}
  for word in words:
      #stem word 
      word = ps.stem(word)
      
      #remove stopwords
      if word in stopWords: 
        continue
      elif word in freq_table:
        freq_table[word] += 1
      else:
        freq_table[word] = 1
          
  return freq_table
#freq_table = create_freq_table(" ".join(sentences))

In [None]:
def score_sentences(sentences, freq_table):
  sentence_value = {}
  for sentence in sentences:
    word_count_in_sentence = len(word_tokenize(sentence))
    for wordValue in freq_table:
      if wordValue.lower() in sentence.lower():                
        if sentence in sentence_value:
          sentence_value[sentence] += freq_table[wordValue]
        else:
          sentence_value[sentence] = freq_table[wordValue]

    sentence_value[sentence] = sentence_value[sentence] // word_count_in_sentence
    
  return sentence_value

In [None]:
def find_average_score(sentence_value):
  sum_values = 0
  for entry in sentence_value:
    sum_values += sentence_value[entry]
      
  average = int(sum_values/len(sentence_value))
  
  return average

In [None]:
def generate_summary(sentences, sentence_value, threshold):
  sentence_count = 0
  summary = ''
  for sentence in sentences:
    if sentence in sentence_value and sentence_value[sentence] > threshold:
      summary += " " + sentence
      sentence_count += 1
          
  return summary

In [None]:

freq_table = create_freq_table(text)
sentence_scores = score_sentences(sentences, freq_table)
threshold = find_average_score(sentence_scores)
summary_new_WordCount = generate_summary(sentences, sentence_scores, 1 * threshold)
print(re.sub('\n','',summary_new_WordCount))

 A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery. Artists who have decorated the Tate tree in previous years include Tracey Emin in 2002. The plates decorating the branches will be auctioned off for the children's charity ArtWorks.


# 3.TextRank with Sentence Embeddings

In [None]:
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [None]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embedding = embed(sentences)
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)
tf.disable_v2_behavior()
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embedding)

In [None]:
#generate cosine similarity matrix
sim_matrix = cosine_similarity(message_embeddings)
#create graph and generate scores from pagerank algorithms
nx_graph = nx.from_numpy_array(sim_matrix)
scores = nx.pagerank(nx_graph)
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
   
num_of_sentences = 5
    
summary_sentence_embedding = " ".join([i[1] for i in ranked_sentences[:num_of_sentences]])
print(summary_sentence_embedding)

A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery. The messages will be "unwrapped" by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs. Artists who have decorated the Tate tree in previous years include Tracey Emin in 2002. It is the 17th year that the gallery has invited an artist to dress their Christmas tree. The spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate.


# 4.Bert Extractive

In [9]:
!pip install bert-extractive-summarizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 34.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingfa

In [11]:
!pip install sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 28.2 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=48c7b4f0ab07b3b33934491427fb64b5a03f3cae999b899109f91cf7d489aded
  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53


In [13]:
from summarizer import Summarizer
model = Summarizer()


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery. The spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate. It is the 17th year that the gallery has invited an artist to dress their Christmas tree.


In [17]:
result = model(text)
summary_bert = "".join(result)
print(summary_bert)

A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery. The spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate. It is the 17th year that the gallery has invited an artist to dress their Christmas tree.


# Metric Evalutaion

In [15]:
!pip install rouge
from rouge import Rouge
ROUGE = Rouge()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [16]:
reference="The messages will be 'unwrapped' by sculptor Richard Wentworth, who is responsible for decorating the tree with broken plates and light bulbs.A Christmas tree that can receive text messages has been unveiled at London's Tate Britain art gallery.It is the 17th year that the gallery has invited an artist to dress their Christmas tree.The spruce has an antenna which can receive Bluetooth texts sent by visitors to the Tate.His reputation as a sculptor grew in the 1980s, while he has been one of the most influential teachers during the last two decades."
#candidate_new_WordCount = summary_new_WordCount


In [None]:
# just see recall
ROUGE.get_scores(summary_origin_WordCount, reference)

[{'rouge-1': {'f': 0.6666666621622086,
   'p': 0.972972972972973,
   'r': 0.5070422535211268},
  'rouge-2': {'f': 0.5384615343242604,
   'p': 0.9210526315789473,
   'r': 0.3804347826086957},
  'rouge-l': {'f': 0.6666666621622086,
   'p': 0.972972972972973,
   'r': 0.5070422535211268}}]

In [None]:
ROUGE.get_scores(summary_new_WordCount, reference)

[{'rouge-1': {'f': 0.4642857096444516,
   'p': 0.6341463414634146,
   'r': 0.36619718309859156},
  'rouge-2': {'f': 0.27737225836219304,
   'p': 0.4222222222222222,
   'r': 0.20652173913043478},
  'rouge-l': {'f': 0.41071428107302305,
   'p': 0.5609756097560976,
   'r': 0.323943661971831}}]

In [None]:
ROUGE.get_scores(summary_sentence_embedding, reference)

[{'rouge-1': {'f': 0.7910447711372245,
   'p': 0.8412698412698413,
   'r': 0.7464788732394366},
  'rouge-2': {'f': 0.7428571378703674,
   'p': 0.7831325301204819,
   'r': 0.7065217391304348},
  'rouge-l': {'f': 0.7761193980028959,
   'p': 0.8253968253968254,
   'r': 0.7323943661971831}}]

In [18]:
ROUGE.get_scores(summary_bert, reference)

[{'rouge-1': {'f': 0.6851851806807271, 'p': 1.0, 'r': 0.5211267605633803},
  'rouge-2': {'f': 0.6474820099125305,
   'p': 0.9574468085106383,
   'r': 0.4891304347826087},
  'rouge-l': {'f': 0.6851851806807271, 'p': 1.0, 'r': 0.5211267605633803}}]