In [2]:
from google.colab import files
files.upload()

Saving summ.txt to summ (1).txt


{'summ.txt': b'Automatic summarization is the process of shortening a text document with software, in order to create a summary with the major points of the original document. Technologies that can make a coherent summary take into account variables such as length, writing style and syntax.\n\nAutomatic data summarization is part of machine learning and data mining. The main idea of summarization is to find a subset of data which contains the "information" of the entire set. Such techniques are widely used in industry today. Search engines are an example; others include summarization of documents, image collections and videos. Document summarization tries to create a representative summary or abstract of the entire document, by finding the most informative sentences, while in image summarization the system finds the most representative and important (i.e. salient) images. For surveillance videos, one might want to extract the important events from the uneventful context.\n\nThere are t

In [3]:
import numpy as np
import nltk

FILE_NAME = 'summ.txt'
TOP_SENTS = 5

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
with open(FILE_NAME, 'r') as f:
  text_original = f.read()

In [0]:
def relevant_pos_tag(pt):
  if pt.startswith('NN') or pt.startswith('VB') or pt.startswith('JJ'):
    return 1
  
  return 0

def get_pos_tag(pt):
  if pt.startswith('NN'):
    return nltk.corpus.wordnet.NOUN
  elif pt.startswith('VB'):
    return nltk.corpus.wordnet.VERB
  elif pt.startswith('JJ'):
    return nltk.corpus.wordnet.ADJ
  
  return 0

def get_tf(sent):
  word_freq = {}
  total_words = 0
  for w in sent:
    total_words += 1
    
    if w not in word_freq:
      word_freq[w] = 1
    else:
      word_freq[w] += 1
  
  tf = 0
  for w in sent:
    tf += word_freq[w] / total_words
      
  tf /= total_words
  
  return tf

def get_idf(sent, doc_freq, total_sents):
  idf = 0
  ws = 0
  for w in sent:
    if w in doc_freq:
      ws += 1
      idf += total_sents / doc_freq[w]
      
  idf /= ws
  idf = np.log10(idf)
  
  return idf

def get_tf_idf(tf, idf):
  return tf * idf

wn_lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [15]:
# Process text

text = nltk.tokenize.sent_tokenize(text_original) # get sentences from the text
text = [(i, s, s) for i, s in enumerate(text)] # tuple of sentences, (t1, t2) - t1 - processed sentence, t2 - original sentence

# text = [('Ana has some apples', 'Ana has some apples'), ('I am so good', 'I am so good')]
text = [(pos, nltk.pos_tag(nltk.tokenize.word_tokenize(t1)), t2)
            for pos, t1, t2 in text] # get pos tags
text = [(pos, [(w, pt) for w, pt in t1 if relevant_pos_tag(pt)], t2) for pos, t1, t2 in text] # keep only the relevant pos tags
text = [(pos, [wn_lemmatizer.lemmatize(w, pos=get_pos_tag(pt)) for w, pt in t1], t2) for pos, t1, t2 in text] # lemmatize words
text = [(pos, [w.lower() for w in t1], t2) for pos, t1, t2 in text] # convert every word to lowercase
text = [(pos, [w for w in t1 if w not in stopwords
                        and len(w) > 1], t2) for pos, t1, t2 in text] # remove stopwords and words with len <= 1

for pos, t1, t2 in text:
  print(pos, t2, t1)

0 Automatic summarization is the process of shortening a text document with software, in order to create a summary with the major points of the original document. ['automatic', 'summarization', 'process', 'shorten', 'text', 'document', 'software', 'order', 'create', 'summary', 'major', 'point', 'original', 'document']
1 Technologies that can make a coherent summary take into account variables such as length, writing style and syntax. ['technologies', 'make', 'coherent', 'summary', 'take', 'account', 'variable', 'length', 'write', 'style', 'syntax']
2 Automatic data summarization is part of machine learning and data mining. ['automatic', 'data', 'summarization', 'part', 'machine', 'learning', 'data', 'mining']
3 The main idea of summarization is to find a subset of data which contains the "information" of the entire set. ['main', 'idea', 'summarization', 'find', 'subset', 'data', 'contain', 'information', 'entire', 'set']
4 Such techniques are widely used in industry today. ['technique'

In [0]:
doc_freq = {} # in how many documents a word appears
total_sents = len(text)

for _, t1, _ in text:
  ws = set()
  for w in t1:
    ws.add(w)
    
  for w in ws:
    if w not in doc_freq:
      doc_freq[w] = 1
    else:
      doc_freq[w] += 1

In [20]:
text_tf_idf = [(pos, t1, t2, get_tf(t1), get_idf(t1, doc_freq, total_sents)) for pos, t1, t2 in text] # t3 - tf, t4 - idf
text_tfidf = [(pos, t1, t2, get_tf_idf(t3, t4)) for pos, t1, t2, t3, t4 in text_tf_idf] # t3 - tf * idf

text_tfidf.sort(key=lambda k: k[2], reverse=True) # sort by tfidf

print(text_tfidf)

[(7, ['salient', 'image'], 'salient) images.', 0.47100402651115664), (9, ['general', 'approach', 'automatic', 'summarization', 'extraction', 'abstraction'], 'There are two general approaches to automatic summarization: extraction and abstraction.', 0.16981422402672897), (3, ['main', 'idea', 'summarization', 'find', 'subset', 'data', 'contain', 'information', 'entire', 'set'], 'The main idea of summarization is to find a subset of data which contains the "information" of the entire set.', 0.09999999999999999), (1, ['technologies', 'make', 'coherent', 'summary', 'take', 'account', 'variable', 'length', 'write', 'style', 'syntax'], 'Technologies that can make a coherent summary take into account variables such as length, writing style and syntax.', 0.10108308501337605), (4, ['technique', 'use', 'industry', 'today'], 'Such techniques are widely used in industry today.', 0.2552973247674845), (12, ['summary', 'include', 'verbal', 'innovation'], 'Such a summary might include verbal innovation

In [27]:
summ = text_tfidf[:TOP_SENTS]
summ.sort(key=lambda k: k[0])
for i, (pos, t1, t2, t3) in enumerate(summ):
  print(t2)

Technologies that can make a coherent summary take into account variables such as length, writing style and syntax.
The main idea of summarization is to find a subset of data which contains the "information" of the entire set.
Such techniques are widely used in industry today.
salient) images.
There are two general approaches to automatic summarization: extraction and abstraction.
