# Description:

Imagine reading and realize in the end this article is not what you are looking for. Feels bad. Don't fret, we got you covered. Use our "TLDR Bot" to help you figure out if a article is what you needed for.

Utilize TF-IDF to rank sentences based on importance and extract "useful" text.

# Import packages 

In [3]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import math
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get text from file


In [39]:
with open("testarticle.txt", "r", encoding="utf-8") as file:
    text = file.read()

text = sent_tokenize(text)
print(text)
total_documents = len(text)

['As I have mentioned on my previous post, I am going to implement TF-IDF of a text which is a biography of the Beatles.', 'Bag of Words is an effective model to demonstrate documents as numerical vectors, but it is not enough to go further than enumeration.', 'TF-IDF is a technique that measures how important a word in a given document.', 'TF (Term Frequency) measures the frequency of a word in a document.', 'IDF (Inverse Document Frequency) measures the rank of the specific word for its relevancy within the text.', 'Stop words which contain unnecessary information such as “a”, “into” and “and” carry less importance in spite of their occurrence.', 'Thus, the TF-IDF is the product of TF and IDF: In order to acquire good results with TF-IDF, a huge corpus is necessary.', 'In my example, I just used a small sized corpus.', 'Since I removed stop words, result was pleasant.', 'As my previous code piece, we start again by adding modules to use their methods.', 'In this example, we utilize S

## Calculate Frequency Matrix for words Per Sentence(or document)

In [40]:
def create_frequency_matrix(sentences) -> dict:
    # set stopwords and stemmer
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    
    #initialize freqency matrix
    frequency_matrix = {}
    keys = []
    value_table = []
    
    # get first 15 chars per sentence as key for frequency matrix
    keys = list(map(lambda x: x[:15], sentences))
    #print(keys)
    
    #create list of dicts for word frequency for each sentence
    for sentence in sentences:
        freq_table = {}
        # tokenize words: SPLITTING
        tokenized = word_tokenize(sentence)
        # clean words: REDUCING
        for word in tokenized:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        value_table.append(freq_table)
        #print(freq_table)
    return dict(zip(keys, value_table))

freq_matrix = create_frequency_matrix(text)
print(freq_matrix)

{'As I have menti': {'mention': 1, 'previou': 1, 'post': 1, ',': 1, 'go': 1, 'implement': 1, 'tf-idf': 1, 'text': 1, 'biographi': 1, 'beatl': 1, '.': 1}, 'Bag of Words is': {'bag': 1, 'word': 1, 'effect': 1, 'model': 1, 'demonstr': 1, 'document': 1, 'numer': 1, 'vector': 1, ',': 1, 'enough': 1, 'go': 1, 'enumer': 1, '.': 1}, 'TF-IDF is a tec': {'tf-idf': 1, 'techniqu': 1, 'measur': 1, 'import': 1, 'word': 1, 'given': 1, 'document': 1, '.': 1}, 'TF (Term Freque': {'tf': 1, '(': 1, 'term': 1, 'frequenc': 2, ')': 1, 'measur': 1, 'word': 1, 'document': 1, '.': 1}, 'IDF (Inverse Do': {'idf': 1, '(': 1, 'invers': 1, 'document': 1, 'frequenc': 1, ')': 1, 'measur': 1, 'rank': 1, 'specif': 1, 'word': 1, 'relev': 1, 'within': 1, 'text': 1, '.': 1}, 'Stop words whic': {'stop': 1, 'word': 1, 'contain': 1, 'unnecessari': 1, 'inform': 1, '“': 3, '”': 3, ',': 1, 'carri': 1, 'less': 1, 'import': 1, 'spite': 1, 'occurr': 1, '.': 1}, 'Thus, the TF-ID': {'thu': 1, ',': 2, 'tf-idf': 2, 'product': 1, 'tf':

# Create TF Matrix

In [41]:
def create_tf_matrix(freq_matrix):
    tf_matrix = {}
    
    for sentence, freq_table in freq_matrix.items():
        tf_table = {}
        
#         diff_words_in_sentence = len(freq_table)
        diff_words_in_sentence = sum([freq_table[t] for t in freq_table])
        
        for word, count in freq_table.items():
            tf_table[word] = count / diff_words_in_sentence
            
        tf_matrix[sentence] = tf_table
        
    return tf_matrix

tf_matrix = create_tf_matrix(freq_matrix)
print(tf_matrix)

{'As I have menti': {'mention': 0.09090909090909091, 'previou': 0.09090909090909091, 'post': 0.09090909090909091, ',': 0.09090909090909091, 'go': 0.09090909090909091, 'implement': 0.09090909090909091, 'tf-idf': 0.09090909090909091, 'text': 0.09090909090909091, 'biographi': 0.09090909090909091, 'beatl': 0.09090909090909091, '.': 0.09090909090909091}, 'Bag of Words is': {'bag': 0.07692307692307693, 'word': 0.07692307692307693, 'effect': 0.07692307692307693, 'model': 0.07692307692307693, 'demonstr': 0.07692307692307693, 'document': 0.07692307692307693, 'numer': 0.07692307692307693, 'vector': 0.07692307692307693, ',': 0.07692307692307693, 'enough': 0.07692307692307693, 'go': 0.07692307692307693, 'enumer': 0.07692307692307693, '.': 0.07692307692307693}, 'TF-IDF is a tec': {'tf-idf': 0.125, 'techniqu': 0.125, 'measur': 0.125, 'import': 0.125, 'word': 0.125, 'given': 0.125, 'document': 0.125, '.': 0.125}, 'TF (Term Freque': {'tf': 0.1, '(': 0.1, 'term': 0.1, 'frequenc': 0.2, ')': 0.1, 'measur

# Creating a table for documents per words 
### For each word, count how many times it appears in the documents

In [42]:
#calculate document frequency
def create_documents_per_words(freq_matrix):
    doc_per_word_table = {}
    
    for _, word_freq in freq_matrix.items():
        for word, freq in word_freq.items():
            if word in doc_per_word_table:
                doc_per_word_table[word] += 1
            else:
                doc_per_word_table[word] = 1
    
    return doc_per_word_table

doc_per_word = create_documents_per_words(freq_matrix)
print(doc_per_word)

{'mention': 2, 'previou': 2, 'post': 1, ',': 15, 'go': 4, 'implement': 2, 'tf-idf': 6, 'text': 5, 'biographi': 1, 'beatl': 1, '.': 34, 'bag': 1, 'word': 13, 'effect': 1, 'model': 1, 'demonstr': 1, 'document': 9, 'numer': 2, 'vector': 1, 'enough': 1, 'enumer': 1, 'techniqu': 1, 'measur': 4, 'import': 2, 'given': 3, 'tf': 4, '(': 3, 'term': 3, 'frequenc': 4, ')': 3, 'idf': 2, 'invers': 1, 'rank': 2, 'specif': 1, 'relev': 1, 'within': 1, 'stop': 3, 'contain': 2, 'unnecessari': 1, 'inform': 4, '“': 2, '”': 2, 'carri': 1, 'less': 1, 'spite': 1, 'occurr': 1, 'thu': 1, 'product': 1, ':': 2, 'order': 2, 'acquir': 2, 'good': 1, 'result': 2, 'huge': 1, 'corpu': 2, 'necessari': 2, 'exampl': 2, 'use': 5, 'small': 1, 'size': 1, 'sinc': 1, 'remov': 3, 'wa': 1, 'pleasant': 1, 'code': 2, 'piec': 1, 'start': 1, 'ad': 1, 'modul': 1, 'method': 2, 'thi': 1, 'util': 2, 'scikit-learn': 3, 'besid': 1, 'numpi': 1, 'panda': 1, 'regular': 2, 'express': 2, 'free': 1, 'machin': 1, 'learn': 1, 'librari': 1, 'pytho

# Create IDF matrix

In [43]:
def create_idf_matrix(freq_matrix, doc_per_word_, total_documents):
    idf_matrix = {}
    
    for sentence, freq_table in freq_matrix.items():
        idf_table = {}
        
        for word in freq_table.keys():
            idf_table[word] = math.log10(total_documents/ float(doc_per_word[word]))
            
        idf_matrix[sentence] = idf_table
        
    return idf_matrix

idf_matrix = create_idf_matrix(freq_matrix, doc_per_word, total_documents)
print(idf_matrix)

{'As I have menti': {'mention': 1.2430380486862944, 'previou': 1.2430380486862944, 'post': 1.5440680443502757, ',': 0.36797678529459443, 'go': 0.9420080530223133, 'implement': 1.2430380486862944, 'tf-idf': 0.7659167939666319, 'text': 0.8450980400142568, 'biographi': 1.5440680443502757, 'beatl': 1.5440680443502757, '.': 0.012589127308020467}, 'Bag of Words is': {'bag': 1.5440680443502757, 'word': 0.4301246920434389, 'effect': 1.5440680443502757, 'model': 1.5440680443502757, 'demonstr': 1.5440680443502757, 'document': 0.5898255349109508, 'numer': 1.2430380486862944, 'vector': 1.5440680443502757, ',': 0.36797678529459443, 'enough': 1.5440680443502757, 'go': 0.9420080530223133, 'enumer': 1.5440680443502757, '.': 0.012589127308020467}, 'TF-IDF is a tec': {'tf-idf': 0.7659167939666319, 'techniqu': 1.5440680443502757, 'measur': 0.9420080530223133, 'import': 1.2430380486862944, 'word': 0.4301246920434389, 'given': 1.066946789630613, 'document': 0.5898255349109508, '.': 0.012589127308020467}, '

# Calculate TF-IDF and generate a matrix

In [44]:
def create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}
    
    for (sentence1, freq_table1), (sentence2, freq_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        
        tf_idf_table = {}
        
        for (word1, value1), (word2, value2) in zip(freq_table1.items(), freq_table2.items()):
            tf_idf_table[word1] = float(value1 * value2)
            
        tf_idf_matrix[sentence1] = tf_idf_table
        
    return tf_idf_matrix

tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'As I have menti': {'mention': 0.11300345897148131, 'previou': 0.11300345897148131, 'post': 0.14036982221366143, ',': 0.033452435026781316, 'go': 0.08563709572930121, 'implement': 0.11300345897148131, 'tf-idf': 0.069628799451512, 'text': 0.07682709454675062, 'biographi': 0.14036982221366143, 'beatl': 0.14036982221366143, '.': 0.0011444661189109516}, 'Bag of Words is': {'bag': 0.11877446495002121, 'word': 0.033086514772572226, 'effect': 0.11877446495002121, 'model': 0.11877446495002121, 'demonstr': 0.11877446495002121, 'document': 0.045371194993150066, 'numer': 0.09561831143740727, 'vector': 0.11877446495002121, ',': 0.02830590656112265, 'enough': 0.11877446495002121, 'go': 0.07246215792479334, 'enumer': 0.11877446495002121, '.': 0.0009683944083092667}, 'TF-IDF is a tec': {'tf-idf': 0.09573959924582899, 'techniqu': 0.19300850554378446, 'measur': 0.11775100662778916, 'import': 0.1553797560857868, 'word': 0.05376558650542986, 'given': 0.13336834870382663, 'document': 0.07372819186386885,

# Score each sentence

In [45]:
def score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}
    
    for sentence, score_table in tf_idf_matrix.items():
        total_score = 0
        
        diff_words_in_sentence = len(score_table)
        # total_num_wrods_in_sentence = sum([freq_table[t] for t in freq_table])
        
        
        for word, score in score_table.items():
            total_score += score
            
        sentenceValue[sentence] = total_score / diff_words_in_sentence
        
    return sentenceValue;

sentenceScores = score_sentences(tf_idf_matrix)
print(sentenceScores)

{'As I have menti': 0.09334633949351677, 'Bag of Words is': 0.08517182574980796, 'TF-IDF is a tec': 0.10303932943622715, 'TF (Term Freque': 0.08890457705826113, 'IDF (Inverse Do': 0.07427497629773576, 'Stop words whic': 0.08128686531166951, 'Thus, the TF-ID': 0.07098008963552137, 'In my example, ': 0.13877298242224514, 'Since I removed': 0.10888674525486915, 'As my previous ': 0.11131050276076856, 'In this example': 0.09821146863389468, 'Scikit-learn is': 0.17958930895285732, 'We will utilize': 0.10699686511917869, 'TfidfTransforme': 0.10370627792653646, 'Data is fetched': 0.10359830811884323, 'Regular express': 0.11887453212781736, 'We use 4 parame': 0.1619399822796515, 'First one is st': 0.1046122725713087, '‘None’ can be g': 0.0754990093888466, 'In Scikit-learn': 0.10299635939413258, 'min_df paramete': 0.10397795074803873, 'max_df is the c': 0.19938965288212795, 'If the document': 0.098542863060243, 'ngram_range(x,y': 0.08149422807309707, 'x is for the mi': 0.12241228100666511, 'fit

# Calculating threshold value used for choosing important sentences

In [46]:
def find_average_score(sentenceScores) -> int:
    sumScores = sum([sentenceScores[entry] for entry in sentenceScores])
    average = (sumScores / len(sentenceScores))
    
    return average

threshold = find_average_score(sentenceScores)        
print(threshold)

0.11798479929412617


# Generate the summary

In [47]:
def generate_summary(sentences, sentenceScores, threshold):
    sentence_count = 0
    summary = ''
    
    for sentence in sentences:
        if sentence[:15] in sentenceScores and sentenceScores[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1
            
    return summary

summary = generate_summary(text, sentenceScores, threshold)
print(summary)

 In my example, I just used a small sized corpus. Scikit-learn is a free machine learning library for python. Regular expression helps separation of sentences using marks then sentences are enlisted under sentences object. We use 4 parameters in CountVectorizer method. max_df is the contrast of min_df parameter. x is for the minimum n value, y represents the maximum n value for n-grams. fit_transform returns transform version of sentences. I wanted share my experience on it. You can find full code from my repository. In our next article we are going to continue with implementing word2vec to make relationships between words.


# Test with dataset

In [49]:
# Get text from file
with open("testarticle.txt", "r", encoding="utf-8") as file:
    text = file.read()

text = sent_tokenize(text)
print("Text:")
print(text[:20], "\n")
total_documents = len(text)

# Calculate Frequency Matrix
freq_matrix = create_frequency_matrix(text)
# print("Frequency Matrix:")
# print(list(freq_matrix)[:5], "\n")

# Create Term Frequency Matrix
tf_matrix = create_tf_matrix(freq_matrix)
# print("Term Frequency Matrix:")
# print(list(tf_matrix)[:5], "\n")

# Create Document counts per word Matrix
doc_per_word = create_documents_per_words(freq_matrix)
# print("Number of documents each word appear in:")
# print(list(doc_per_word)[:5], "\n")

# Create Inverse Document Frequency Matrix
idf_matrix = create_idf_matrix(freq_matrix, doc_per_word, total_documents)
# print("IDF Matrix:")
# print(list(idf_matrix)[:5], "\n")

# Create TF-IDF matrix
tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
# print("TF-IDF Matrix:")
# print(list(tf_idf_matrix)[:5], "\n")

# Score each sentence
sentenceScores = score_sentences(tf_idf_matrix)
# print("Scores of each sentence:")
# print(list(sentenceScores)[:5], "\n")

# Calculate the threshold to select important sentences for summary
threshold = find_average_score(sentenceScores)
# print("threshold:")
# print(threshold, "\n")

# Generate the summary
summary = generate_summary(text, sentenceScores, threshold)
print("Summary:")
print(summary)

Text:
['As I have mentioned on my previous post, I am going to implement TF-IDF of a text which is a biography of the Beatles.', 'Bag of Words is an effective model to demonstrate documents as numerical vectors, but it is not enough to go further than enumeration.', 'TF-IDF is a technique that measures how important a word in a given document.', 'TF (Term Frequency) measures the frequency of a word in a document.', 'IDF (Inverse Document Frequency) measures the rank of the specific word for its relevancy within the text.', 'Stop words which contain unnecessary information such as “a”, “into” and “and” carry less importance in spite of their occurrence.', 'Thus, the TF-IDF is the product of TF and IDF: In order to acquire good results with TF-IDF, a huge corpus is necessary.', 'In my example, I just used a small sized corpus.', 'Since I removed stop words, result was pleasant.', 'As my previous code piece, we start again by adding modules to use their methods.', 'In this example, we uti

 # Calculate Frequency Matrix for words Per Sentence(or document)

In [37]:
#this function should iterate through each sentence and save the frequency of each word in the sentence

def create_frequency_matrix(sentences) -> dict:
    # set stopwords and stemmer
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    
    #initialize freqency matrix
    frequency_matrix = {}
    keys = []
    value_table = []
    
    # get first 15 chars per sentence as key for frequency matrix
    keys = list(map(lambda x: x[:15], sentences))
    #print(keys)
    
    #create list of dicts for word frequency for each sentence
    for sentence in sentences:
        freq_table = {}
        # tokenize words: SPLITTING
        tokenized = word_tokenize(sentence)
        # clean words: REDUCING
        for word in tokenized:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        value_table.append(freq_table)
        #print(freq_table)
    return dict(zip(keys, value_table))

# open file
with open("testarticle.txt", "r", encoding="utf-8") as file:
    text = file.read()

# tokenize article text into sentence and save as a list.
# MAP REDUCE SPLITTING PHASE
text = sent_tokenize(text)

frequency_matrix = create_frequency_matrix(text)
print(frequency_matrix)
#print(mapping)






['As I have menti', 'Bag of Words is', 'TF-IDF is a tec', 'TF (Term Freque', 'IDF (Inverse Do', 'Stop words whic', 'Thus, the TF-ID', 'In my example, ', 'Since I removed', 'As my previous ', 'In this example', 'Scikit-learn is', 'We will utilize', 'TfidfTransforme', 'Data is fetched', 'Regular express', 'We use 4 parame', 'First one is st', '‘None’ can be g', 'In Scikit-learn', 'min_df paramete', 'max_df is the c', 'If the document', 'ngram_range(x,y', 'x is for the mi', 'fit_transform r', 'We transform a ', 'As I mentioned ', 'At the end of t', 'Finally, we can', 'TF-IDF is a num', 'I wanted share ', 'You can find fu', 'In our next art', 'To say me “hi” ']
{'As I have menti': {'mention': 1, 'previou': 1, 'post': 1, ',': 1, 'go': 1, 'implement': 1, 'tf-idf': 1, 'text': 1, 'biographi': 1, 'beatl': 1, '.': 1}, 'Bag of Words is': {'bag': 1, 'word': 1, 'effect': 1, 'model': 1, 'demonstr': 1, 'document': 1, 'numer': 1, 'vector': 1, ',': 1, 'enough': 1, 'go': 1, 'enumer': 1, '.': 1}, 'TF-IDF