# Description:

Imagine reading and realize in the end this article is not what you are looking for. Feels bad. Don't fret, we got you covered. Use our "TLDR Bot" to help you figure out if a article is what you needed for.

Utilize TF-IDF to rank sentences based on importance and extract "useful" text.

# Import packages 

In [78]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import math
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xiaokunmo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xiaokunmo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get text from file


In [79]:
with open("testarticle2.txt", "r", encoding="utf-8") as file:
    text = file.read()

text = sent_tokenize(text)
print(text)
total_documents = len(text)

['Those Who Are Resilient Stay In The Game Longer\n“On the mountains of truth you can never climb in vain: either you will reach a point higher up today, or you will be training your powers so that you will be able to climb higher tomorrow.”\u200a—\u200aFriedrich Nietzsche\nChallenges and setbacks are not meant to defeat you, but promote you.', 'However, I realise after many years of defeats, it can crush your spirit and it is easier to give up than risk further setbacks and disappointments.', 'Have you experienced this before?', 'To be honest, I don’t have the answers.', 'I can’t tell you what the right course of action is; only you will know.', 'However, it’s important not to be discouraged by failure when pursuing a goal or a dream, since failure itself means different things to different people.', 'To a person with a Fixed Mindset failure is a blow to their self-esteem, yet to a person with a Growth Mindset, it’s an opportunity to improve and find new ways to overcome their obstacl

## Calculate Frequency Matrix for words Per Sentence(or document)

In [80]:
def create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

freq_matrix = create_frequency_matrix(text)
print(freq_matrix)

{'Those Who Are R': {'resili': 1, 'stay': 1, 'game': 1, 'longer': 1, '“': 1, 'mountain': 1, 'truth': 1, 'never': 1, 'climb': 2, 'vain': 1, ':': 1, 'either': 1, 'reach': 1, 'point': 1, 'higher': 2, 'today': 1, ',': 2, 'train': 1, 'power': 1, 'abl': 1, 'tomorrow.': 1, '”': 1, '—': 1, 'friedrich': 1, 'nietzsch': 1, 'challeng': 1, 'setback': 1, 'meant': 1, 'defeat': 1, 'promot': 1, '.': 1}, 'However, I real': {'howev': 1, ',': 2, 'realis': 1, 'mani': 1, 'year': 1, 'defeat': 1, 'crush': 1, 'spirit': 1, 'easier': 1, 'give': 1, 'risk': 1, 'setback': 1, 'disappoint': 1, '.': 1}, 'Have you experi': {'experienc': 1, 'thi': 1, 'befor': 1, '?': 1}, 'To be honest, I': {'honest': 1, ',': 1, '’': 1, 'answer': 1, '.': 1}, 'I can’t tell yo': {'’': 1, 'tell': 1, 'right': 1, 'cours': 1, 'action': 1, ';': 1, 'onli': 1, 'know': 1, '.': 1}, 'However, it’s i': {'howev': 1, ',': 2, '’': 1, 'import': 1, 'discourag': 1, 'failur': 2, 'pursu': 1, 'goal': 1, 'dream': 1, 'sinc': 1, 'mean': 1, 'differ': 2, 'thing': 

# Create TF Matrix

In [81]:
def create_tf_matrix(freq_matrix):
    tf_matrix = {}
    
    for sentence, freq_table in freq_matrix.items():
        tf_table = {}
        
#         diff_words_in_sentence = len(freq_table)
        diff_words_in_sentence = sum([freq_table[t] for t in freq_table])
        
        for word, count in freq_table.items():
            tf_table[word] = count / diff_words_in_sentence
            
        tf_matrix[sentence] = tf_table
        
    return tf_matrix

tf_matrix = create_tf_matrix(freq_matrix)
print(tf_matrix)

{'Those Who Are R': {'resili': 0.029411764705882353, 'stay': 0.029411764705882353, 'game': 0.029411764705882353, 'longer': 0.029411764705882353, '“': 0.029411764705882353, 'mountain': 0.029411764705882353, 'truth': 0.029411764705882353, 'never': 0.029411764705882353, 'climb': 0.058823529411764705, 'vain': 0.029411764705882353, ':': 0.029411764705882353, 'either': 0.029411764705882353, 'reach': 0.029411764705882353, 'point': 0.029411764705882353, 'higher': 0.058823529411764705, 'today': 0.029411764705882353, ',': 0.058823529411764705, 'train': 0.029411764705882353, 'power': 0.029411764705882353, 'abl': 0.029411764705882353, 'tomorrow.': 0.029411764705882353, '”': 0.029411764705882353, '—': 0.029411764705882353, 'friedrich': 0.029411764705882353, 'nietzsch': 0.029411764705882353, 'challeng': 0.029411764705882353, 'setback': 0.029411764705882353, 'meant': 0.029411764705882353, 'defeat': 0.029411764705882353, 'promot': 0.029411764705882353, '.': 0.029411764705882353}, 'However, I real': {'

# Creating a table for documents per words 
### For each word, count how many times it appears in the documents

In [82]:
def create_documents_per_words(freq_matrix):
    doc_per_word_table = {}
    
    for _, word_freq in freq_matrix.items():
        for word, freq in word_freq.items():
            if word in doc_per_word_table:
                doc_per_word_table[word] += 1
            else:
                doc_per_word_table[word] = 1
    
    return doc_per_word_table

doc_per_word = create_documents_per_words(freq_matrix)
print(doc_per_word)

{'resili': 2, 'stay': 2, 'game': 3, 'longer': 2, '“': 5, 'mountain': 1, 'truth': 1, 'never': 2, 'climb': 1, 'vain': 1, ':': 8, 'either': 1, 'reach': 1, 'point': 2, 'higher': 1, 'today': 1, ',': 22, 'train': 1, 'power': 4, 'abl': 1, 'tomorrow.': 1, '”': 5, '—': 3, 'friedrich': 1, 'nietzsch': 1, 'challeng': 2, 'setback': 2, 'meant': 1, 'defeat': 3, 'promot': 1, '.': 45, 'howev': 2, 'realis': 2, 'mani': 3, 'year': 4, 'crush': 1, 'spirit': 1, 'easier': 1, 'give': 4, 'risk': 1, 'disappoint': 2, 'experienc': 1, 'thi': 4, 'befor': 2, '?': 6, 'honest': 1, '’': 16, 'answer': 2, 'tell': 2, 'right': 4, 'cours': 1, 'action': 3, ';': 1, 'onli': 3, 'know': 5, 'import': 1, 'discourag': 1, 'failur': 4, 'pursu': 3, 'goal': 4, 'dream': 6, 'sinc': 1, 'mean': 4, 'differ': 3, 'thing': 2, 'peopl': 3, 'person': 4, 'fix': 1, 'mindset': 2, 'blow': 1, 'self-esteem': 1, 'yet': 2, 'growth': 1, 'opportun': 1, 'improv': 1, 'find': 2, 'new': 1, 'way': 2, 'overcom': 2, 'obstacl': 1, 'respons': 1, 'wrong': 1, 'neither

# Create IDF matrix

In [83]:
def create_idf_matrix(freq_matrix, doc_per_word_, total_documents):
    idf_matrix = {}
    
    for sentence, freq_table in freq_matrix.items():
        idf_table = {}
        
        for word in freq_table.keys():
            idf_table[word] = math.log10(total_documents/ float(doc_per_word[word]))
            
        idf_matrix[sentence] = idf_table
        
    return idf_matrix

idf_matrix = create_idf_matrix(freq_matrix, doc_per_word, total_documents)
print(idf_matrix)

{'Those Who Are R': {'resili': 1.414973347970818, 'stay': 1.414973347970818, 'game': 1.2388820889151366, 'longer': 1.414973347970818, '“': 1.0170333392987803, 'mountain': 1.7160033436347992, 'truth': 1.7160033436347992, 'never': 1.414973347970818, 'climb': 1.7160033436347992, 'vain': 1.7160033436347992, ':': 0.8129133566428556, 'either': 1.7160033436347992, 'reach': 1.7160033436347992, 'point': 1.414973347970818, 'higher': 1.7160033436347992, 'today': 1.7160033436347992, ',': 0.37358066281259295, 'train': 1.7160033436347992, 'power': 1.1139433523068367, 'abl': 1.7160033436347992, 'tomorrow.': 1.7160033436347992, '”': 1.0170333392987803, '—': 1.2388820889151366, 'friedrich': 1.7160033436347992, 'nietzsch': 1.7160033436347992, 'challeng': 1.414973347970818, 'setback': 1.414973347970818, 'meant': 1.7160033436347992, 'defeat': 1.2388820889151366, 'promot': 1.7160033436347992, '.': 0.06279082985945544}, 'However, I real': {'howev': 1.414973347970818, ',': 0.37358066281259295, 'realis': 1.41

# Calculate TF-IDF and generate a matrix

In [84]:
def create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}
    
    for (sentence1, freq_table1), (sentence2, freq_table2) in zip(tf_matrix.items(), idf_matrix.items()):
        
        tf_idf_table = {}
        
        for (word1, value1), (word2, value2) in zip(freq_table1.items(), freq_table2.items()):
            tf_idf_table[word1] = float(value1 * value2)
            
        tf_idf_matrix[sentence1] = tf_idf_table
        
    return tf_idf_matrix

tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'Those Who Are R': {'resili': 0.04161686317561229, 'stay': 0.04161686317561229, 'game': 0.036437708497504016, 'longer': 0.04161686317561229, '“': 0.029912745273493538, 'mountain': 0.050470686577494095, 'truth': 0.050470686577494095, 'never': 0.04161686317561229, 'climb': 0.10094137315498819, 'vain': 0.050470686577494095, ':': 0.023909216371848695, 'either': 0.050470686577494095, 'reach': 0.050470686577494095, 'point': 0.04161686317561229, 'higher': 0.10094137315498819, 'today': 0.050470686577494095, ',': 0.021975333106623113, 'train': 0.050470686577494095, 'power': 0.03276303977373049, 'abl': 0.050470686577494095, 'tomorrow.': 0.050470686577494095, '”': 0.029912745273493538, '—': 0.036437708497504016, 'friedrich': 0.050470686577494095, 'nietzsch': 0.050470686577494095, 'challeng': 0.04161686317561229, 'setback': 0.04161686317561229, 'meant': 0.050470686577494095, 'defeat': 0.036437708497504016, 'promot': 0.050470686577494095, '.': 0.0018467891135133955}, 'However, I real': {'howev': 0

# Score each sentence

In [85]:
def score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}
    
    for sentence, score_table in tf_idf_matrix.items():
        total_score = 0
        
        diff_words_in_sentence = len(score_table)
        # total_num_wrods_in_sentence = sum([freq_table[t] for t in freq_table])
        
        
        for word, score in score_table.items():
            total_score += score
            
        sentenceValue[sentence] = total_score / diff_words_in_sentence
        
    return sentenceValue;

sentenceScores = score_sentences(tf_idf_matrix)
print(sentenceScores)

{'Those Who Are R': 0.04512750672425487, 'However, I real': 0.08590242763976694, 'Have you experi': 0.3239232585727256, 'To be honest, I': 0.16316926181026162, 'I can’t tell yo': 0.12383203821623005, 'However, it’s i': 0.07277387444980964, 'To a person wit': 0.07111516510658022, 'Same failure, y': 0.16444926737498997, 'Who is right an': 0.41864430991031015, 'Neither.': 0.4446985433735637, 'Each person has': 0.17094978895160579, 'Those who are r': 0.1420681030893123, 'I’ve coached ma': 0.10393055918383491, 'It was at that ': 0.2233540987973747, 'Perhaps all tho': 0.2149921544733752, 'It was the 19th': 0.03848143604355683, 'Consider the ad': 0.056951467901284525, 'Even more than ': 0.04201892102073854, 'Some of you rea': 0.13147637984349306, 'For others, at ': 0.13562338250422484, 'What I wish to ': 0.1371552634760773, 'If you settle f': 0.13757544687068993, '“Two people on ': 0.029089753809016194, 'Don’t leave it ': 0.1076945290795475, 'It must come fr': 0.28804630433974315, 'Gnaw away 

# Calculating threshold value used for choosing important sentences

In [86]:
def find_average_score(sentenceScores) -> int:
    sumScores = sum([sentenceScores[entry] for entry in sentenceScores])
    average = (sumScores / len(sentenceScores))
    
    return average

threshold = find_average_score(sentenceScores)        
print(threshold)

0.15200159981023859


# Generate the summary

In [87]:
def generate_summary(sentences, sentenceScores, threshold):
    sentence_count = 0
    summary = ''
    
    for sentence in sentences:
        if sentence[:15] in sentenceScores and sentenceScores[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1
            
    return summary

summary = generate_summary(text, sentenceScores, threshold)
print(summary)

 Have you experienced this before? To be honest, I don’t have the answers. Same failure, yet different responses. Who is right and who is wrong? Neither. Each person has a different mindset that decides their outcome. It was at that point their biggest breakthrough came. Perhaps all those years of perseverance finally paid off. It must come from within you. Gnaw away at your problems until you solve them or find a solution. Where are you settling in your life right now? Could you be you playing for bigger stakes than you are? Success is a fickle and long game with highs and lows. So become intentional on what you want out of life. Commit to it. Nurture your dreams. Don’t leave your dreams to chance.


# Test with dataset

In [93]:
# Get text from file
with open("testarticle.txt", "r", encoding="utf-8") as file:
    text = file.read()

text = sent_tokenize(text)
print("Text:")
print(text[:20], "\n")
total_documents = len(text)

# Calculate Frequency Matrix
freq_matrix = create_frequency_matrix(text)
# print("Frequency Matrix:")
# print(list(freq_matrix)[:5], "\n")

# Create Term Frequency Matrix
tf_matrix = create_tf_matrix(freq_matrix)
# print("Term Frequency Matrix:")
# print(list(tf_matrix)[:5], "\n")

# Create Document counts per word Matrix
doc_per_word = create_documents_per_words(freq_matrix)
# print("Number of documents each word appear in:")
# print(list(doc_per_word)[:5], "\n")

# Create Inverse Document Frequency Matrix
idf_matrix = create_idf_matrix(freq_matrix, doc_per_word, total_documents)
# print("IDF Matrix:")
# print(list(idf_matrix)[:5], "\n")

# Create TF-IDF matrix
tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
# print("TF-IDF Matrix:")
# print(list(tf_idf_matrix)[:5], "\n")

# Score each sentence
sentenceScores = score_sentences(tf_idf_matrix)
# print("Scores of each sentence:")
# print(list(sentenceScores)[:5], "\n")

# Calculate the threshold to select important sentences for summary
threshold = find_average_score(sentenceScores)
# print("threshold:")
# print(threshold, "\n")

# Generate the summary
summary = generate_summary(text, sentenceScores, threshold)
print("Summary:")
print(summary)

Text:
['For those who had academic writing, summarization — the task of producing a concise and fluent summary while preserving key information content and overall meaning — was if not a nightmare, then a constant challenge close to guesswork to detect what the professor would find important.', 'Though the basic idea looks simple: find the gist, cut off all opinions and detail, and write a couple of perfect sentences, the task inevitably ended up in toil and turmoil.', 'On the other hand, in real life we are perfect summarizers: we can describe the whole War and Peace in one word, be it “masterpiece” or “rubbish”.', 'We can read tons of news about state-of-the-art technologies and sum them up in “Musk sent Tesla to the Moon”.', 'We would expect that the computer could be even better.', 'Where humans are imperfect, artificial intelligence depraved of emotions and opinions of its own would do the job.', 'The story began in the 1950s.', 'An important research of these days introduced a me