# Read in the data

In [None]:
import os
import sys
import re

In [None]:
def read_data(file):
    """
    Read a text file and return its entire contents as one string.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors='ignore'``) instead of raising.

    :param file: path of the file to read
    :return: the complete contents of the file
    :rtype: str
    """
    # The context manager closes the handle even when read() raises.
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

def folder_list(path):
    """
    Read every file directly inside a folder and return their contents.

    Entries are visited in sorted (lexicographic) name order so that the
    index of each document is the same on every run; ``os.listdir`` itself
    returns names in arbitrary order. Sub-directories are skipped because
    they cannot be read as text files.

    :param path: path of the local folder holding the text files
    :return: one string per file, in sorted file-name order
    :rtype: list
    """
    text = []
    for infile in sorted(os.listdir(path)):
        file = os.path.join(path, infile)
        if not os.path.isfile(file):
            continue
        text.append(read_data(file))
    return text

In [None]:
# Load every file in the folder into a list of strings (one entry per file).
data = folder_list('clustered_articles/')
# Alternatives kept for experimentation: a different folder, or a single file
# (note that read_data returns one string rather than a list of strings).
#data = folder_list('test/')
#data = read_data('clustered_articles/cluster_9.txt')

In [None]:
# Display the second loaded document.
data[1]

# TF-IDF

In [None]:
## run once ##
#!pip install nltk

In [None]:
## run once, install everything ##
#import nltk
#import ssl
#
#try:
#    _create_unverified_https_context = ssl._create_unverified_context
#except AttributeError:
#    pass
#else:
#    ssl._create_default_https_context = _create_unverified_https_context
#
#nltk.download()

In [None]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [None]:
# sourced from - https://towardsdatascience.com/text-summarization-using-tf-idf-e64a0644ace3
def _create_frequency_table(text_string) -> dict:
    """
    Build a word-frequency table for a whole text.

    The text is tokenized into words, English stop words are discarded, and
    every remaining token is reduced to its stem (root form) with the Porter
    stemmer before being counted.

    :param text_string: the text to analyse
    :return: mapping of stemmed token -> number of occurrences
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    freqTable = dict()
    for word in word_tokenize(text_string):
        word = word.lower()
        # Filter on the lower-cased surface form, *before* stemming: the stop
        # word list holds unstemmed words, and stemming can turn a stop word
        # into a string that is not in the list (e.g. "this" -> "thi").
        if word in stopWords:
            continue
        stem = ps.stem(word)
        freqTable[stem] = freqTable.get(stem, 0) + 1

    return freqTable


def _create_frequency_matrix(sentences):
    """
    Build one word-frequency table per sentence.

    Each sentence is tokenized, English stop words are discarded, and the
    remaining tokens are stemmed with the Porter stemmer and counted.

    :param sentences: iterable of sentence strings
    :return: mapping of sentence key -> {stemmed token: count}, where the
             sentence key is the first 15 characters of the sentence
    :rtype: dict
    """
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for word in word_tokenize(sent):
            word = word.lower()
            # Filter on the lower-cased surface form, *before* stemming: the
            # stop word list holds unstemmed words, and stemming can turn a
            # stop word into a string that is not in the list
            # (e.g. "this" -> "thi").
            if word in stopWords:
                continue
            stem = ps.stem(word)
            freq_table[stem] = freq_table.get(stem, 0) + 1

        # NOTE: the key is only the first 15 characters, so sentences that
        # share that prefix overwrite one another. _generate_summary looks
        # sentences up by the same prefix, so the two must stay in sync.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents/float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(),idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [None]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence/count_words_in_sentence

    return sentenceValue


def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues/len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]]>=(threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

# s scales the threshold: higher values mean that only sentences with higher
# TF-IDF scores are chosen (stricter).
def run_summarization(text, s):
    """
    Produce an extractive summary of ``text`` by TF-IDF sentence scoring.

    Each sentence is treated as a document: sentences are scored by the
    average TF-IDF value of their words, and those scoring at least ``s``
    times the average sentence score are kept.

    :param text: plain text of a long article
    :param s: scale applied to the average sentence score to obtain the
              cut-off; larger values keep fewer sentences
    :return: the selected sentences, each preceded by a single space, or ''
             when ``text`` contains no sentences
    """
    # 1 Sentence Tokenize
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to score; averaging zero sentence scores is undefined.
        return ''
    total_documents = len(sentences)

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)

    # 3 Calculate TermFrequency and generate a matrix.
    # Term frequency (TF) is how often a word appears in a document, divided
    # by how many words there are in the document.
    tf_matrix = _create_tf_matrix(freq_matrix)

    # 4 Create the table of documents per word
    count_doc_per_words = _create_documents_per_words(freq_matrix)

    # 5 Calculate IDF and generate a matrix.
    # Inverse document frequency (IDF) is how unique or rare a word is.
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 9 Important Algorithm: Generate the summary
    return _generate_summary(sentences, sentence_scores, s * threshold)

In [None]:
# Print the TF-IDF summary of every loaded document, using a threshold of
# twice the average sentence score, each preceded by its index.
for cluster_idx, article in enumerate(data):
    result = run_summarization(article, 2)
    print(str(cluster_idx) + ':')
    # One newline ends the summary, the second leaves a blank separator line.
    print(result, end='\n\n')

In [None]:
# Summarize only the first loaded document, keeping sentences that score at
# least twice the average sentence score.
result = run_summarization(data[0], 2)
print(result)

# Transformers

In [None]:
## run once ##
#!pip install tensorflow torch  # make sure either tensorflow or torch is installed
#!pip install transformers

In [None]:
from transformers import pipeline

In [9]:
# Build the HuggingFace summarization pipeline (BART model).
# note: the model is developed by facebook (fine-tuned using CNN - a news summarization dataset)
# NOTE(review): max_length is passed both here (2**31) and on every call below
# (300) - confirm which value the pipeline actually applies.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", max_length=2**31, truncation=True)


# Summarize every loaded document and print the raw pipeline output,
# each preceded by its index.
for cluster_idx, article in enumerate(data):
    result = summarizer(article, min_length=75, max_length=300)
    print(str(cluster_idx) + ':')
    # One newline ends the output, the second leaves a blank separator line.
    print(result, end='\n\n')


0:
[{'summary_text': 'Qatar Airways flight to Doha marks a breakthrough in the bumpy coordination between the U.S. and Afghanistans new Taliban rulers. A dayslong standoff over charter planes at another airport has left dozens of passengers stranded. It was not immediately clear how many Americans were on board Thursday and how many were still in Afghanistan. The Taliban have repeatedly said foreigners and Afghans with proper travel documents could leave.'}]

1:
[{'summary_text': 'John Hinckley Jr. was 25 when he shot and wounded the 40th U.S. president outside a Washington hotel. He was found not guilty by reason of insanity in 1981. A federal judge says he can be freed from all remaining restrictions next year if he continues to follow those rules and remains mentally stable. The Ronald Reagan Presidential Foundation says it is "saddened" by the court\'s plan.'}]

2:
[{'summary_text': 'A new Discovery+ series, "Curse of the Chippendales," exposes the intimate details of owner Steve B

21:
[{'summary_text': '"Aulani, Lei Hiwahiwa" written and performed by two of our very own Aulani cast members: Angela Morales and Eric Lee. Rocio Cintron, Product Design Manager for Disney Preschool, shared how she infuses her background into the work she does, which allows her to learn and celebrate other cultures as well. In describing her favorite Disney treat at the Parks, Rocio said her favorite ride at Disneyland is Mad Tea Party because the teacups are sweet and colorful.'}]

22:
[{'summary_text': "A new ABC News/Ipsos poll finds Americans' confidence in Joe Biden is eroding. The president's overall approval rating has worsened since late August. Dissatisfaction among Republicans and independents is fueling the decline. A bill to rebuild the nation's infrastructure is the most pressing priority for Biden this week, as it will be brought up for a vote on Thursday."}]

23:
[{'summary_text': 'John Hinckley Jr. could be released from all court supervision by June 2022, if he contin

Your max_length is set to 300, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


36:
[{'summary_text': 'President Joe Biden meets with House Democrats, White House officials say. His move comes as Democrats are at an impasse over passing his agenda. House Speaker Nancy Pelosi has twice had to delay a vote on a $1.2 trillion bipartisan infrastructure plan. progressive Democrats are vowing to defeat it unless they also get aVote on $3.5 trillion social safety net and climate policy measure.'}]

37:
[{'summary_text': 'Dr. Anthony Fauci defended the White House\'s plan to provide COVID-19 vaccine booster shots before the Food and Drug Administration voted to only provide those shots to Americans 65 and older and immunocompromised. "The goal of this particular decision was to prevent people from getting serious disease who are at risk, such as the elderly and those that have underlying conditions," he said.'}]

38:
[{'summary_text': 'A federal judge approved a plan Monday to unconditionally release John Hinckley Jr. from all remaining court-ordered restrictions, if he c

# Transformer Translation (Extra)

In [None]:
# Some translation testing with a transformer pipeline (English -> German).
translator = pipeline("translation_en_to_de")
# `result` holds the output of the last summarizer call in the loop above: a
# list containing one dict whose values are the summary text.
# NOTE(review): this line previously read from `summarized`, a name that is
# never assigned anywhere in this notebook - confirm `result` is the intended
# input.
print(translator("".join(result[0].values()), max_length=300))