# Read in the data

In [1]:
import os
import sys
import re

In [2]:
def read_data(file):
    '''
    Read each text file into a string
    '''
    f = open(file, 'r', encoding='utf-8', errors='ignore')
    text = f.read()
    f.close()
    return text

def folder_list(path):
    '''
    Reads each text file in a folder and concatenates each file into a bigger string
    Parameter 'path' is the path of your local folder
    '''
    filelist = os.listdir(path)
    text = []
    for infile in filelist:
        file = os.path.join(path, infile)
        text_data = read_data(file)
        text += [text_data]
    return text

In [3]:
data = folder_list('clustered_articles/')
#data = read_data('clustered_articles/cluster_9.txt')

In [13]:
data[5]

'"It has brought us to the breaking point," one hospital official said. In January, Alaska had the highest per capita coronavirus vaccination rate in the nation. Now, hospitals are overwhelmed with COVID-19 patients, and the states largest hospital is rationing care. Vaccine hesitancy and the delta variant have pushed the states fragile and limited hospital system to the breaking point. Providence Alaska Medical Center, the states largest hospital, released a letter to the public Tuesday saying that more than 30% of its patients have COVID-19 and the hospital is rationing treatment. "While we are doing our utmost, we are no longer able to provide the standard of care to each and every patient who needs our help," wrote Chief of Staff Kristen Solana Walkinshaw on behalf of the hospitals Medical Executive Committee. "The acuity and number of patients now exceeds our resources and our ability to staff beds with skilled caregivers, like nurses and respiratory therapists." Of Alaska\'s 120 

# TF-IDF

In [64]:
## run once ##
#!pip install nltk

In [4]:
## run once, install everything ##
#import nltk
#import ssl
#
#try:
#    _create_unverified_https_context = ssl._create_unverified_context
#except AttributeError:
#    pass
#else:
#    ssl._create_default_https_context = _create_unverified_https_context
#
#nltk.download()

In [5]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

In [6]:
# sourced from - https://towardsdatascience.com/text-summarization-using-tf-idf-e64a0644ace3
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.
    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents/float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(),idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [7]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence/count_words_in_sentence

    return sentenceValue


def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues/len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]]>=(threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

# s is a scale for the threshold -> higher values means only higher rated TF-IDF 
# sentences get chosen (more strict).
def run_summarization(text, s):
    """
    :param text: Plain summary_text of long article
    :return: summarized summary_text
    """

    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''
    # 1 Sentence Tokenize
    sentences = sent_tokenize(text)
    total_documents = len(sentences)
    #print(sentences)

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)
    #print(freq_matrix)

    '''
    Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
    '''
    # 3 Calculate TermFrequency and generate a matrix
    tf_matrix = _create_tf_matrix(freq_matrix)
    #print(tf_matrix)

    # 4 creating table for documents per words
    count_doc_per_words = _create_documents_per_words(freq_matrix)
    #print(count_doc_per_words)

    '''
    Inverse document frequency (IDF) is how unique or rare a word is.
    '''
    # 5 Calculate IDF and generate a matrix
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
    #print(idf_matrix)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    #print(tf_idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)
    #print(sentence_scores)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)
    #print(threshold)

    # 9 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, s*threshold)
    return summary

In [8]:
for cluster_idx in range(len(data)):
    result = run_summarization(data[cluster_idx], 2)
    print(str(cluster_idx) + ':')
    print(result)
    print()

0:
 Flights are restricted to daytime hours. Some of the other defendants wore paramilitary attire. "Why create an appellate issue? Just one floor above her. He carried her shoes the rest of the way down. They helped each other out. And we knew. It all came flooding back. ET on ABC.

1:
 Hinckley has been barred from having a gun. She died in July. "And well have actual hard data. Hinckley was suffering from acute psychosis. Hes also shared his music on YouTube. She denied Blumenthal's claims. That included a demonstration of voting equipment. Former New Jersey Gov. "It is happening across the country. Its not something that just popped up." A third lead attorney echoed both Sowards and Bormann. The final team deferred for now. The public portion of the proceedings will resume on Monday. "S.B. That's what the statute is designed to do. Pitman said. "We do contest the validity of service to date. Mr. Boies is aware of this. There was no legal explanation for the decision. There are more

5:
 Gov. That leaves very limited options. Bronsons office released a statement after the meeting. That never happened. It was a move that shocked and angered parents. Iowas Republican Gov. Four firefighters were injured while responding to the scene. He did not elaborate on the initial investigation into those reports. They were both in incubators because they were born underweight. Neither woman was identified. But everything I know is American. Some families have already experienced slow-rolling family separations under the policy. Nursing homes were limiting admissions. "Weve been planning all along. Carrington said. ------ Hill reported from Albany. A similar condition can occur in adults. Submit them at: FactCheck@AP.org. Read more here: What is the mu variant of the coronavirus? What can employers do if workers avoid COVID-19 vaccines? Do I need a booster if I got the Johnson & Johnson vaccine?

6:
 Anybody could have been hurt." No other injuries were reported. Another man had 

11:
 No, but there might be delays. Do we know for sure what services will stop? Yes, and no. How many workers will be affected? "They have rent to pay. Will a shutdown affect the economy? "But it is always is a possibility.

12:
 My mom when she picked me up... she was just so frantic... Thats how life got to be without him.

13:
 They helped each other out. And we knew. It all came flooding back. ET on ABC.

14:
 Are you a teacher? Do you inspire imagination in your classroom? NO PURCHASE NECESSARY. Void where prohibited. 1. 2. The best news? 4. NO PURCHASE NECESSARY. Void where prohibited. 5. 6. The figure is available now at Target. The collection is available now at us.ciatelondon.com. The set is available at BoxLunch.com. NO PURCHASE NECESSARY. ET. Void where prohibited. NO PURCHASE NECESSARY. ET. Void where prohibited. NO PURCHASE NECESSARY. ET. Void where prohibited. "Thats not my job. Maybe easier. It's no bull. And cows do pee a lot. 2. The AP is solely responsible for all co

21:
 Keep reading for some inspiration you wont want to miss! Rocio said. What is Sidneys favorite attraction at Disneyland? Sidney said. What was the goal of the book? What is your favorite section of the book? That might surprise a few readers. TIM: I love all of it! I also love our 15-pages of endnotes. All have one thing in common. For more information, visit www.DisneyBooks.com. Funding will be raised through private donations. Im no exception." The family didn't. Congratulations Lily and Tony! Its an honor to be considered among the best of the best. Its like being with family. For more information, visit DisneySprings.com/Cirque. Carla noted. She is a fox with an inquisitive mind and the intelligence that supports it. The possibilities are endless!" Ask yourself, 'what drives you? The rest will come. Talk about a full circle!

22:
 That is down from 3,805 a year earlier and 3,785 in 2018. It peaked at 31,255 in 1967. This birth is a true milestone for this endangered species. An

32:
 Other experts have stressed that size isn't everything. Sony Salzman is the unit's coordinating producer. They certainly are. Maybe easier. It's no bull. Some kids take quite a bit longer. And cows do pee a lot. "The critical question is can it and will it scale?" There are a couple caveats to this experiment. 2, they didnt do No. 2. The AP is solely responsible for all content. Submit them at: FactCheck@AP.org. We have children. Whos going to raise these kids?" They work by interfering with the viruss ability to replicate in human cells. They would be another tool to fight covid. Atea and Pfizer have not released similar estimates. Even more promising? Kaiser Health News is a nonprofit news service covering health issues. We owe them a commitment to follow the science.

33:
 Hinckley has been barred from having a gun. She died in July. Hinckley was suffering from acute psychosis. Hes also shared his music on YouTube. "Everybody is frustrated. Two. We have the responsibility and t

44:
 The victim refused medical attention. The investigation remains ongoing. He stood by his confirmation of the plan. That never happened. It was a move that shocked and angered parents. Iowas Republican Gov. Paid time off is key. These acts of proactive leadership are essential to reduce risks for all. Benjy Renton is an analyst at Ariadne Labs. "I could not do anything but blink. I could not talk, did not raise my head up off the pillow. "COVID doesnt care. But some criticized Republican Gov. U.S. "This is not an unusual phenomenon. he asked. But it really does depend on us and what we do together." "S.B.

45:
 They would like all of the apartments to be affordable.

46:

47:

48:
 Opposition could form again. "They get to tell their stories. They need to have a platform to be heard on. About 2,000 have been moved so far. U.S. he asked.



# Sklearn tf-idf

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform([data])

vocab = vectorizer.get_feature_names()



In [16]:
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus).toarray()

In [27]:
def get_average(values) :
    greater_than_zero_count = total = 0
    for value in values :
        if value != 0 :
            greater_than_zero_count += 1
            total += value 
    return total / greater_than_zero_count

def get_threshold(tfidf_results) :
    i = total = 0
    while i < (tfidf_results.shape[0]) :
        total += get_average(tfidf_results[i, :].toarray()[0])
        i += 1
    return total / tfidf_results.shape[0]

def get_summary(documents, tfidf_results) :
    summary = ""
    i = 0
    while i < (tfidf_results.shape[0]) :
        if (get_average(tfidf_results[i, :].toarray()[0])) >= get_threshold(tfidf_results) * HANDICAP :
                summary += ' ' + documents[i]
        i += 1
    return summary

In [28]:
print(get_summary([data],tfidf))

TypeError: 'numpy.float64' object is not iterable

# Transformers

In [8]:
## run once ##
#!pip install tensorflow torch  # make sure either tensorflow or torch is installed
#!pip install transformers

In [10]:
from transformers import pipeline

In [11]:
# Initialize the HuggingFace summarization pipeline
summarizer = pipeline("summarization", max_length=2**31, truncation=True) # summarization uses BART model

for cluster_idx in range(len(data)):
    result = summarizer(data[cluster_idx], min_length=75, max_length=300)
    print(str(cluster_idx) + ':')
    print(result)
    print()


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


0:
[{'summary_text': ' Qatar Airways flight to Doha marks a breakthrough in the bumpy coordination between the U.S. and Afghanistans new Taliban rulers . Official: Two senior Taliban officials helped facilitate the departure -- the new foreign minister and deputy prime minister . It was not immediately clear how many Americans were on board Thursday and how many were still in Afghanistan . The Taliban have repeatedly said foreigners and Afghans with proper travel documents could leave .'}]

1:
[{'summary_text': ' Judge says John Hinckley Jr. can be freed from all remaining restrictions next year if he continues to follow those rules and remains mentally stable . Judge says he has displayed no symptoms of active mental illness, no violent behavior and no interest in weapons since 1983 . U.S. government had previously opposed ending restrictions, but took a different position Monday, with attorneys saying they would agree to unconditional release if he follows the rules and shows mental 

18:
[{'summary_text': ' Joffreys coffee is the official coffee of Disneyland Resort and Walt Disney World Resort . The first cup of coffee was poured at Disneys Typhoon Lagoon Water Park back in 1995 . For a limited time, guests can enjoy a blend specifically crafted in honor of the 50th anniversary celebration . Remys Ratatouille Adventure will open on Oct. 1, just in time for The Worlds Most Magical Celebration .'}]

19:
[{'summary_text': ' A Florida man has planted a banana tree in a pothole near his business to warn motorists away . Bryan Raymond planted the tree last week along a private road near his Fort Myers, Florida, business . "If we have to maintain it and make sure nobody gets hurt, we are going to put something obvious there," he says . Michael Sussmann\'s attorneys maintain his innocence .'}]

20:
[{'summary_text': ' Jesse Benton, 43, of Louisville, Kentucky, was accused in an indictment unsealed Monday in federal court in Washington . Trump pardoned Benton in December a

Your max_length is set to 300, but you input_length is only 242. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


36:
[{'summary_text': ' Biden: "Im telling you, were gonna get this done. It doesnt matter when. Were gonna get it done" Biden meets with House Democrats on Capitol Hill to make the case for his legislative agenda . House Speaker Nancy Pelosi has twice had to delay a vote on a $1.2 trillion infrastructure plan Biden supports because progressive Democrats are vowing to defeat it unless they get a social safety net and climate policy measure he also supports .'}]

37:
[{'summary_text': ' The goal of this particular decision was to prevent people from getting serious disease who are at risk, such as the elderly and those that have underlying conditions," he said . Fauci said that people need to understand that such decisions depend on science and approvals by the appropriate health agencies . The FDA booster decision shows the process worked, he said, and he was not disappointed by the panel\'s decision .'}]

38:
[{'summary_text': ' A federal judge approved a plan to unconditionally relea

# Transformer Translation (Extra)

In [41]:
# some translation testing with transformer
translator = pipeline("translation_en_to_de")
print(translator("".join(list(summarized[0].values())), max_length=300))

No model was supplied, defaulted to t5-base (https://huggingface.co/t5-base)


[{'translation_text': 'Die muslimischen Amerikaner, die unter dem Schatten des 11. September aufwuchsen, waren mit Feindseligkeit und Überwachung konfrontiert. Viele waren mit Verdacht, Fragen über ihren Glauben und Zweifeln über ihre Amerikanerschaft konfrontiert.'}]
