### pyTextRank

pyTextRank GitHub link - https://github.com/ceteri/pytextrank

Other sources - https://rare-technologies.com/text-summarization-in-python-extractive-vs-abstractive-techniques-revisited/

This notebook runs pyTextRank on every article in the dataset folder and writes one extractive summary file per article.

Version 2 (final version) runs only on the 30% testing dataset.

In [1]:
# Logging Module
import datetime
import logging
    
logger = logging.getLogger()
    
def setup_file_logger(log_file):
    hdlr = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr) 
    logger.setLevel(logging.INFO)
    
def log(message):
    #outputs to Jupyter console
    #print('{} {}'.format(datetime.datetime.now(), message))
    #outputs to file
    logger.info(message)
    
# Attach the file handler so that messages passed to log() are written to this file.
setup_file_logger('failed_pyTextRank.log')

In [2]:
#Settings
dataset_folder_path = "/home/s10166858/kawaijoe/Dataset/cnn-dailymail-master/extractive_dataset"
file_prefix = ""
file_suffix = ".json"

In [3]:
def run_textrank(file_path, new_filepath, extractive_path):
    
    ### STAGE 1 ###
    # Perform statistical parsing/tagging on a document in JSON format
    # Part-of-Speech Tagging and lemmatization is performed for every sentence in the document
    import pytextrank
    import sys

    path_stage0 = extractive_path + '/' + file_path
    path_stage1 = "o1.json"

    # Parse one document to prep for TextRank (parse_graf)
    # For each sentence outputs -  ParsedGraf(id, sha1, graf)
    # graf - word in text - E.g. [17, "bounds", "bound", "NNS", 1, 34]
    # stored to o1.json
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # to view output in this notebook
            # print(pytextrank.pretty_print(graf))
            
    
    ### STAGE 2 ###
    # Key phrases are extracted along with their counts, and are normalized
    # Collect and normalize the key phrases from a parsed document (RankedLexeme)
    # E.g. ["minimal generating sets", 0.035356918184280925, [19, 23, 5], "np", 1]
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"

    graph, ranks = pytextrank.text_rank(path_stage1)
    # pytextrank.render_ranks(graph, ranks)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            # print(pytextrank.pretty_print(rl))

    # Print a graph
    import networkx as nx
    import pylab as plt

    #nx.draw(graph, with_labels=True) 
    #plt.show()

    ### STAGE 3 ###
    # Calculate a significance weight/score for each sentence 
    # using MinHash to approximate a Jaccard distance from key phrases determined by TextRank
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"
    path_stage3 = "o3.json"

    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            # print(pytextrank.pretty_print(s._asdict()))

    ### STAGE 4 ###
    # Summarizes the document based on most significant sentences and key phrases.
    path_stage2 = "o2.json"
    path_stage3 = "o3.json"

    phrases = ", ".join(set([p for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, sentence_limit=4), key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))
    
    graf_text = "\n".join(s)
    #return graf_text, phrases
    #log("**excerpts:** %s\n**keywords:** %s\n\n" % (graf_text, phrases,))
    #log("**excerpts:** %s\n\n" % (graf_text))
    
    # To create summary files
    with open(new_filepath, "a") as f:
        f.write(graf_text.encode('utf-8'))
        
    os.remove("o1.json")
    os.remove("o2.json")
    os.remove("o3.json")

In [None]:
import os
import sys

system_dir = "/home/s10166858/kawaijoe/Extractive Methods/pyTextRank for CNN & DM/All-NLP/system"

file_count = 1
itr = 1

# Removes '.json' from '1.json', '100.json', etc. and sort the numbers
for file in sorted(os.listdir(dataset_folder_path), key=lambda name: int(name[0:-5])):
    try:
        new_filename = "article" + str(file_count) + "_system1.txt"
        new_filepath = os.path.join(system_dir, new_filename)

        if file.endswith(file_suffix):
            run_textrank(file, new_filepath, dataset_folder_path)

            # Count to check number of files ran
            file_count = file_count + 1
    except:
        file_count = file_count + 1
        log(file)
        log("Unexpected error:" + str(sys.exc_info()[0]))
        log("\n")
        continue

print("Running completed!")