# Final Code for Abstractive-Based Summarization

## Code for Abstractive Type

In [1]:
## Importing required packages...
import logging
from utils import deprecated
from jr_abst.summarization.summarizer import summarize
from jr_abst.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from jr_abst.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from jr_abst.summarization.commons import build_graph as _build_graph
from jr_abst.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from jr_abst.summarization.bm25 import iter_bm25_bow as _bm25_weights
from math import log10 as _log10
from six.moves import range
from nltk.corpus import wordnet,stopwords

In [3]:
def relation_list(nouns):
    """Collect WordNet-derived related words for every noun in *nouns*.

    For each noun, every noun synset contributes:
      * the name of each of its lemmas,
      * the name of the first antonym of a lemma, when the lemma has any,
      * for each hypernym that itself has hypernyms, the word part (text
        before the first '.') of that hypernym's first hypernym.

    Args:
        nouns: sequence of noun strings.

    Returns:
        A ``defaultdict(list)`` keyed by noun. Each occurrence of a noun in
        *nouns* appends one list of related words to that noun's entry, so
        ``result[noun][0]`` is the list built for its first occurrence.
    """
    # Imported locally: nothing in the visible file-level imports brings
    # ``defaultdict`` into scope, so the original raised NameError here.
    from collections import defaultdict

    # Deliberately not named ``relation_list`` so the function is not shadowed.
    relations_by_noun = defaultdict(list)
    for noun in nouns:
        relation = []
        for syns in wordnet.synsets(noun, pos=wordnet.NOUN):
            for lema in syns.lemmas():
                relation.append(lema.name())
                antonyms = lema.antonyms()
                if antonyms:
                    relation.append(antonyms[0].name())
            for hypnyms in syns.hypernyms():
                parents = hypnyms.hypernyms()
                if parents:
                    # Synset names look like 'word.pos.nn'; keep the word only.
                    relation.append(parents[0].name().split('.')[0])
        relations_by_noun[noun].append(relation)
    return relations_by_noun
def LC(nouns, relation_list):
    """Group nouns into lexical chains.

    Chains are dictionaries mapping a noun to its occurrence count. Each noun
    is placed in the first existing chain that either already contains it
    (its count is incremented) or contains a key that is listed in
    ``relation_list[noun][0]`` and whose first noun synset has a Wu-Palmer
    similarity of at least 0.5 with the noun's first noun synset (the noun is
    added with count 1). If no chain qualifies, a new chain is started.

    Args:
        nouns: iterable of noun strings, processed in order.
        relation_list: mapping noun -> list whose first element is the list
            of words related to that noun (as produced by ``relation_list``).

    Returns:
        List of chains (dicts of noun -> count) in creation order.
    """
    threshold = 0.5
    lexical = []
    for noun in nouns:
        for chain in lexical:
            # Check membership first: the original could meet a related key
            # before the noun's own key and reset an existing count to 1.
            if noun in chain:
                chain[noun] += 1
                break
            if _lc_is_related(noun, chain, relation_list[noun][0], threshold):
                chain[noun] = 1
                break
        else:
            # No existing chain accepted the noun.
            lexical.append({noun: 1})
    return lexical


def _lc_is_related(noun, chain, related_words, threshold):
    """Return True if any key of *chain* is related and similar enough to *noun*."""
    for key in chain:
        if key not in related_words:
            continue
        key_synsets = wordnet.synsets(key, pos=wordnet.NOUN)
        noun_synsets = wordnet.synsets(noun, pos=wordnet.NOUN)
        # Guard against words WordNet does not know as nouns.
        if not key_synsets or not noun_synsets:
            continue
        similarity = key_synsets[0].wup_similarity(noun_synsets[0])
        # NOTE(review): NLTK documents that wup_similarity may return None
        # when no path connects the synsets; comparing None raises TypeError.
        if similarity is not None and similarity >= threshold:
            return True
    return False

In [37]:
##Summarizer...
# Inputs with fewer sentences/documents than this only trigger a logged
# warning in summarize() and summarize_corpus(); processing still continues.
INPUT_MIN_LENGTH = 10
# Weights below this value are skipped when _set_graph_edge_weights() adds
# edges to the graph.
WEIGHT_THRESHOLD = 1.e-1
# Logger shared by the summarizer functions below.
logger = logging.getLogger(__name__)

def _set_graph_edge_weights(graph):
    """Add weighted edges between the graph's nodes, in place.

    The weights come from ``_bm25_weights`` applied to the node list. An edge
    (node i -> node j) is added when i != j, the weight is not below
    ``WEIGHT_THRESHOLD`` and the edge does not exist yet. If afterwards every
    edge weight equals zero (including the case of no edges at all), the graph
    is replaced by a fully connected one via ``_create_valid_graph``.
    """
    nodes = graph.nodes()
    for row, scored in enumerate(_bm25_weights(nodes)):
        # Periodic progress report; never emitted for the very first document.
        if row > 0 and row % 1000 == 0:
            logger.info('PROGRESS: processing %s/%s doc (%s non zero elements)', row, len(nodes), len(scored))
        for col, score in scored:
            if row != col and not (score < WEIGHT_THRESHOLD):
                pair = (nodes[row], nodes[col])
                if not graph.has_edge(pair):
                    graph.add_edge(pair, score)

    # With only zero similarities the ranking carries no information, so fall
    # back to a uniform graph; the summary is then effectively arbitrary.
    has_nonzero_edge = any(graph.edge_weight(pair) != 0 for pair in graph.iter_edges())
    if not has_nonzero_edge:
        _create_valid_graph(graph)


def _create_valid_graph(graph):
    nodes = graph.nodes()
    for i in range(len(nodes)):
        for j in range(len(nodes)):
            if i == j:
                continue
            edge = (nodes[i], nodes[j])
            if graph.has_edge(edge):
                graph.del_edge(edge)
            graph.add_edge(edge, 1)


def _get_doc_length(doc):
    return sum(item[1] for item in doc)


def _get_similarity(doc1, doc2, vec1, vec2):
    """Return the dot product of the two vectors divided by a length term.

    The length term is ``log10(len1) + log10(len2)``, where the lengths come
    from ``_get_doc_length``. The result is 0 when either length is not
    positive or when the length term itself is 0.
    """
    # assumes vec1/vec2 are sparse row vectors exposing dot/transpose/toarray
    # — TODO confirm against the caller.
    overlap = vec1.dot(vec2.transpose()).toarray()[0][0]
    first_length = _get_doc_length(doc1)
    second_length = _get_doc_length(doc2)

    if first_length > 0 and second_length > 0:
        scale = _log10(first_length) + _log10(second_length)
    else:
        scale = 0

    if scale != 0:
        return overlap / scale
    return 0


def _build_corpus(sentences):
    """Turn sentences into bag-of-words documents.

    Each sentence's ``token`` string is split on whitespace, a ``Dictionary``
    is built over all token lists, and every token list is converted with
    ``doc2bow``. Returns the list of converted documents, in sentence order.
    """
    # NOTE(review): ``Dictionary`` is not imported in the visible part of this
    # file — confirm it is brought into scope elsewhere.
    token_lists = []
    for sentence in sentences:
        token_lists.append(sentence.token.split())
    vocabulary = Dictionary(token_lists)
    return [vocabulary.doc2bow(tokens) for tokens in token_lists]


def _get_important_sentences(sentences, corpus, important_docs):
    """Map each important document back to its sentence.

    Documents of *corpus* are made hashable and paired positionally with
    *sentences*; when two documents are equal, the later sentence wins.
    Returns the sentences for *important_docs*, in the order given.
    """
    sentence_for_doc = {}
    for doc_key, sentence in zip(_build_hasheable_corpus(corpus), sentences):
        sentence_for_doc[doc_key] = sentence

    selected = []
    for doc in important_docs:
        selected.append(sentence_for_doc[tuple(doc)])
    return selected


def _get_sentences_with_word_count(sentences, word_count):
    length = 0
    selected_sentences = []

    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if the inclusion of the sentence gives a better approximation to the word parameter.
        if abs(word_count - length - words_in_sentence) > abs(word_count - length):
            return selected_sentences
        selected_sentences.append(sentence)
        length += words_in_sentence
        print(selected_sentences)
    return selected_sentences


def _extract_important_sentences(sentences, corpus, important_docs, word_count):
    """Return the sentences for *important_docs*, optionally limited by word count.

    With ``word_count`` of None all important sentences are returned;
    otherwise they are passed through ``_get_sentences_with_word_count``.
    """
    ranked_sentences = _get_important_sentences(sentences, corpus, important_docs)
    if word_count is None:
        return ranked_sentences
    return _get_sentences_with_word_count(ranked_sentences, word_count)


def _format_results(extracted_sentences, split):
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join(sentence.text for sentence in extracted_sentences)


def _build_hasheable_corpus(corpus):
    return [tuple(doc) for doc in corpus]


def summarize_corpus(corpus, ratio=0.2):
    """Rank the documents of *corpus* and return the top share of them.

    A graph is built over the (hashable) documents, edge weights are filled
    in, unreachable nodes are removed and the remaining nodes are scored with
    ``_pagerank``. Documents are ordered by descending score (missing scores
    count as 0).

    Args:
        corpus: sized collection of documents.
        ratio: fraction of ``len(corpus)`` to return (truncated to an int).

    Returns:
        List of documents (each converted back to a list). Empty when the
        corpus is empty or fewer than 3 nodes remain reachable.
    """
    hashable_corpus = _build_hasheable_corpus(corpus)
    doc_count = len(corpus)

    # Nothing to rank.
    if doc_count == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Short inputs are still processed, but flagged.
    if doc_count < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)

    logger.info('Building graph')
    graph = _build_graph(hashable_corpus)

    logger.info('Filling graph')
    _set_graph_edge_weights(graph)

    logger.info('Removing unreachable nodes of graph')
    _remove_unreachable_nodes(graph)

    # Too few connected sentences to rank; ask the user for more text.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    logger.info('Pagerank graph')
    pagerank_scores = _pagerank(graph)

    def _score(document):
        # Documents dropped from the graph have no score and rank last.
        return pagerank_scores.get(document, 0)

    logger.info('Sorting pagerank scores')
    hashable_corpus.sort(key=_score, reverse=True)

    keep = int(doc_count * ratio)
    return [list(document) for document in hashable_corpus[:keep]]


def summarize(text, ratio=0.2, word_count=None, split=False):
    """Summarize *text* by selecting its highest-ranked sentences.

    Args:
        text: input text; it is split into sentences by
            ``_clean_text_by_sentences``.
        ratio: share of sentences to keep; only used when *word_count* is None.
        word_count: when given, all sentences are ranked and the selection is
            limited by ``_get_sentences_with_word_count`` instead of *ratio*.
        split: return a list of sentence texts instead of one
            newline-joined string.

    Returns:
        The selected sentences in their original order, as a list or a
        string depending on *split*; an empty list/string when the text is
        empty or no relevant sentences could be determined.

    Raises:
        ValueError: if the text contains exactly one sentence.
    """
    sentences = _clean_text_by_sentences(text)
    sentence_count = len(sentences)

    if sentence_count == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    if sentence_count == 1:
        raise ValueError("Input must have more than one sentence")

    if sentence_count < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    # With an explicit word budget, rank everything and trim afterwards.
    effective_ratio = 1 if word_count is not None else ratio
    most_important_docs = summarize_corpus(corpus, ratio=effective_ratio)

    # Ranking produced nothing usable.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    chosen = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Restore the order in which the sentences appear in the original text.
    in_text_order = sorted(chosen, key=lambda sentence: sentence.index)

    return _format_results(in_text_order, split)

In [41]:
##Keeping all the text files in a list for process...
import glob

# Every .txt path matched by the recursive pattern, relative to the current
# working directory, in the order glob yields them.
data = list(glob.iglob('**/*.txt', recursive=True))

# Echo each discovered path on its own line.
for list_o_file in data:
    print(list_o_file)

# Number of files that will be summarized.
data_len = len(data)

input.txt


In [42]:
# Report how many .txt files were collected (data_len is len(data)).
print("\nTotal count of text files available are:",data_len)


Total count of text files available are: 1


In [40]:
##Main class...
import os
import time
from os.path import abspath,join,dirname
from inspect import getsourcefile
start1 = time.time()

if __name__=="__main__":

    # Summaries are written here, one file per input, named after the input's
    # base name.
    output_dir = 'Output for Abstractive'
    # Create the folder up front so the first write cannot fail because it is
    # missing.
    os.makedirs(output_dir, exist_ok=True)

    ##I/P file
    for everyfile in data:
        start = time.time()
        ip=everyfile
        # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order
        # mark, which otherwise becomes part of the first word.
        # The `with` block closes the file; no explicit close() is needed.
        with open(ip,"r",encoding="utf-8-sig") as op:
            Input_text=op.read()
        print("\n*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*")
        print("The input File is:\n")
        print(Input_text)
        print("\n*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*")
        Summarized_data=summarize(Input_text)
        print("\nThe Summarized data for the above txt is:\n",Summarized_data)
        print("\n*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*")
        # Write with an explicit encoding: the platform default may be unable
        # to encode the text, and the evaluation cell reads this file back as
        # UTF-8. The original `n=0; if n<=len(data)` guard was always true and
        # has been dropped.
        output_path = os.path.join(output_dir, os.path.basename(everyfile))
        with open(output_path, 'w', encoding="utf-8") as file:
            # join() yields the same bytes as the original piecewise writes,
            # for a single string as well as for a list of strings.
            file.write("".join(Summarized_data))
        end = time.time()
        print(f"\n Runtime of the above Summary is : {end - start} seconds")

# Total Time Taken...
end1 = time.time()
print(f"Total Runtime of the program is : {end1 - start1} seconds")
        


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
The input File is:

﻿We observe today not a victory of party but a celebration of freedom--symbolizing an end as well as a beginning--signifying renewal as well as change. For I have sworn before you and Almighty God the same solemn oath our for bears prescribed nearly a century and three-quarters ago. The world is very different now. For man holds in his mortal hands the power to abolish all forms of human poverty and all forms of human life. And yet the same revolutionary beliefs for which our forebears fought are still at issue around the globe--the belief that the rights of man come not from the generosity of the state but from the hand of God. We dare not forget today that we are the heirs of that first revolution. Let the word go forth from this time and place, to friend and foe alike, that the torch has been passed to a new generation of Americans--born 

## Rouge Test

In [24]:
## Evaluations of Text Summaries...
import os
import time
from os.path import abspath,join,dirname
from inspect import getsourcefile

# Folder used to locate the text files.
# NOTE(review): getsourcefile(lambda:0) is meant to resolve to this source
# file; in an interactive session it may resolve differently — confirm.
base_dir = dirname(abspath(getsourcefile(lambda:0)))

# Original text (used as the reference in the ROUGE cell).
ip=join(base_dir,"input.txt")#r"News Articles\business\001.txt")
# The `with` block closes the file; no explicit close() is needed.
with open(ip,"r",encoding="utf-8") as op:
    Input_text=op.read()
print("The Input text file:\n",Input_text)
# Count whitespace-separated words; len() of the string counted characters.
print("\nThe count of words:\n",len(Input_text.split()))
print("________________")

# Generated summary (used as the hypothesis in the ROUGE cell). The path is
# joined from its components so it does not depend on the Windows separator.
opt=join(base_dir,"Output for Abstractive","input.txt")
with open(opt,"r",encoding="utf-8") as opx:
    Output_text=opx.read()
print("The Output text file:\n",Output_text)
print("\nThe count of words:\n",len(Output_text.split()))

The Input text file:
 ﻿We observe today not a victory of party but a celebration of freedom--symbolizing an end as well as a beginning--signifying renewal as well as change. For I have sworn before you and Almighty God the same solemn oath our for bears prescribed nearly a century and three-quarters ago. The world is very different now. For man holds in his mortal hands the power to abolish all forms of human poverty and all forms of human life. And yet the same revolutionary beliefs for which our forebears fought are still at issue around the globe--the belief that the rights of man come not from the generosity of the state but from the hand of God. We dare not forget today that we are the heirs of that first revolution. Let the word go forth from this time and place, to friend and foe alike, that the torch has been passed to a new generation of Americans--born in this century, tempered by war, disciplined by a hard and bitter peace, proud of our ancient heritage--and unwilling to wit

In [25]:
#Applying Rouge..
import rouge
def prepare_results(p, r, f, metric_name=None):
    """Format precision, recall and F1 as one tab-separated report line.

    Args:
        p: precision as a fraction (multiplied by 100 for display).
        r: recall as a fraction.
        f: F1 score as a fraction.
        metric_name: label printed at the start of the line. When omitted,
            the module-level variable ``metric`` is used, which keeps
            existing three-argument calls inside the evaluation loop working.

    Returns:
        A string such as ``'\\trouge-1:\\tP: 65.49\\tR: 58.09\\tF1: 61.57'``.

    Raises:
        NameError: if *metric_name* is omitted and no global ``metric`` exists.
    """
    if metric_name is None:
        # Legacy behaviour: rely on the loop variable of the calling cell.
        metric_name = metric
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric_name, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


# Run the ROUGE evaluation once per aggregation mode; only 'Best' is enabled.
for aggregator in ['Best']:#['Avg', 'Best', 'Individual']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = (aggregator == 'Avg')
    apply_best = (aggregator == 'Best')

    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l', 'rouge-w'],
        max_n=3,
        limit_length=True,
        length_limit=500,
        length_limit_type='words',
        apply_avg=apply_avg,
        apply_best=apply_best,
        alpha=0.5, # Default F1_score
        weight_factor=1.2,
        stemming=True,
    )

    # The generated summary is scored against the full input text.
    hypothesis_1 = Output_text
    references_1 = [Input_text]
    scores = evaluator.get_scores(hypothesis_1, references_1)

    # `metric` must stay the loop variable: prepare_results reads it.
    for metric, results in sorted(scores.items(), key=lambda entry: entry[0]):
        if apply_avg or apply_best:
            # Aggregated modes give a single p/r/f triple per metric.
            print(prepare_results(results['p'], results['r'], results['f']))
            continue

        # Individual mode: one entry per hypothesis, each holding per-reference lists.
        for hypothesis_id, results_per_ref in enumerate(results):
            nb_references = len(results_per_ref['p'])
            for reference_id in range(nb_references):
                print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                print(prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
        print()
    print()
    

Evaluation with Best
	rouge-1:	P: 65.49	R: 58.09	F1: 61.57
	rouge-2:	P: 38.55	R: 34.18	F1: 36.23
	rouge-3:	P: 34.22	R: 30.33	F1: 32.16
	rouge-l:	P: 53.75	R: 48.63	F1: 51.06
	rouge-w:	P: 35.67	R:  9.08	F1: 14.48



In [10]:
##Backup

In [14]:
# ##Main class
# import os
# from os.path import abspath,join,dirname
# from inspect import getsourcefile

# if __name__=="__main__":
    
#     ##I/P file
#     ip=join(dirname(abspath(getsourcefile(lambda:0))),"input.txt")
#     with open(ip,"r",encoding="utf-8") as op:
#         Input_text=op.read()
#         op.close()
#     print("The input data is:\n",Input_text)
#     print("\n*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*")
#     Summarized_data=summarize(Input_text)
#     print("\nThe Summarized data for the above txt is:\n",Summarized_data)
#     #print(" ",summarize(Input_text))
#     file = open('Output for Abstractive\summaryoutput.txt', 'w')
#     for s in Summarized_data:
#         file.write(s) 
#     file.close()

In [13]:
# import rouge


# def prepare_results(p, r, f):
#     return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


# for aggregator in ['Avg', 'Best', 'Individual']:
#     print('Evaluation with {}'.format(aggregator))
#     apply_avg = aggregator == 'Avg'
#     apply_best = aggregator == 'Best'

#     evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
#                            max_n=4,
#                            limit_length=True,
#                            length_limit=100,
#                            length_limit_type='words',
#                            apply_avg=apply_avg,
#                            apply_best=apply_best,
#                            alpha=0.5, # Default F1_score
#                            weight_factor=1.2,
#                            stemming=True)


#     hypothesis_1 = "King Norodom Sihanouk has declined requests to chair a summit of Cambodia 's top political leaders , saying the meeting would not bring any progress in deadlocked negotiations to form a government .\nGovernment and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen 's party to form a new government failed .\nHun Sen 's ruling party narrowly won a majority in elections in July , but the opposition _ claiming widespread intimidation and fraud _ has denied Hun Sen the two-thirds vote in parliament required to approve the next government .\n"
#     references_1 = ["Prospects were dim for resolution of the political crisis in Cambodia in October 1998.\nPrime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad.\nKing Sihanouk declined to chair talks in either place.\nA U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.\nBut in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.\nLeft out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians.",
#                     "Cambodian prime minister Hun Sen rejects demands of 2 opposition parties for talks in Beijing after failing to win a 2/3 majority in recent elections.\nSihanouk refuses to host talks in Beijing.\nOpposition parties ask the Asian Development Bank to stop loans to Hun Sen's government.\nCCP defends Hun Sen to the US Senate.\nFUNCINPEC refuses to share the presidency.\nHun Sen and Ranariddh eventually form a coalition at summit convened by Sihanouk.\nHun Sen remains prime minister, Ranariddh is president of the national assembly, and a new senate will be formed.\nOpposition leader Rainsy left out.\nHe seeks strong assurance of safety should he return to Cambodia.\n",
#                     ]

#     hypothesis_2 = "China 's government said Thursday that two prominent dissidents arrested this week are suspected of endangering national security _ the clearest sign yet Chinese leaders plan to quash a would-be opposition party .\nOne leader of a suppressed new political party will be tried on Dec. 17 on a charge of colluding with foreign enemies of China '' to incite the subversion of state power , '' according to court documents given to his wife on Monday .\nWith attorneys locked up , harassed or plain scared , two prominent dissidents will defend themselves against charges of subversion Thursday in China 's highest-profile dissident trials in two years .\n"
#     references_2 = "Hurricane Mitch, category 5 hurricane, brought widespread death and destruction to Central American.\nEspecially hard hit was Honduras where an estimated 6,076 people lost their lives.\nThe hurricane, which lingered off the coast of Honduras for 3 days before moving off, flooded large areas, destroying crops and property.\nThe U.S. and European Union were joined by Pope John Paul II in a call for money and workers to help the stricken area.\nPresident Clinton sent Tipper Gore, wife of Vice President Gore to the area to deliver much needed supplies to the area, demonstrating U.S. commitment to the recovery of the region.\n"

#     all_hypothesis = [hypothesis_1, hypothesis_2]
#     all_references = [references_1, references_2]

#     scores = evaluator.get_scores(all_hypothesis, all_references)

#     for metric, results in sorted(scores.items(), key=lambda x: x[0]):
#         if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
#             for hypothesis_id, results_per_ref in enumerate(results):
#                 nb_references = len(results_per_ref['p'])
#                 for reference_id in range(nb_references):
#                     print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
#                     print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
#             print()
#         else:
#             print(prepare_results(results['p'], results['r'], results['f']))
#     print()