# Final Code for Extractive Based Summary

## Code for Extractive Type

In [59]:
## Importing required packages...
import nltk
import string
import os
import networkx as nx
import matplotlib.pyplot as plt
from nltk.wsd import lesk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize,RegexpTokenizer,sent_tokenize
from nltk.corpus import wordnet,stopwords
from string import punctuation
from inspect import getsourcefile
from collections import defaultdict
from os.path import abspath,join,dirname
from heapq import nlargest
from gensim.summarization import summarize
import time
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#from nltk.corpus import wordnet as wn
stop_words = set(stopwords.words('english'))

In [60]:
## List of Relations of each nouns...
def relation_list(nouns):
    relation_list= defaultdict(list)
    for x in range(len(nouns)):
        relation=[]
        for syns in wordnet.synsets(nouns[x],pos=wordnet.NOUN):
            for lema in syns.lemmas():
                relation.append(lema.name())
                if lema.antonyms():
                    relation.append(lema.antonyms()[0].name())
            for hypnyms in syns.hypernyms():
                if hypnyms.hypernyms():
                    relation.append(hypnyms.hypernyms()[0].name().split('.')[0])
        relation_list[nouns[x]].append(relation)
    #for each in relation_list:
    print("\n Relation List")
    print("\n ",relation_list)
    return relation_list

In [61]:
## Computing LC to Nouns and their relations
##Also applying threshold of similarity
def LC(nouns, relation_list):
    lexical=[]
    threshold=0.5
    for noun in nouns:
        flag=0
        for lex in range(len(lexical)):
            if flag==0:
                for keys in list(lexical[lex]):
                    if keys == noun and flag==0:
                        lexical[lex][noun]+=1
                        flag=1
                    elif keys in relation_list[noun][0] and flag==0:
                        syn1=wordnet.synsets(keys,pos=wordnet.NOUN)
                        syn2=wordnet.synsets(noun,pos=wordnet.NOUN)
                        if syn1[0].wup_similarity(syn2[0]) >=threshold:
                            lexical[lex][noun]=1
                            flag=1
        if flag==0:
            dic_nuevo={}
            dic_nuevo[noun]=1
            lexical.append(dic_nuevo)
            flag=1
    print("\nLexical ",lexical)
    return lexical

In [62]:
## Pruning the LC, by deleting the chains which are weak...
def Prune(lexical):
    chain_final=[]
    while lexical:
        reslt=lexical.pop()
        if len(reslt.keys())==1:
            for values in reslt.values():
                if values!=1:
                    chain_final.append(reslt)
        else:
            chain_final.append(reslt)
    print("\n Final Chain")
    print("\n ",chain_final)
    return chain_final

In [63]:
## Summary class...
class Summary:
    def __init__(self,thld_min=.1,thld_max=0.9):
        self.thld_min=thld_min
        self.thld_max=thld_max
        self._stopwords=set(stopwords.words('english')+list(punctuation));
    ##Calculate freq of every word in the document, along with the LC.   
    def frequency(self,words,LC):
        freq=defaultdict(int)
        for wrd in words:
            for w in wrd:
                if w not in self._stopwords:
                    flag=0
                    for i in LC:
                        if w in list(i.keys()):
                            freq[w]=sum(list(i.values()))
                            flag=1
                            break
                    if flag==0:
                        freq[w]+=1
        mx=float(max(freq.values()))
        for w in list(freq.keys()):
            freq[w]=freq[w]/mx
            if freq[w]>=self.thld_max or freq[w]<=self.thld_min:
                del freq[w]
       # print("freq ",freq)
        return freq
    
    ##Final Summary using heap of important sentences.
    def summarize(self,statements,LC,size):
        assert size<= len(statements)
        wrd_statements=[word_tokenize(s.lower()) for s in statements]
        self.freq=self.frequency(wrd_statements,LC)
        Rank=defaultdict(int)
        for i,stmt in enumerate(wrd_statements):
            for words in stmt:
                if words in self.freq:
                    Rank[i]+=self.freq[words]
                    index=self.rank(Rank,size)
        final_index=sorted(index)
        #z=(statements[j] for j in final_index)
        #print("z ",z)
        return[statements[j] for j in final_index]
        
    ##Creating Heap of best sentences and returning it...
    def rank(self,Rank,size):
        return nlargest(size,Rank,key=Rank.get)

In [32]:
##Main class
start = time.time()

if __name__=="__main__":
    
    ##I/P file
    ip=join(dirname(abspath(getsourcefile(lambda:0))),"input.txt")
    with open(ip,"r",encoding="utf-8") as op:
        Input_text=op.read()
        op.close()
    print(Input_text)
        
    ##Provides the Nouns, Adverbs, Adjectives for the whole document.
    posit=['NN','NNS','NNP','NNPS','RB','JJ']
    sent=nltk.sent_tokenize(Input_text)
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=[tokenizer.tokenize(w) for w in sent]
    tagged=[pos_tag(tok) for tok in tokens]
    nouns=[word.lower() for i in range(len(tagged)) for word, pos in tagged[i] if pos in posit]
    relation=relation_list(nouns)
    lexical=LC(nouns, relation)
    chain_final=Prune(lexical)
    
    ##Printing the LC...
    print("\n Printing the LC")
    for i in range(len(chain_final)):
        x="Chain"+str(i+1)+":"+str(chain_final[i])
        print("\n ",x)

end = time.time()

# total time taken
print(f"Runtime of the program is : {end - start} seconds")

﻿We observe today not a victory of party but a celebration of freedom--symbolizing an end as well as a beginning--signifying renewal as well as change. For I have sworn before you and Almighty God the same solemn oath our for bears prescribed nearly a century and three-quarters ago. The world is very different now. For man holds in his mortal hands the power to abolish all forms of human poverty and all forms of human life. And yet the same revolutionary beliefs for which our forebears fought are still at issue around the globe--the belief that the rights of man come not from the generosity of the state but from the hand of God. We dare not forget today that we are the heirs of that first revolution. Let the word go forth from this time and place, to friend and foe alike, that the torch has been passed to a new generation of Americans--born in this century, tempered by war, disciplined by a hard and bitter peace, proud of our ancient heritage--and unwilling to witness or permit the slo


Lexical  [{'today': 3}, {'not': 19}, {'victory': 1}, {'party': 1}, {'celebration': 1}, {'freedom': 4, 'liberty': 1}, {'end': 1}, {'as': 2}, {'well': 3}, {'beginning': 1}, {'renewal': 1}, {'change': 1, 'undoing': 1}, {'almighty': 1}, {'god': 3}, {'same': 3}, {'solemn': 1}, {'bears': 1}, {'nearly': 1}, {'century': 2}, {'quarters': 1}, {'ago': 1}, {'world': 8, 'man': 1, 'human': 1, 'globe': 1, 'mankind': 1, 'earth': 1}, {'very': 1}, {'different': 1}, {'now': 2}, {'mortal': 1}, {'power': 5, 'powers': 1}, {'forms': 2}, {'poverty': 3}, {'life': 3}, {'yet': 2}, {'revolutionary': 1}, {'beliefs': 1}, {'forebears': 1}, {'still': 1}, {'issue': 1}, {'belief': 1}, {'rights': 2}, {'generosity': 1}, {'state': 1, 'place': 1, 'home': 1, 'states': 1, 'places': 1}, {'hand': 2}, {'heirs': 1}, {'first': 4}, {'revolution': 2}, {'word': 2}, {'forth': 2}, {'time': 2}, {'alike': 1}, {'torch': 1}, {'new': 7}, {'generation': 3, 'generations': 1}, {'americans': 4}, {'war': 4}, {'hard': 1}, {'bitter': 1}, {'peace

In [33]:
##Size of Statements required, So based on the size summary is given out...
if len(sent)>=5:
    size=5
else:
    size=5
F_S=Summary()
print(F_S.summarize(sent,chain_final,size))

file = open('Output for Extractive\summaryoutput.txt', 'w')
for s in F_S.summarize(sent,chain_final,size):
    file.write(s) 
file.close()

['Let the word go forth from this time and place, to friend and foe alike, that the torch has been passed to a new generation of Americans--born in this century, tempered by war, disciplined by a hard and bitter peace, proud of our ancient heritage--and unwilling to witness or permit the slow undoing of those human rights to which this nation has always been committed, and to which we are committed today at home and around the world.', 'Let every nation know, whether it wishes us well or ill, that we shall pay any price, bear any burden, meet any hardship, support any friend, oppose any foe to assure the survival and the success of liberty.', 'And if a beachhead of cooperation may push back the jungle of suspicion, let both sides join in creating a new endeavor, not a new balance of power, but a new world of law, where the strong are just and the weak secure and the peace preserved.', 'Now the trumpet summons us again--not as a call to bear arms, though arms we need--not as a call to b

In [8]:
#Rouge score Test for the Summarization
from rouge import FilesRouge
files_rouge = FilesRouge()
scores = files_rouge.get_scores(r'C:\Users\user\Desktop\Mtech Projects\Summarization Project Code\TRAILS\Output for Extractive\summaryoutput.txt',r'C:\Users\user\Desktop\Mtech Projects\Summarization Project Code\TRAILS\input.txt')
print("\n ",scores)


  [{'rouge-1': {'f': 0.33793969568408155, 'p': 1.0, 'r': 0.20332577475434618}, 'rouge-2': {'f': 0.33333333053046954, 'p': 0.9888059701492538, 'r': 0.2004538577912254}, 'rouge-l': {'f': 0.4441524275566592, 'p': 1.0, 'r': 0.28547297297297297}}]


In [9]:
#Bleu Score Test for the Summarization
from bleu import file_bleu
hyp_file=r'C:\Users\user\Desktop\Mtech Projects\Summarization Project Code\TRAILS\Output for Extractive\summaryoutput.txt'
ref_files=r'C:\Users\user\Desktop\Mtech Projects\Summarization Project Code\TRAILS\input.txt'
file_bleu(ref_files, hyp_file)

-1

In [88]:
data=[]
import glob
for list_o_file in glob.iglob('**/*.txt', recursive=True):
    print(list_o_file)
    data.append(list_o_file)
print(data)
data_len=len(data)
print(data_len)

001.txt
input.txt
['001.txt', 'input.txt']
2


In [82]:
"""list_o_file = glob.iglob('**/*.txt', recursive=True)
for file_name in list_o_file:
    print(file_name)
    
    f= open(file_name, 'r')
    lst = []
    for line in f:
       line.strip()
       line = line.replace("\n" ,'')
       line = line.replace("//" , '')
       lst.append(line)
    f.close()
    f=open(os.path.join('UpdatedTopics',os.path.basename(file_name)) , 'w')
    for line in lst:
       f.write(line)
    f.close()"""

001.txt
input.txt
UpdatedTopics\001.txt
UpdatedTopics\input.txt


In [97]:
##Main class
start1 = time.time()

if __name__=="__main__":
        
    ##I/P file
    for everyfile in data:
        start = time.time()
        ip=everyfile#join(dirname(abspath(getsourcefile(lambda:0))),"*.txt")
        with open(ip,"r",encoding="utf-8") as op:
            Input_text=op.read()
            op.close()
        print("The input File is:\n")
        print(Input_text)

        ##Provides the Nouns, Adverbs, Adjectives for the whole document.
        posit=['NN','NNS','NNP','NNPS','RB','JJ']
        sent=nltk.sent_tokenize(Input_text)
        tokenizer=RegexpTokenizer(r'\w+')
        tokens=[tokenizer.tokenize(w) for w in sent]
        tagged=[pos_tag(tok) for tok in tokens]
        nouns=[word.lower() for i in range(len(tagged)) for word, pos in tagged[i] if pos in posit]
        relation=relation_list(nouns)
        lexical=LC(nouns, relation)
        chain_final=Prune(lexical)

        ##Printing the LC...
        print("\n Printing the LC")
        for i in range(len(chain_final)):
            x="Chain"+str(i+1)+":"+str(chain_final[i])
            print("\n ",x)
            
            
        if len(sent)>=5:
            size=5
        else:
            size=5
        F_S=Summary()
        print("\n The Summarized data is:\n")
        print(F_S.summarize(sent,chain_final,size))
        print("_________________________________________________________________________________________________________________________")
        n=0
        if n<=len(file_name):
            file=open(os.path.join('Output for Extractive', os.path.basename(everyfile)) , 'w')
            #file = open('Output for Extractive\summaryoutput %d.txt',n, 'w')
            for s in F_S.summarize(sent,chain_final,size):
                file.write(s) 
            file.close()
        end = time.time()
        print(f"\n Runtime of the above Summary is : {end - start} seconds")
        print("___________________________________")
# total time taken
end1 = time.time()
print(f"Total Runtime of the program is : {end1 - start1} seconds")

The input File is:

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and wil


 Relation List

  defaultdict(<class 'list'>, {'today': [['today', 'time', 'today', 'time_unit'], ['today', 'time', 'today', 'time_unit'], ['today', 'time', 'today', 'time_unit']], 'not': [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []], 'victory': [['victory', 'defeat', 'triumph', 'happening', 'happening']], 'party': [['party', 'political_party', 'social_group', 'party', 'gathering', 'party', 'company', 'social_group', 'party', 'social_event', 'party', 'causal_agent']], 'celebration': [['celebration', 'jubilation', 'social_event', 'celebration', 'festivity', 'activity', 'celebration', 'solemnization', 'solemnisation', 'practice']], 'freedom': [['freedom', 'attribute', 'exemption', 'freedom', 'condition'], ['freedom', 'attribute', 'exemption', 'freedom', 'condition'], ['freedom', 'attribute', 'exemption', 'freedom', 'condition'], ['freedom', 'attribute', 'exemption', 'freedom', 'condition']], 'end': [['end', 'terminal', 'region', 'end', 'beginning', 'ending


 Runtime of the above Summary is : 0.3380920886993408 seconds
___________________________________
Total Runtime of the program is : 0.4607679843902588 seconds


In [48]:
##Size of Statements required, So based on the size summary is given out...
for every in data:
    if len(sent)>=5:
        size=5
    else:
        size=5
    F_S=Summary()
    print(F_S.summarize(sent,chain_final,size))

    file = open('Output for Extractive\summaryoutput.txt', 'w')
    for s in F_S.summarize(sent,chain_final,size):
        file.write(s) 
    file.close()

AssertionError: 