In [1]:
from nltk.corpus import stopwords
#from nltk.cluster.util import cosine_distance
import numpy as np
import math
import networkx as nx   # FOR(PAGE RANK)
import re
import string

In [2]:
def readarticle(filename):
    file=open(filename,'r')
    filedata=file.read()
    article=filedata.split(".")
    sentences=[]
    for sentence in article:
        sentences.append(sentence.strip().replace("[^a-zA-Z]", " ").split(" "))
    return sentences

In [3]:
from nltk.stem import WordNetLemmatizer, LancasterStemmer
from nltk.corpus import stopwords
import re
def readarticle_lema(filename):
    file=open(filename,'r',encoding="utf-8")
    file_text=file.read()
    
    org_text = file_text
    org_text=re.sub("[()]",".",org_text)
    orginal = org_text.split(".")
    if len(orginal[-1])<5:
        orginal.pop(-1)
        
    
    file_text=re.sub("[()]",".",file_text)
    file_text=re.sub("[^a-zA-Z0-9.]"," ",file_text)
    file_text=re.sub('["]',' ',file_text)
    sents= file_text.split(".")
    if len(sents[-1])<5:
        sents.pop(-1)
    words=list()
    wordnet_lemmatizer = WordNetLemmatizer()
    for sent in sents:
        sent=sent.strip()
        lema_words=[wordnet_lemmatizer.lemmatize(word.lower(), pos="v") for word in sent.split()]
        words.append([word for word in lema_words if word not in list(stopwords.words('english'))])
    return orginal,words

def readarticle_stma(file_text):
    #file=open(filename,'r',encoding="utf-8")
    #file_text=file.read()
    
    org_text = file_text
    org_text=re.sub("[()]",".",org_text)
    orginal = org_text.split(".")
    if len(orginal[-1])<5:
        orginal.pop(-1)
    
    file_text=re.sub("[()]",".",file_text)
    file_text=re.sub("[^a-zA-Z0-9.]"," ",file_text)
    file_text=re.sub('["]',' ',file_text)
    sents= file_text.split(".")
    if len(sents[-1])<5:
        sents.pop(-1)
    words=list()
    lancaster=LancasterStemmer()
    for sent in sents:
        sent=sent.strip()
        stema_words=[lancaster.stem(word.lower()) for word in sent.split()]
        words.append([word for word in stema_words if word not in list(stopwords.words('english'))])
    return orginal,words

In [4]:
def get_uniquewords(sentences):
    stop_words=stopwords.words('english')
    unque=set()
    for sentence in sentences:
        for word in sentence:
            if word not in stop_words:
                unque.add(word)
    return sorted(list(unque))

# EXTRACTIVE METHODS

## 1 . TextRank (Page rank)

In [5]:
def cosine_similarity(A,B):
    denom =np.linalg.norm(A)*np.linalg.norm(B)
    if denom==0:
        return 0.7
    return np.dot(A,B)/(np.linalg.norm(A)*np.linalg.norm(B))

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return cosine_similarity(vector1, vector2)
    #return 1 - cosine_distance(vector1, vector2)

In [6]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        sent1=[i.lower() for i in sentences[idx1]]
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            sent2=[i.lower() for i in sentences[idx2]]
            similarity_matrix[idx1][idx2] = sentence_similarity(sent1, sent2, stop_words)

    return similarity_matrix

In [7]:
def TextRank_summary(file_name,ABSTRACT_SIZE=0.3):
    stop_words=stopwords.words('english')
    summary=[]
    act_sentences,sentences=readarticle_stma(file_name)
    similarityMatrix=build_similarity_matrix(sentences,stop_words)
    
    
    #ranking the sentences using Page Rank
    sentence_similarity_graph = nx.from_numpy_array(similarityMatrix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    sorted_scores=list(sorted(scores.items(),key= lambda item:item[1],reverse=True))[:max(1,math.floor(len(act_sentences)*ABSTRACT_SIZE))]
    
    return ".".join([act_sentences[i[0]] for i in sorted(sorted_scores[:])])


In [8]:
TextRank_summary("text.txt")

'In an ramraj attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills. Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services. This program also included developer-focused AI school that provided a bunch of assets to help build AI skills'

## 2 . Luhn's method (feature based )

In [9]:
def top_words(sentences):
    record = {}
    common_words =  stopwords.words('english')  #load_common_words()
    for sentence in sentences:
        words = sentence.split()
        for word in words:  #sentences is already a list of words so no need to split again. .
            w = word.strip('.!?,()\n').lower()
            record[w]= record.get(w,0)+1

    for word in record.keys():
        if word in common_words:
            record[word] = -1     
    occur = [key for key in record.keys()]
    occur.sort(reverse=True, key=lambda x: record[x])
    return set(occur[: len(occur) // 10 ])

In [10]:
def calculate_score(sentence, metric):
    words = sentence.split()
    imp_words, total_words, begin_unimp, end, begin = [0]*5
    for word in words:
        w = word.strip('.!?,();').lower()
        end += 1
        if w in metric:
            imp_words += 1
            begin = total_words
            end = 0
        total_words += 1
    unimportant = total_words - begin - end
    if(unimportant != 0):
        return float(imp_words**2) / float(unimportant)
    return 0.0

In [11]:
def Luhn_summary(file_name,ABSTRACT_SIZE=0.3):
    actual,sentences = readarticle_stma(file_name)
    sentences = [" ".join(sentence) for sentence in sentences] #to make words list to sentence
    metric = top_words(sentences)
    scores = {}
    for i,sentence in enumerate(sentences):
        scores[i]=calculate_score(sentence, metric)
    

    #sorting according to the ranking
    sorted_scores=list(sorted(scores.items(),key= lambda item:item[1],reverse=True))[:max(1,math.floor(len(actual)*ABSTRACT_SIZE))]
    
    return ".".join([actual[i[0]] for i in sorted(sorted_scores)])


In [12]:
Luhn_summary("text.txt")

'In an ramraj attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills. Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services. The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning'

## 3. LSA ( cross )

In [13]:
def modified_tfidf(sentences , unique_words,modified=True):
    tf_idf= np.zeros((len(sentences),len(unique_words)))
    tot_frequency=dict()
    i=0
    # frequency matrix or TF values
    for sentence in sentences:
        for word in sentence :
            if word in unique_words:
                j =unique_words.index(word)
                freq = tf_idf[i][j]
                if freq==0 :
                    tot_frequency[word]=tot_frequency.get(word,0)+1
                tf_idf[i][j]=freq+1
        i=i+1
    #print(tot_frequency)
    #binary=tf_idf
    # calculating IDF values for all the unique values
    x,y = tf_idf.shape
    idf={}
    for i in tot_frequency.keys():
        idf[i]=math.log(x/tot_frequency[i])
    #print(idf)
    # calculating tf_idf values 
    for i in range(x):
        for j in range(y):
            tf_idf[i][j] = tf_idf[i][j]*idf[unique_words[j]]
    if modified:
        # modified Tf_IDF approch making the less than average values to zero to remove noise
        sent_avg = np.mean(tf_idf,axis=1)
        #print("average= ",sent_avg)
        res=[]
        for i in range(x):
            res.append(list(np.greater(tf_idf[i],sent_avg[i]).astype("int")))
        #print(np.count_nonzero(tf_idf==0) , np.count_nonzero((res*tf_idf)==0))
        return res*tf_idf
    else:
        return tf_idf

In [16]:
def LSA_summary(filename,ABSTRACT_SIZE=0.3):
    orginal,sentences = readarticle_stma(filename)
    uniqueWords=get_uniquewords(sentences)
    tf_idf_vectors=modified_tfidf(sentences,uniqueWords)
    #print(len(sentences), len(uniqueWords), tf_idf_vectors.shape)
    U,s,V = np.linalg.svd(np.transpose(tf_idf_vectors))
    V_avg=np.mean(V,axis=1)
    #print("avg= " ,V_avg)
    
    # redusing the noices again 
    res=[]
    for i in range(len(V_avg)):
        res.append(list(np.greater(V[i],V_avg[i]).astype("int")))
    V= V*res
    
    # geting the sentence length values
    Lengths = np.sum(V,axis=0)
    scores = {i:j for i,j in enumerate(Lengths)}

    #sorting according to the ranking
    sorted_scores=list(sorted(scores.items(),key= lambda item:item[1],reverse=True))[:max(1,math.floor(len(orginal)*ABSTRACT_SIZE))]
    #print(sorted_scores)
    # Selecting the top sentences
    
    return ".".join([orginal[i[0]] for i in sorted(sorted_scores)])

In [17]:
LSA_summary("text.txt")


' The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning. The program is an attempt to ramp up the institutional set-up and build capabilities among the educators to educate the workforce of tomorrow." The program aims to build up the cognitive skills and in-depth understanding of developing intelligent cloud connected solutions for applications across industry'

## 4 . Fuzzi logic

In [18]:
from fuzzyLogic.summerize import fuzzy_summary

In [19]:
fuzzy_summary("text.txt")

before feature calculation
feature 1 done
feature 2 done
feature 3 done
feature 4 done
feature 5 done
feature 6 done


ZeroDivisionError: division by zero

In [3]:
import nltk

In [4]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lokanadh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [6]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lokanadh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

## 5. Sentence Embidings (ref)

In [1]:
### DATA SET AMAZON REVIEWS DATA SET (OPTIONAL)
### PRE TRIENDED 50 DIMENSSIONS EMBIDINGS(GLOVE)
### k-means clustering and selection based on the distance to the center of the cluster.

In [20]:
import numpy as np
import re
import string

In [5]:
def loadEmbeddingMatrix(EMBEDDING_FILE):
    embeddings_index = dict()
    #Transfer the embedding weights i dictionary by iterating through every line of the file.
    f = open(EMBEDDING_FILE,'r',encoding='utf-8')
    for line in f:
        #split up line into an indexed array
        values = line.split()
        #first index is word
        word = values[0]
        #store the rest of the values in the array as a new array
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs #50 dimensions
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    return embeddings_index #, embedding_matrix

## Loading 'glove' words
emb_index= loadEmbeddingMatrix('C:/Users/lokanadh/Desktop/Anaconda/text summerization/glove.6B.50d.txt')

Loaded 400000 word vectors.


In [8]:
def get_sent_embedding(wordlist):
    """
    This function calculates the embedding of each sentence in the review. Checks if the sentence being passed is a valid one, 
    removing the punctuation and emojis etc.
    """
    sent_emb = []
    for i in wordlist:
        i = i.lower()
        try :
            res=list(emb_index[i])
            print(i,sep="    ",end="")
        except:
            res=list(emb_index['unknown'])
            print('""'+i+'""',end="    ")
        print(res,end="\n\n")
        sent_emb.append(res)

    #calculating the mean 
    
    #for empty senetnces
    if len(sent_emb)<1:
        return np.zeros(50)
    sent_emb=np.mean(sent_emb,axis=0)
    return np.array(sent_emb)

In [9]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def Embeding_summery(file_name):
    actual , sentences =readarticle_lema(file_name)
    emb_sents=[get_sent_embedding(sent) for sent in sentences]
    sentences=[" ".join(sent) for sent in sentences]
    n_clusters = int(np.ceil(len(emb_sents)**0.5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(emb_sents)
    avg = []
    closest = []
    for j in range(n_clusters):
        idx = np.where(kmeans.labels_ == j)[0]
        #print("IDX is: ", idx)
        avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,emb_sents)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    summary = ' '.join([actual[closest[idx]] for idx in ordering])
    return summary

Embeding_summery("text.txt")

""ramraj""    [0.89855, 0.30093, 0.38384, -0.07748, 1.2406, 0.6338, -0.49759, 0.59377, -0.16398, -0.079284, 0.6614, -0.17841, 0.064431, 0.15498, 0.63783, -0.12535, -0.045814, 0.084162, -0.84272, 0.25469, -0.53641, 0.058337, 0.53229, 0.60801, 0.41529, -1.2192, -1.1077, -0.29251, 0.50284, 0.65703, 2.2331, -1.2356, 0.18461, -1.1709, 0.56209, 0.3741, 0.24536, -0.21032, -0.35088, 0.20336, 0.098822, -0.15596, 0.088795, 0.17909, 0.21729, -0.50994, -0.48693, -0.07791, 0.55245, -0.62789]

attempt[0.5323, -0.70044, 0.37422, -0.31529, 0.012028, 0.50933, -0.23546, 0.72083, -0.1387, 0.5066, -0.38464, 0.12579, -0.68992, 0.38425, -0.1367, 0.083163, 0.62144, -1.0586, -0.75507, -0.0057474, 0.53155, 0.097558, -0.41206, -0.40328, 0.31674, -2.2257, -0.043076, -0.83749, 0.93522, -0.21468, 2.4972, 0.2487, -1.7374, -0.48367, -0.051459, 0.2463, -0.25872, -0.20222, -0.43626, -0.4603, -0.11058, -0.019738, 0.089743, -0.35528, 0.15794, -0.24601, 0.30675, 0.02953, 0.63595, -0.59635]

build[1.2426, 0.56913, 1.0025,

' The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning  This will require more collaborations and training and working with AI  Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and the jobs of tomorrow will require a different skillset'

## 6. Own method (based on paper)

In [24]:
import numpy as np
import math

In [25]:
def SVD_elimenation_clusterWise(tf_idf_vectors,indx,elimiate=False):
    U,s,V = np.linalg.svd(np.transpose(tf_idf_vectors))
    V_avg=np.mean(V,axis=1)
    #print("avg= " ,V_avg)
    
    # redusing the noices again 
    res=[]
    for i in range(len(V_avg)):
        res.append(list(np.greater(V[i],V_avg[i]).astype("int")))
    if elimiate:
        V= V*res
    
    # geting the sentence length values
    Lengths = np.sum(V,axis=0)
    return {i:j for i,j in zip(indx,Lengths)}

In [26]:
def get_abstract_ratio(counts,ABSTRACT_SIZE):
    f_c=[]
    c=[]
    for _,i in counts.items():
        f_c.append(math.floor(i*ABSTRACT_SIZE))
        c.append(float(i)*ABSTRACT_SIZE)
    return f_c,c

In [27]:
def remove_bottom_30(selected,scores):
    scores = sorted(scores.items(),key=lambda item:item[1])[:round(len(selected)*0.40)]
    for i in scores:
        selected[i[0]]=-1
    return selected

In [28]:
def get_sent_embedding_tfidf(wordlist,tf_idf,unique_words):
    """
    This function calculates the embedding of each sentence in the review. Checks if the sentence being passed is a valid one, 
    removing the punctuation and emojis etc.
    """
    sent_emb = []
    for i in wordlist:
        i = i.lower()
        try :
            res=list(emb_index[i])
        except:
            res=list(emb_index['unknown'])
            
        sent_emb.append(np.array(res)*tf_idf[unique_words.index(i)])

    # multiplying with the tf_idf values
    #calculating the mean 
    sent_emb=np.mean(sent_emb,axis=0)
    return np.array(sent_emb)

In [29]:
def select_sentences_high(selected,scores,ratios):
    '''
    need to the cluster preference order based on the average sentence value.
    '''
    sele=0
    for i in range(len(ratios)):
        j=0
        #print("cluster []",i)
        sorted_ist = sorted(scores[i].items(),key=lambda item:item[1],reverse=True)
        while j<ratios[i]:
            top = sorted_ist.pop(0)
            index = top[0]
            if selected[index]==0:
                sele=sele+1
                selected[index]=1
                j=j+1
    return selected,sele
        
        

In [30]:
def select_sentences_low(selected , scores ,get_ratio):
    min_cover = [float(math.ceil(i))-i for i in get_ratio]
    min_index =min_cover.index(min(min_cover))
    j=0
    sorted_list=sorted(scores[min_index].items(),key=lambda item:item[1],reverse=True)
    while j<1:
        top = sorted_list.pop(0)
        index = top[0]
        if selected[index]==0:
            selected[index]=1
            j=j+1
    return selected,min_index

In [71]:
def OWN_summary(filename,ABSTRACT_SIZE=0.3):
    actual,sentences = readarticle_lema(filename)
    uniqueWords=get_uniquewords(sentences)
    tf_idf = modified_tfidf(sentences,uniqueWords,modified=True)
    print("loaded text and converted to matrix: done")
    # sentence wise normalizationof tf_idf values
    res=[]
    for i in tf_idf:
        denom=max(i)-min(i)
        if denom==0:
            res.append(i)
        else:
            res.append((i-min(i))/(max(i)-min(i)))
    res = np.array(res)+1
    print("done1")
    # getting the sentence embedding 
    emb_sents = [get_sent_embedding_tfidf(sentences[i],res[i],uniqueWords) for i in range(len(sentences))]
    
    # clustering the sentences
    print("started clustering  :",end=" ")
    n_clusters = int(np.ceil(len(emb_sents)**0.5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(emb_sents)
    print("done")
    
    print("applying SVD alogo :",end=" ")
    # implementation of the SvD method to give attention to the required sentences in each clusters.
    tf_idf_cluster=np.array(tf_idf)
    scores_cluster = {}
    counts={}
    for i in range(n_clusters):
        idx = list(np.where(kmeans.labels_ == i)[0])
        scores_cluster[i]=SVD_elimenation_clusterWise(tf_idf_cluster[idx],idx,elimiate=True)
        counts[i]=len(idx)
    # implementation of the SvD method to give attention on global level.
    scores_global = SVD_elimenation_clusterWise(tf_idf_cluster,[i for i in range(len(actual))])
    print("done")
    
    selected=[0 for i in range(len(actual))]
    
    # the bellow process eliminates bottom 30% of text before the selection step.
    selected = remove_bottom_30(selected,scores_global)
    #print(selected)
    print("removed bottom 40 percentel using globel level : done")
    # get top ABSTRACT_SIZE percent sente form each cluster
    
    print("constructing summary from top :",end=" ")
    
    get_ratio_f,get_ratio= get_abstract_ratio(counts,ABSTRACT_SIZE)
    
    to_be_selected=math.floor(len(actual)*ABSTRACT_SIZE)
    selected,cur_selected = select_sentences_high(selected,scores_cluster,get_ratio_f)
    print("done")
    while(cur_selected<to_be_selected):
        print("....calling function to rescue")
        selected , cl= select_sentences_low(selected , scores_cluster ,get_ratio)
        get_ratio[cl]=math.ceil(get_ratio[cl])
        cur_selected=cur_selected+1
    print("\n\n",selected,"\n")
    
    summary=[]
    for i in range(len(selected)):
        if selected[i]==1:
            summary.append(actual[i])
    return ".".join(summary)

In [32]:
dic = OWN_summary("text.txt")
print(dic)

loaded text and converted to matrix: done
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [-1, 0, 1, -1, 0, -1, 0, 1, -1, 0, 1, -1] 

 As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses. The program is an attempt to ramp up the institutional set-up and build capabilities among the educators to educate the workforce of tomorrow. The program was developed to provide job ready skills to programmers who wanted to hone their skills in AI and data science with a series of online courses which featured hands-on labs and expert instructors as well


## TEST AREA

In [35]:
from bs4 import BeautifulSoup
# from urllib.request import urlopen
from urllib.request import urlopen
def get_text(url):
	page = urlopen(url)
	soup = BeautifulSoup(page)
	fetched_text = ' '.join(map(lambda p:p.text,soup.find_all('p')))
	return fetched_text

In [54]:
text = get_text('https://en.wikipedia.org/wiki/Mahesh_Babu')
print("done")
act,sentences=readarticle_stma(text)
uniq = get_uniquewords(sentences)

done


In [75]:
res=np.array([[1.,2.3,4.32,3.4]])
np.mean([[0,0,0,0,0]],axis=0)

array([0., 0., 0., 0., 0.])

In [77]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# EVALUATION

In [None]:
!pip install rouge-score

In [1]:
import text_summerizer as TS

In [2]:
import pandas as pd
df = pd.read_csv("cleaned_CNN.csv")
df

Unnamed: 0,articles,summaries
0,it is official american . president barack o...,syrian official obama climbed to the top of t...
1,cnn usain bolt rounded off the world cham...,usain bolt wins third gold of world championsh...
2,"kansas city, missouri cnn the general ser...",the employee in agencys kansas city office is ...
3,los angeles cnn a medical doctor in vanco...,new a canadian doctor says she was part of a ...
4,cnn police arrested another teen thursday...,another arrest made in gang rape outside calif...
...,...,...
195,cnn arsenal came back from two goals down...,arsenal came back from two goals down to claim...
196,cnn getting hired or promoted in todays c...,"every company has an emotional motivation, or ..."
197,when harry returned home the morning after the...,residents of tacloban returning to home to che...
198,sydney cnn indian prime minister narendra...,"australia returns lost art treasures to india,..."


In [8]:
import text_summerizer as TS
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)

import pandas as pd
df = pd.read_csv("cleaned_CNN.csv")
raw_text_list = df['articles'].values
sum_text_list = df['summaries'].values

models= ["TextRank","Luhn","LSA","Embeding","own"]
scores_methods=["rouge"]
rouge_scores=dict()


for model in models:
    print("-"*30+model+"-"*30)
    pred_text_list = list()
    article =list()
    scores=list()
    for art,summary_orginal in zip(list(raw_text_list),list(sum_text_list)):
        try:
            summ = TS.summerize(model,art,0.05)
        except:
            continue
        article.append(art)
        pred_text_list.append(summ)
        print("#",end="")
        scores.append(scorer.score(summary_orginal , summ))
    rouge_scores[model]=scores
    print(scores)
print("\n\n")
print("-"*50)
print(rouge_scores)

------------------------------TextRank------------------------------
#######################################################################################################################################################################################################[{'rouge1': Score(precision=0.4, recall=0.037037037037037035, fmeasure=0.06779661016949153), 'rougeL': Score(precision=0.2, recall=0.018518518518518517, fmeasure=0.03389830508474576)}, {'rouge1': Score(precision=0.4, recall=0.06666666666666667, fmeasure=0.1142857142857143), 'rougeL': Score(precision=0.4, recall=0.06666666666666667, fmeasure=0.1142857142857143)}, {'rouge1': Score(precision=0.26153846153846155, recall=0.4594594594594595, fmeasure=0.3333333333333333), 'rougeL': Score(precision=0.13846153846153847, recall=0.24324324324324326, fmeasure=0.17647058823529413)}, {'rouge1': Score(precision=0.2328767123287671, recall=0.34, fmeasure=0.2764227642276423), 'rougeL': Score(precision=0.1095890410958904, recall=0.16, fmeasu

#######################################################################################################################################################################################################[{'rouge1': Score(precision=0.1746031746031746, recall=0.6111111111111112, fmeasure=0.2716049382716049), 'rougeL': Score(precision=0.10582010582010581, recall=0.37037037037037035, fmeasure=0.16460905349794236)}, {'rouge1': Score(precision=0.10256410256410256, recall=0.13333333333333333, fmeasure=0.11594202898550725), 'rougeL': Score(precision=0.05128205128205128, recall=0.06666666666666667, fmeasure=0.057971014492753624)}, {'rouge1': Score(precision=0.373134328358209, recall=0.6756756756756757, fmeasure=0.4807692307692308), 'rougeL': Score(precision=0.22388059701492538, recall=0.40540540540540543, fmeasure=0.28846153846153844)}, {'rouge1': Score(precision=0.2328767123287671, recall=0.34, fmeasure=0.2764227642276423), 'rougeL': Score(precision=0.1095890410958904, recall=0.16, fmeasure=0.1300

#######################################################################################################################################################################################################[{'rouge1': Score(precision=0.23809523809523808, recall=0.37037037037037035, fmeasure=0.2898550724637681), 'rougeL': Score(precision=0.10714285714285714, recall=0.16666666666666666, fmeasure=0.13043478260869565)}, {'rouge1': Score(precision=0.3333333333333333, recall=0.2, fmeasure=0.25), 'rougeL': Score(precision=0.2777777777777778, recall=0.16666666666666666, fmeasure=0.20833333333333334)}, {'rouge1': Score(precision=0.2564102564102564, recall=0.2702702702702703, fmeasure=0.2631578947368421), 'rougeL': Score(precision=0.1794871794871795, recall=0.1891891891891892, fmeasure=0.1842105263157895)}, {'rouge1': Score(precision=0.2708333333333333, recall=0.26, fmeasure=0.2653061224489796), 'rougeL': Score(precision=0.125, recall=0.12, fmeasure=0.12244897959183673)}, {'rouge1': Score(precision=0.2

[70 33 57 13  7 56 48  6 67 79]
#[10  2  7 12 11]
#[21 27 41 36 28 18 46]
#[40 14 10 29  0 22  1]
#[ 1 14  2  0]
#[16 13 21 12 28 10 30]
#[30 17 49 57  1 50 56 35 18]
#[10 18 32 41 22 14 35]
#[14  1  9  8 16]
#[10  4  2  8  6]
#[17 23 19 20  8]
#[9 0 5 4 8]
#[18 19 33 39  5  4  2]
#[ 8 12 18  6 17  9]
#[ 3 49 56 76  2 25 73 37 43 77]
#[59 46 35 22 43 52 23 11]
#[ 0 15 20 18 33 35 40]
#[ 2 10  7 12 16]
#[38 22 33 12  3 50 29  9]
#[ 0  2  1 15  8]
#[4 1 2]
#[14 11  2  5]
#[24  8  3 10 49 14 12  4]
#[ 4  0  6  5 24 13]
#[33 27  6 14 10 42 47]
#[11  0  7 16 30  8]
#[7 1 0 9]
#[ 4 36 37  1 17 22 12]
#[ 2 36  3 21  5 13  6]
#[16  6 38 26 43 20 24 53]
#[ 0  1 27 18 21  5]
#[ 8 12  6  5]
#[22 23 48 61 30 12 59 47 27]
#[ 7  1  4 10]
#[ 9  6 13 17 14]
#[ 4  1  3 10]
#[ 2 23 21 20 22]
#[35  6 47 32 22 24  7 33]
#[13  6  1  2  7]
#[15 39 50 11 44  8 25 22 21]
#[61 54 44 35 57 32 13 62 63]
#[18 14  1  2 17  4]
#[40  3 41 44  6 13 35]
#[10  5  6 30 24 35 28]
#[ 9  4 35 18 14  7]
#[ 4 16 26 34 19 14 

loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, 0, -1, 0, 0, 0, -1, 0, -1, -1, 0, -1, 0, 0, -1, -1, 1, 0, -1, 0, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, 0, 0, -1, -1, 0, -1, -1, 0, -1, -1, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 1, 1, 1, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, -1, -1, -1, -1, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, 0, -1, 0, 1, -1, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, -1, 0, -1, -1, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summa

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, 0, -1, -1, -1, 0, -1, 0, 0, 0, -1, 1, 0, 0, -1, 0, -1, -1, 0, -1, 0, 0, 0, 0, 0, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [-1, 0, -1, 0, -1, -1, 0, 1, -1, -1, 0, 0, -1, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, -1, 0, -1, -1, -1, 1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, 0, 1, 0, 0, -1, 0, 0, 0, -1

done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 [-1, 0, -1, 0, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [-1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 1, -1, -1, -1, 0, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 [0, -1, 0, -1, 0, 0, 0, 0, -1, 0, 0, -1, -1, 0, -1, 0, -1, 0, 1, 0, -1, 0, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, -1, 0, 0, 0, -1, 0, -1, -1, 1, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel usi

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, -1, 0, 0, 0, -1, -1, 0, 0, 1, -1, -1, 0, 0, -1, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, -1, 0, 0, 0, -1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [0, 0, 0, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, 1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, 0, 1, 0, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue
....calling function to rescue


 [1, -1, 0, 0, 0, -1, -1, 0, 0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, 0, -1, 0, -1, -1, 1, -1, -1, -1, -1, 0, 0, 0, 0, -1, 0, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, 0, 1, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, 0, -1, -1, -1, 1, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, -1, 0, -1, -1, 0, 0, 0, -1, 0, 0, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, 0, -1, 0, -1, -1, 0, 0, -1, 0, 0, -1, -1, 0, 1, -1, -1, 0, 0, 0, -1, -1, -1, 0, -1, 0, -1, 0, -1, 1, -1, 0, 0, 1, -1, 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, -1, -1, -1, 0, -1, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [-1, -1, 0, 0, 0, -1, -1, 1, 1, 0, 0, 0, 0, 1, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -1, 0, 0, 0, 0, -1, -1, 0, 0, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, -1, 0, -1, -1, -1, 0, 0, -1, 0, 0, 0, -1, 0, -1,

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [-1, 0, 0, -1, 0, 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 1, -1, 0, -1, 1, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, -1, 0, -1, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 [0, -1, 0, 0, 0, -1, 0, 0, -1, -1, 0, -1, 0, -1] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [-1, 0, -1, 0, -1, 0, 0, -1, 1, -1, 0, 0, -1, 0, 1, 0, -1, -1, -1, 0, -1, -1, 0, -1, -1, 0,

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue
....calling function to rescue


 [0, -1, 0, 0, 0, -1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -1, 0, -1, -1, 0, 0, 1, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, -1, -1, -1, -1, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, -1, 0, -1, -1, 0, -1, -1, 0, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 [0, -1, -1, 0, 0, 0, 0, 0, -1, 0, -1, -1, 0, -1, 1, 0, 0, -1, -1, 0, -1, -1, 0, 0, -1, 0, 0, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....cal

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, -1, -1, 0, -1, 0, -1, 0, 0, 0, 0, -1, -1, 1, 0, -1, 0, 0, -1, 0, 0, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 [0, -1, 0, 0, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done


 [-1, 0, 0, -1, 0] 

#loaded text and converted to matrix: done
done1
started clustering  : done
applying SVD alogo : done
removed bottom 40 percentel using globel level : done
constructing summary from top : done
....calling function to rescue


 [0, -1, 0, 0, 0, 0, -1, 0, 0, 

{'TextRank': [{'rouge1': Score(precision=0.4, recall=0.037037037037037035, fmeasure=0.06779661016949153), 'rougeL': Score(precision=0.2, recall=0.018518518518518517, fmeasure=0.03389830508474576)}, {'rouge1': Score(precision=0.4, recall=0.06666666666666667, fmeasure=0.1142857142857143), 'rougeL': Score(precision=0.4, recall=0.06666666666666667, fmeasure=0.1142857142857143)}, {'rouge1': Score(precision=0.26153846153846155, recall=0.4594594594594595, fmeasure=0.3333333333333333), 'rougeL': Score(precision=0.13846153846153847, recall=0.24324324324324326, fmeasure=0.17647058823529413)}, {'rouge1': Score(precision=0.2328767123287671, recall=0.34, fmeasure=0.2764227642276423), 'rougeL': Score(precision=0.1095890410958904, recall=0.16, fmeasure=0.13008130081300812)}, {'rouge1': Score(precision=0.16, recall=0.125, fmeasure=0.14035087719298245), 'rougeL': Score(precision=0.12, recall=0.09375, fmeasure=0.10526315789473684)}, {'rouge1': Score(precision=0.38095238095238093, recall=0.22222222222222

In [19]:
import numpy as np
models= ["TextRank","Luhn","LSA","Embeding","own"]
S=dict()
for model in models:
    scores = rouge_scores[model]
    p=[]
    r=[]
    f=[]
    pp=[]
    rr=[]
    ff=[]
    for i in scores:
        p.append(i['rouge1'].precision)
        r.append(i['rouge1'].recall)
        f.append(i['rouge1'].fmeasure)
        pp.append(i['rougeL'].precision)
        rr.append(i['rougeL'].recall)
        ff.append(i['rougeL'].fmeasure)
    s = dict({'rouge1':dict({'precision':np.mean(p),'recall':np.mean(r),'fmeasure':np.mean(f)}),'rougeL':dict({'precision':np.mean(pp),'recall':np.mean(rr),'fmeasure':np.mean(ff)}) })
    print(s)
    S[model]=s

{'rouge1': {'precision': 0.25252229881026844, 'recall': 0.17264653671825525, 'fmeasure': 0.1807700739563423}, 'rougeL': {'precision': 0.18488133081715283, 'recall': 0.1163089895698723, 'fmeasure': 0.12345401083991904}}
{'rouge1': {'precision': 0.28116832566513217, 'recall': 0.3173661259657755, 'fmeasure': 0.2756544964630877}, 'rougeL': {'precision': 0.19286133226766208, 'recall': 0.20998488704659168, 'fmeasure': 0.1859131126811662}}
{'rouge1': {'precision': 0.25985242787210033, 'recall': 0.1998935656390496, 'fmeasure': 0.21302090142663976}, 'rougeL': {'precision': 0.18012239187663273, 'recall': 0.13134089123333623, 'fmeasure': 0.14295502777404942}}
{'rouge1': {'precision': 0.2652218380712955, 'recall': 0.22554096204383503, 'fmeasure': 0.2185919884985633}, 'rougeL': {'precision': 0.1852411805352614, 'recall': 0.14717810959119393, 'fmeasure': 0.14445993748279387}}
{'rouge1': {'precision': 0.1891288028554405, 'recall': 0.14643175045379245, 'fmeasure': 0.1540138656666797}, 'rougeL': {'prec

In [20]:
S

{'TextRank': {'rouge1': {'precision': 0.25252229881026844,
   'recall': 0.17264653671825525,
   'fmeasure': 0.1807700739563423},
  'rougeL': {'precision': 0.18488133081715283,
   'recall': 0.1163089895698723,
   'fmeasure': 0.12345401083991904}},
 'Luhn': {'rouge1': {'precision': 0.28116832566513217,
   'recall': 0.3173661259657755,
   'fmeasure': 0.2756544964630877},
  'rougeL': {'precision': 0.19286133226766208,
   'recall': 0.20998488704659168,
   'fmeasure': 0.1859131126811662}},
 'LSA': {'rouge1': {'precision': 0.25985242787210033,
   'recall': 0.1998935656390496,
   'fmeasure': 0.21302090142663976},
  'rougeL': {'precision': 0.18012239187663273,
   'recall': 0.13134089123333623,
   'fmeasure': 0.14295502777404942}},
 'Embeding': {'rouge1': {'precision': 0.2652218380712955,
   'recall': 0.22554096204383503,
   'fmeasure': 0.2185919884985633},
  'rougeL': {'precision': 0.1852411805352614,
   'recall': 0.14717810959119393,
   'fmeasure': 0.14445993748279387}},
 'own': {'rouge1': {'p

## individual process

In [7]:
raw_text_list = df['articles'].values
line= '____________________________________________________________________________________________________'
i=1
pred_summ= list()
article =list()
for art in list(raw_text_list):
    try:
        summ = TS.Luhn_summary(art,ABSTRACT_SIZE=0.05)
    except:
        summ = "###"
        continue
    pred_summ.append(summ)
    print(i,": success")
    i=i+1

1 : success
2 : success
3 : success
4 : success
5 : success
6 : success
7 : success
8 : success
9 : success
10 : success
11 : success
12 : success
13 : success
14 : success
15 : success
16 : success
17 : success
18 : success
19 : success
20 : success
21 : success
22 : success
23 : success
24 : success
25 : success
26 : success
27 : success
28 : success
29 : success
30 : success
31 : success
32 : success
33 : success
34 : success
35 : success
36 : success
37 : success
38 : success
39 : success
40 : success
41 : success
42 : success
43 : success
44 : success
45 : success
46 : success
47 : success
48 : success
49 : success
50 : success
51 : success
52 : success
53 : success
54 : success
55 : success
56 : success
57 : success
58 : success
59 : success
60 : success
61 : success
62 : success
63 : success
64 : success
65 : success
66 : success
67 : success
68 : success
69 : success
70 : success
71 : success
72 : success
73 : success
74 : success
75 : success
76 : success
77 : success
78 : suc

In [73]:
Embeding_df=pd.DataFrame({'actual':df['summaries'],'predicted':pred_summ})
Embeding_df.to_csv("res\Embeding_results.csv",index=False)

In [3]:
raw_text_list = df['articles'].values
print(raw_text_list[0])
TS.Embeding_summary(raw_text_list[0],0.05)

it is  official  american . president barack obama wants lawmakers to weigh in on whether to use military force in syria.  obama sent a letter to the heads of the house and senate on saturday night, hours after announcing that he believes military action against syrian targets is the right step to take over the alleged use of chemical weapons.  the proposed legislation from obama asks congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction."  it is  a step that is set to turn an international crisis into a fierce domestic political battle.  there are key questions looming over the debate  what did united nations . weapons inspectors find in syria  what happens if congress votes no  and how will the syrian government react   in a televised address from the white house rose garden earlier saturday, the president said he would take his case to congress, not because he has t

[70 33 57 13  7 56 48  6 67 79]


'n   house speaker john boehner, majority leader eric cantor, majority whip kevin mccarthy and conference chair cathy mcmorris rodgers issued a statement saturday praising the president  weapons inspectors find in syria  what happens if congress votes no  and how will the syrian government react   in a televised address from the white house rose garden earlier saturday, the president said he would take his case to congress, not because he has to    but because he wants to  some american '

In [74]:
from rouge_score import rouge_scorer
scores=list()
scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'], use_stemmer=True)
for i,j in zip(Embeding_df['actual'].values,Embeding_df['predicted'].values):
    if len(j)>10:
        scores.append(scorer.score(i, j ))

In [75]:
p=[]
r=[]
f=[]
pp=[]
rr=[]
ff=[]
for i in scores:
    p.append(i['rouge1'].precision)
    r.append(i['rouge1'].recall)
    f.append(i['rouge1'].fmeasure)
    pp.append(i['rougeL'].precision)
    rr.append(i['rougeL'].recall)
    ff.append(i['rougeL'].fmeasure)   

In [76]:
import numpy as np
print("rouge1 = ",np.mean(p),np.mean(r),np.mean(f))
print("rougeL = ",np.mean(pp),np.mean(rr),np.mean(ff))

rouge1 =  nan nan nan
rougeL =  nan nan nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
