In [1]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import math
#import networkx as nx   # FOR(PAGE RANK)


In [2]:

import networkx as nx 

In [2]:
def readarticle(filename):
    """Read a text file and split it into sentences of space-separated words.

    The text is split on "." and each piece is stripped and split on single
    spaces. Punctuation other than "." therefore stays attached to its word,
    and a text ending in "." yields a final ``['']`` entry.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of word strings per "."-separated piece.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        filedata = file.read()
    # NOTE(review): str.replace matches its first argument literally (it is not
    # a regex), so this only touches the exact text "[^a-zA-Z]". It is kept
    # as-is so the tokens callers receive do not change.
    return [
        sentence.strip().replace("[^a-zA-Z]", " ").split(" ")
        for sentence in filedata.split(".")
    ]

In [9]:
from nltk.stem import WordNetLemmatizer, LancasterStemmer
from nltk.corpus import stopwords
import re
def readarticle_lema(filename):
    """Read a UTF-8 text file and return its sentences as lemmatized word lists.

    Parentheses are turned into sentence breaks, every character other than an
    ASCII letter, digit or "." becomes a space, and the text is split on ".".
    A final fragment shorter than 5 characters is dropped. Each remaining word
    is lower-cased, discarded if it is an English stop word, and lemmatized as
    a verb (``pos="v"``).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of lemmatized words per sentence.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding="utf-8") as file:
        file_text = file.read()
    file_text = re.sub("[()]", ".", file_text)
    file_text = re.sub("[^a-zA-Z0-9.]", " ", file_text)
    file_text = re.sub('["]', ' ', file_text)
    sents = file_text.split(".")
    if len(sents[-1]) < 5:
        sents.pop(-1)

    # Build the stop-word lookup once instead of once per word.
    stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    words = list()
    for sent in sents:
        lowered = [word.lower() for word in sent.split()]
        words.append([
            wordnet_lemmatizer.lemmatize(word, pos="v")
            for word in lowered
            if word not in stop_words
        ])
    return words
def readarticle_stma(filename):
    """Read a UTF-8 text file and return its sentences as stemmed word lists.

    Parentheses are turned into sentence breaks, every character other than an
    ASCII letter, digit or "." becomes a space, and the text is split on ".".
    A final fragment shorter than 5 characters is dropped. Each remaining word
    is lower-cased, discarded if it is an English stop word, and stemmed with
    the Lancaster stemmer.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of stemmed words per sentence.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding="utf-8") as file:
        file_text = file.read()
    file_text = re.sub("[()]", ".", file_text)
    file_text = re.sub("[^a-zA-Z0-9.]", " ", file_text)
    file_text = re.sub('["]', ' ', file_text)
    sents = file_text.split(".")
    if len(sents[-1]) < 5:
        sents.pop(-1)

    # Build the stop-word lookup once instead of once per word.
    stop_words = set(stopwords.words('english'))
    lancaster = LancasterStemmer()
    words = list()
    for sent in sents:
        lowered = [word.lower() for word in sent.split()]
        words.append([
            lancaster.stem(word)
            for word in lowered
            if word not in stop_words
        ])
    return words

In [4]:
def get_uniquewords(sentences):
    """Return the sorted vocabulary of ``sentences`` without English stop words.

    Words are compared against the stop-word list exactly as given (no
    lower-casing is applied here).

    Args:
        sentences: Iterable of sentences, each an iterable of word strings.

    Returns:
        A sorted list of the distinct non-stop-word words.
    """
    # A set gives constant-time membership tests; the list needed a scan per word.
    stop_words = set(stopwords.words('english'))
    unique = {
        word
        for sentence in sentences
        for word in sentence
        if word not in stop_words
    }
    return sorted(unique)

# EXTRACTIVE METHODS

## 1. TextRank (PageRank)

In [5]:
def cosine_similarity(A,B):
    """Return the cosine of the angle between vectors ``A`` and ``B``.

    Args:
        A: First vector (any sequence NumPy can take a dot product of).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (|A| * |B|)``, or ``0.0`` when either vector has zero
        norm (the ratio would otherwise be 0/0 and evaluate to NaN).
    """
    denominator = np.linalg.norm(A) * np.linalg.norm(B)
    if denominator == 0:
        return 0.0
    return np.dot(A, B) / denominator

def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of the bag-of-words vectors of two sentences.

    Args:
        sent1: First sentence as a list of words.
        sent2: Second sentence as a list of words.
        stopwords: Optional collection of words to leave out of both vectors.

    Returns:
        The cosine similarity of the two word-count vectors, or ``0.0`` when
        either sentence has no countable word (its vector is all zeros and the
        cosine would be undefined).
    """
    if stopwords is None:
        stopwords = []

    # Assign every distinct word a vector slot once, so the counting loops
    # below do a dictionary lookup instead of a list scan per word.
    slot = {}
    for w in sent1 + sent2:
        slot.setdefault(w, len(slot))

    vector1 = [0] * len(slot)
    vector2 = [0] * len(slot)

    # build the vector for the first sentence
    for w in sent1:
        if w not in stopwords:
            vector1[slot[w]] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w not in stopwords:
            vector2[slot[w]] += 1

    # A sentence made only of stop words (or an empty one) has a zero vector.
    if not any(vector1) or not any(vector2):
        return 0.0
    return cosine_similarity(vector1, vector2)

In [6]:
def build_similarity_matrix(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: List of sentences, each a list of word strings.
        stop_words: Words passed on to ``sentence_similarity`` to be ignored.

    Returns:
        A square NumPy array whose entry ``[i][j]`` is the similarity of the
        lower-cased sentences ``i`` and ``j``; the diagonal stays zero.
    """
    size = len(sentences)
    matrix = np.zeros((size, size))
    lowered = [[word.lower() for word in sentence] for sentence in sentences]

    for row, left in enumerate(lowered):
        for col, right in enumerate(lowered):
            # A sentence is never compared with itself.
            if row != col:
                matrix[row][col] = sentence_similarity(left, right, stop_words)

    return matrix

In [7]:
def TextRank_summary(file_name,ABSTRACT_SIZE=0.3):
    """Summarize a text file by ranking its sentences with PageRank.

    Sentences are read with ``readarticle_stma``, linked in a graph weighted
    by pairwise similarity, and scored with ``nx.pagerank``.

    Args:
        file_name: Path of the text file to summarize.
        ABSTRACT_SIZE: Fraction of the sentences to keep in the summary.

    Returns:
        The top-scoring sentences (as returned by ``readarticle_stma``),
        highest score first, joined with ".".
    """
    stop_words = stopwords.words('english')
    # readarticle_stma returns one list of word lists; unpacking it into two
    # names raised ValueError unless the file had exactly two sentences.
    sentences = readarticle_stma(file_name)
    similarityMatrix = build_similarity_matrix(sentences, stop_words)

    # ranking the sentences using PageRank
    sentence_similarity_graph = nx.from_numpy_array(similarityMatrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # sorting according to the ranking
    ranked_sentence = sorted(
        ((scores[i], " ".join(s)) for i, s in enumerate(sentences)),
        reverse=True,
    )

    keep = round(len(sentences) * ABSTRACT_SIZE)
    return ".".join(text for _, text in ranked_sentence[:keep])


In [10]:
TextRank_summary("text.txt")

'program also includ develop focus ai school provid bunch asset help build ai skil.envid three year collab program intellig cloud hub support around 100 institut ai infrastruct cours cont curricul develop support develop tool giv stud access cloud ai serv.attempt build ai ready workforc microsoft annount intellig cloud hub launch empow next gen stud ai ready skil.program develop provid job ready skil program want hon skil ai dat sci sery onlin cours feat hand lab expert instruct wel'

## 2. Luhn's method (feature-based)

In [7]:
def top_words(sentences):
    """Return the most frequent tenth of the distinct words in ``sentences``.

    Args:
        sentences: Iterable of sentence strings; each is split on whitespace.

    Returns:
        A set holding the top ``len(vocabulary) // 10`` words by frequency.
        Words are stripped of surrounding ``.!?,()`` and newlines and
        lower-cased; English stop words are given a count of -1 so they sort
        last.
    """
    common_words = stopwords.words('english')  #load_common_words()

    frequency = {}
    for sentence in sentences:
        for token in sentence.split():
            key = token.strip('.!?,()\n').lower()
            frequency[key] = frequency.get(key, 0) + 1

    # Push stop words to the bottom of the ranking rather than removing them.
    for key in frequency:
        if key in common_words:
            frequency[key] = -1

    ranked = sorted(frequency, key=lambda key: frequency[key], reverse=True)
    return set(ranked[:len(ranked) // 10])

In [8]:
def calculate_score(sentence, metric):
    """Score a sentence by the keywords from ``metric`` that it contains.

    Args:
        sentence: Sentence string; it is split on whitespace.
        metric: Collection of keywords (compared against each word after
            stripping ``.!?,();`` and lower-casing).

    Returns:
        ``keyword_count ** 2 / span`` as a float, or ``0.0`` when ``span`` is
        zero (no keyword found, or an empty sentence).

    NOTE(review): ``span`` is measured from the *last* keyword and the words
    after it are subtracted again, so it is always 1 when a keyword is present
    and the score is just the keyword count squared. Confirm whether a
    first-to-last keyword window was intended.
    """
    tokens = [raw.strip('.!?,();').lower() for raw in sentence.split()]
    hit_positions = [idx for idx, token in enumerate(tokens) if token in metric]
    total = len(tokens)

    if hit_positions:
        last_hit = hit_positions[-1]
        trailing = total - 1 - last_hit  # words after the last keyword
    else:
        last_hit = 0
        trailing = total

    span = total - last_hit - trailing
    if span == 0:
        return 0.0
    return float(len(hit_positions) ** 2) / float(span)

In [9]:
def Luhn_summary(file_name,ABSTRACT_SIZE=0.3):
    """Summarize a text file by keeping its highest-scoring sentences.

    Each sentence is scored with ``calculate_score`` against the keywords from
    ``top_words``.

    Args:
        file_name: Path of the text file to summarize.
        ABSTRACT_SIZE: Fraction of the sentences to keep in the summary.

    Returns:
        The selected sentences in their original order, joined with ". ".
    """
    # readarticle yields word lists; rebuild each sentence string.
    sentences = [" ".join(words) for words in readarticle(file_name)]
    metric = top_words(sentences)

    # Score and rank by position rather than by sentence text. Keying on the
    # text merged duplicate sentences, which skewed both the size of the
    # selection and the final ordering.
    scores = [calculate_score(sentence, metric) for sentence in sentences]
    ranked = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)

    keep = round(len(sentences) * ABSTRACT_SIZE)  # ABSTRACT_SIZE share of the text
    chosen = sorted(ranked[:keep])                # back to order of occurrence
    return '. '.join(sentences[i] for i in chosen)


In [10]:
Luhn_summary("text.txt")

'In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills. Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services. As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses. The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning'

## 3. LSA (cross method)

In [15]:
def modified_tfidf(sentences , unique_words):
    """Compute a noise-filtered TF-IDF matrix (one row per sentence).

    Term frequency is the raw count of a word in a sentence; inverse document
    frequency is ``log(n_sentences / n_sentences_containing_word)``. After
    weighting, every entry that is not strictly above its row's mean is set to
    zero.

    Args:
        sentences: List of sentences, each a list of word strings.
        unique_words: Vocabulary list; column ``j`` belongs to
            ``unique_words[j]``.

    Returns:
        A float NumPy array of shape ``(len(sentences), len(unique_words))``.
        A vocabulary word that occurs in no sentence gets an all-zero column
        (previously this raised ``KeyError``).
    """
    n_sentences = len(sentences)

    # Word -> column lookup; the first occurrence wins, like list.index did.
    column = {}
    for j, word in enumerate(unique_words):
        column.setdefault(word, j)

    # frequency matrix or TF values
    tf_idf = np.zeros((n_sentences, len(unique_words)))
    for i, sentence in enumerate(sentences):
        for word in sentence:
            j = column.get(word)
            if j is not None:
                tf_idf[i, j] += 1

    # IDF per column; columns with no occurrences keep an IDF of 0.
    doc_freq = np.count_nonzero(tf_idf, axis=0)
    idf = np.zeros(len(unique_words))
    seen = doc_freq > 0
    idf[seen] = np.log(n_sentences / doc_freq[seen])

    tf_idf = tf_idf * idf

    # modified TF-IDF approach: zero every value not above its sentence's
    # average to remove noise
    sent_avg = np.mean(tf_idf, axis=1, keepdims=True)
    return tf_idf * (tf_idf > sent_avg)

In [16]:
def LSA_summary(filename,ABSTRACT_SIZE=0.3):
    """Summarize a text file from the SVD of its filtered TF-IDF matrix.

    Args:
        filename: Path of the text file to summarize.
        ABSTRACT_SIZE: Fraction of the sentences to keep in the summary.

    Returns:
        The selected sentences, highest score first, joined with ".".
    """
    sentences = readarticle(filename)
    vocabulary = get_uniquewords(sentences)
    weights = modified_tfidf(sentences, vocabulary)
    _, _, V = np.linalg.svd(np.transpose(weights))

    # Reduce noise again: zero every entry not above the mean of its row.
    row_mean = np.mean(V, axis=1)
    keep_mask = np.greater(V, row_mean[:, np.newaxis]).astype("int")
    V = V * keep_mask

    # One score per sentence: the column sums of the filtered matrix.
    Lengths = np.sum(V, axis=0)

    # Each sentence is scored through the first position at which an equal
    # sentence occurs.
    sort_keys = [Lengths[sentences.index(sentence)] for sentence in sentences]
    order = sorted(range(len(sentences)), key=lambda i: sort_keys[i], reverse=True)
    limit = round(len(sentences) * ABSTRACT_SIZE)
    return ".".join(" ".join(sentences[i]) for i in order[:limit])

In [17]:
LSA_summary("text.txt")


'" The program aims to build up the cognitive skills and in-depth understanding of developing intelligent cloud connected solutions for applications across industry.Earlier in April this year, the company announced Microsoft Professional Program In AI as a learning track open to the public.In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills.'

## 4. Fuzzy logic

In [1]:
from fuzzyLogic.summerize import fuzzy_summary

In [2]:
fuzzy_summary("text.txt")

before feature calculation
feature 1 done
feature 2 done
feature 3 done
feature 4 done
feature 5 done
feature 6 done
feature 7 done
feature 8 done
after feature calculation ... going into results calculation
after results vector is done almost done.... 
	


['Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services.',
 'The program is an attempt to ramp up the institutional set-up and build capabilities among the educators to educate the workforce of tomorrow."',
 'The program aims to build up the cognitive skills and in-depth understanding of developing intelligent cloud connected solutions for applications across industry.']

In [3]:
import nltk

In [4]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lokanadh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [6]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lokanadh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

## 5. Hybrid method (own, under development)

In [None]:
from sklearn.cluster import KMeans

In [None]:
def select_kvalue(vectors):
    """Choose the number of clusters for ``vectors``.

    The previous body returned an undefined name and always raised
    ``NameError``. Until a data-driven selection is implemented, this uses the
    same square-root rule ``Embeding_summery`` applies to its own inputs.

    Args:
        vectors: Sequence or array with one row per sentence.

    Returns:
        ``ceil(sqrt(len(vectors)))`` as an int, never less than 1.
    """
    # TODO: replace with an elbow-style selection (the original note mentions
    # the Yellowbrick package).
    return max(1, int(np.ceil(len(vectors) ** 0.5)))

In [83]:
def def_mean(pos,debug):
    """Return the mean gap between consecutive entries of ``pos``.

    Args:
        pos: Sequence of positions.
        debug: Accepted for signature symmetry; not used here.

    Returns:
        The average of ``pos[i] - pos[i-1]``. When ``pos`` has a single entry,
        that entry itself is returned.
    """
    size = len(pos)
    if size <= 1:
        return pos[0]
    total_gap = sum(later - earlier for earlier, later in zip(pos, pos[1:]))
    return total_gap / (size - 1)

In [87]:
def outlier_removed_mean(pos,avg,debug):
    """Average ``pos`` after trimming end positions that sit too far out.

    While the gap at either end exceeds ``avg``, the end with the larger gap
    is dropped (the front on ties) and the check is repeated.

    Args:
        pos: Sequence of positions.
        avg: Largest end gap that is still accepted.
        debug: When truthy, print the window on every pass and "in equal" when
            both end gaps are within ``avg``.

    Returns:
        The mean of the remaining positions, or the single remaining position
        when only one is left.
    """
    window = pos
    while True:
        if debug:
            print(window)
        size = len(window)
        if size == 1:
            return window[0]
        head_gap = window[1] - window[0]
        tail_gap = window[-1] - window[-2]
        if head_gap <= avg and tail_gap <= avg:
            if debug:
                print("in equal")
            return sum(window) / size
        if head_gap >= tail_gap:
            window = window[1:]
        elif tail_gap > head_gap:
            window = window[:-1]
        else:
            # Neither comparison holds (e.g. unordered values): no result.
            return None

In [88]:
def find_flow(clusters,k,debug=False):
    """Order cluster labels by where their members sit in the sequence.

    Args:
        clusters: List of cluster labels, one per item, in item order. It must
            support ``.index`` and contain every label in ``1 .. k-1``.
        k: One more than the largest label; labels ``1 .. k-1`` are processed.
        debug: When truthy, print the intermediate tables.

    Returns:
        ``(flow, count)``: ``flow`` lists the labels sorted by their
        outlier-trimmed mean position, ``count`` maps each label to its number
        of members.
    """
    labels = range(1, k)

    count, first_pos, last_pos, pos = {}, {}, {}, {}
    for label in labels:
        count[label] = 0
        first_pos[label] = clusters.index(label)  # first position of the label
        last_pos[label] = 0
        pos[label] = []

    for position, label in enumerate(clusters):
        count[label] += 1
        last_pos[label] = position
        pos[label].append(position)
    if debug:
        print("counts=",count)

    # spread, plain positional average, and mean gap between positions
    spread = {label: last_pos[label] - first_pos[label] for label in labels}
    pos_avg = {label: sum(pos[label]) / len(pos[label]) for label in labels}
    pos_def_mean = {label: def_mean(pos[label], debug=debug) for label in labels}

    # mean position once positional outliers (w.r.t. the mean gap) are trimmed
    final_pos = {}
    for label in labels:
        final_pos[label] = outlier_removed_mean(pos[label], pos_def_mean[label], debug=debug)
    if debug:
        print("\npositions = ",pos,"\nnormal avg= ",pos_avg,"\npos_deff=",pos_def_mean,"\nfinal = " ,final_pos)

    flow = sorted(labels, key=lambda label: final_pos[label])
    if debug:
        print(flow)

    return flow, count

In [89]:
# Manual check of find_flow on random cluster labels.
import numpy as np
# 40 labels drawn from 1..5 (`high` is exclusive). No seed is set, so the
# labels and the output below differ on every run.
l=np.random.randint(low=1,high=6,size=40)
print(l)

# find_flow calls clusters.index(...), hence the list() conversion; k=6 makes
# it process labels 1..5.
find_flow(list(l),6,debug=True)

[5 2 2 2 2 5 4 2 4 5 4 1 5 2 1 1 4 3 3 2 3 5 5 2 2 5 3 5 5 2 5 2 1 5 5 5 5
 3 1 2]
counts= {1: 5, 2: 12, 3: 5, 4: 4, 5: 14}
[11, 14, 15, 32, 38]
in equal
[1, 2, 3, 4, 7, 13, 19, 23, 24, 29, 31, 39]
[1, 2, 3, 4, 7, 13, 19, 23, 24, 29, 31]
in equal
[17, 18, 20, 26, 37]
[17, 18, 20, 26]
[17, 18, 20]
in equal
[6, 8, 10, 16]
[6, 8, 10]
in equal
[0, 5, 9, 12, 21, 22, 25, 27, 28, 30, 33, 34, 35, 36]
[5, 9, 12, 21, 22, 25, 27, 28, 30, 33, 34, 35, 36]
[9, 12, 21, 22, 25, 27, 28, 30, 33, 34, 35, 36]
[12, 21, 22, 25, 27, 28, 30, 33, 34, 35, 36]
[21, 22, 25, 27, 28, 30, 33, 34, 35, 36]
in equal

positions =  {1: [11, 14, 15, 32, 38], 2: [1, 2, 3, 4, 7, 13, 19, 23, 24, 29, 31, 39], 3: [17, 18, 20, 26, 37], 4: [6, 8, 10, 16], 5: [0, 5, 9, 12, 21, 22, 25, 27, 28, 30, 33, 34, 35, 36]} 
normal avg=  {1: 22.0, 2: 16.25, 3: 23.6, 4: 10.0, 5: 22.642857142857142} 
pos_deff= {1: 6.75, 2: 3.4545454545454546, 3: 5.0, 4: 3.3333333333333335, 5: 2.769230769230769} 
final =  {1: 22.0, 2: 14.181818181818182, 3: 18

([4, 2, 3, 1, 5], {1: 5, 2: 12, 3: 5, 4: 4, 5: 14})

In [113]:
def ratio_assertion(count,total_ratio,debug=False):
    """Scale each cluster's sentence count by the summary ratio.

    Currently every cluster gets the same ratio; weighting by cluster
    importance is still pending.

    Args:
        count: Mapping of cluster label to its number of sentences.
        total_ratio: Fraction of the sentences to keep overall.
        debug: When truthy, print the exact target size and the size actually
            reached after rounding. Defaults to ``False`` so callers may omit it.

    Returns:
        A dict mapping each cluster label to ``round(count * total_ratio)``.
    """
    if debug:
        # Report on the arguments themselves; the earlier version read a
        # notebook-global ``d`` and a fixed 0.3 here.
        total = sum(count.values())
        print(total, "*ratio  ->  ", total * total_ratio)

    cl_ratios = {label: round(size * total_ratio) for label, size in count.items()}
    if debug:
        print("actual->", sum(cl_ratios.values()))
    return cl_ratios

In [114]:
# Manual check of ratio_assertion: `d` maps a cluster label to a count, and
# the result maps each label to that count scaled by 0.3 and rounded.
d={1:32,2:23,3:7,4:12,5:98}
l=ratio_assertion(d,total_ratio=0.3,debug=True)
print(d,"    ",l)


172 *ratio  ->   51.6
actual-> 52
{1: 32, 2: 23, 3: 7, 4: 12, 5: 98}      {1: 10, 2: 7, 3: 2, 4: 4, 5: 29}


In [None]:
def hybrid_own_summery(file_name,ABSTRACT_SIZE=0.3,debug=False):
    """Cluster-based summary (work in progress).

    Sentences are turned into filtered TF-IDF vectors and clustered with
    k-means. The clusters are ordered with ``find_flow``, and each contributes
    a share of sentences set by ``ratio_assertion``.

    Args:
        file_name: Path of the text file to summarize.
        ABSTRACT_SIZE: Fraction of the sentences to keep in the summary.
        debug: Passed on to ``find_flow`` and ``ratio_assertion``.

    Returns:
        The list built from the per-cluster selections, in cluster flow order.

    TODO: ``top_sentences_clusterWise`` is not defined in this notebook yet,
    so the function still fails at the selection step.
    """
    sentences = readarticle(file_name)
    uniqueWords = get_uniquewords(sentences)
    tf_idf_vectors = modified_tfidf(sentences, uniqueWords)
    K = select_kvalue(tf_idf_vectors)
    # Cluster the vectors just computed, with the K just selected.
    kmeans = KMeans(n_clusters=K).fit(tf_idf_vectors)

    # find_flow needs a list (it calls .index) holding the labels 1..k-1,
    # while labels_ is an array of 0-based labels: shift by one and pass K + 1.
    clusters = [int(label) + 1 for label in kmeans.labels_]
    path_order, cl_count = find_flow(clusters, K + 1, debug=debug)

    cluster_ratios = ratio_assertion(cl_count, ABSTRACT_SIZE, debug)

    summery = []
    for i in path_order:
        # NOTE(review): `i` is the 1-based label used above; subtract 1 if the
        # helper ends up comparing against kmeans.labels_.
        summery.extend(top_sentences_clusterWise(cluster=i, cluster_ratio=cluster_ratios[i]))
    return summery

## 6. Sentence embeddings (ref)

In [1]:
### Dataset: Amazon reviews dataset (optional)
### Pre-trained 50-dimensional embeddings (GloVe)
### K-means clustering, with selection based on the distance to the cluster center.

In [18]:
import numpy as np
import re
import string

Using TensorFlow backend.


In [15]:
def loadEmbeddingMatrix(EMBEDDING_FILE):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line holds a word followed by its coefficients.

    Args:
        EMBEDDING_FILE: Path of the UTF-8 embedding file.

    Returns:
        A dict mapping each word to a float32 NumPy array of its coefficients.
        The number of loaded vectors is also printed.
    """
    embeddings_index = dict()
    # The context manager closes the file even if a line fails to parse.
    with open(EMBEDDING_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank line has no word to index; skip it.
                continue
            # The first field is the word, the rest are its coefficients.
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))

    return embeddings_index

# Load the GloVe word vectors into the module-level `emb_index`, which
# get_sent_embedding reads.
# NOTE(review): this is an absolute path on one machine; the cell fails
# anywhere else. Consider a configurable, relative data directory.
emb_index= loadEmbeddingMatrix('C:/Users/lokanadh/Desktop/Anaconda/text summerization/glove.6B.50d.txt')

Loaded 400000 word vectors.


In [60]:
def get_sent_embedding(wordlist):
    """Return the mean word embedding of a sentence.

    Each word is lower-cased and looked up in the module-level ``emb_index``;
    a word that is not in the index uses the vector stored under
    ``'unknown'``. No other cleaning (punctuation, emojis, ...) is done here.

    Args:
        wordlist: The sentence as a list of word strings.

    Returns:
        A NumPy array holding the element-wise mean of the word vectors. An
        empty ``wordlist`` gives a zero vector of the embedding size instead
        of NaN.
    """
    if not wordlist:
        # The mean of nothing is NaN, which breaks clustering downstream.
        return np.zeros_like(emb_index['unknown'])

    sent_emb = []
    for word in wordlist:
        key = word.lower()
        # Explicit membership test instead of a bare except, so only a missing
        # word falls back to 'unknown'.
        sent_emb.append(emb_index[key] if key in emb_index else emb_index['unknown'])

    # calculating the mean
    return np.array(np.mean(sent_emb, axis=0))

In [66]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def Embeding_summery(file_name):
    """Summarize a text file by clustering sentence embeddings.

    Sentences are embedded with ``get_sent_embedding`` and grouped with
    k-means into ``ceil(sqrt(n_sentences))`` clusters. The sentence closest to
    each cluster center is kept.

    Args:
        file_name: Path of the text file to summarize.

    Returns:
        The kept sentences joined with single spaces, ordered by the mean
        position of each cluster's members in the text.
    """
    word_lists = readarticle(file_name)
    emb_sents = [get_sent_embedding(words) for words in word_lists]
    texts = [" ".join(words) for words in word_lists]

    n_clusters = int(np.ceil(len(emb_sents)**0.5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(emb_sents)

    # Mean sentence position of every cluster; used only to order the summary.
    mean_position = [
        np.mean(np.where(kmeans.labels_ == label)[0])
        for label in range(n_clusters)
    ]
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,emb_sents)
    ordering = sorted(range(n_clusters), key=lambda label: mean_position[label])
    return ' '.join(texts[closest[label]] for label in ordering)

Embeding_summery("text.txt")

'In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills " The program aims to build up the cognitive skills and in-depth understanding of developing intelligent cloud connected solutions for applications across industry This program also included developer-focused AI school that provided a bunch of assets to help build AI skills '

## 7. Model

## TEST AREA