# Loading Data

In [1]:
import os
import re
import pandas as pd
import nltk
from nltk import word_tokenize
stopwords = open("stopwords.txt").read().splitlines()
def loadData(loc):
        BASEDIR=os.path.dirname(os.path.realpath('__file__'))
        df=pd.read_csv(BASEDIR+loc,names=["headline","news"])
        return df


In [2]:
import numpy as np

In [3]:
df=loadData("/dataset/onlineKhabar.csv")
REPLACE_BY_SPACE_RE = re.compile('[?!/(){}\[\]\|@,;\'\']')

# Text Pre-processing

In [4]:
def removeCharacs(text):
    text=re.sub(REPLACE_BY_SPACE_RE,' ',text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    return text

In [5]:
flatten = lambda x : ''.join(x.split('\','))

In [6]:
df['news'] = df['news'].map(flatten).map(removeCharacs)


In [7]:
def stem_text_cat2(text):
    return re.sub('(मा|को|ले|बाट|का|हरु|हरुसँग|सँग|लाई|हरू|हरूसँग|हरू|पटक)','',text)

In [8]:
allwords = open("dict.txt").read().splitlines()

In [9]:
suffixes = ('ी' , 'ीया' ,  'ीय' ,'े' , 'नु' , 'दै' , 'ेको' , 'ै','कै')
def stem_text_cat1(word):
    if word.endswith(suffixes):
        wordlist =  [word.rstrip(x) for x in suffixes]
        for i in wordlist:
            if i!= word:
                if i in allwords:
                    return i
                    
        
    return word
    

In [10]:
def stemming(text):
    text =  ' '.join([stem_text_cat1(a) for a in word_tokenize(text)])
    return ' '.join([stem_text_cat2(a) for a in word_tokenize(text)])
    

In [11]:
df['news'] = df['news'].map(stemming)
df['fk'] = df['news']

In [12]:
#Tokenize sentence
import nltk

In [13]:
def sent_tokenize(text):
    sentence = [x for x in text.split('।')] 
    return sentence

In [14]:
df['news']=df['news'].map(sent_tokenize)

In [15]:
df['original'] = df['news']

In [16]:
def check(x):
    try:
        if isinstance(float(x), float):
            return False
    except:
        return True


In [17]:
# function to remove stopwords
def remove_stopword(sen):
    sen_new = " ".join([i for i in sen if i not in stopwords and check(i)])
    return sen_new

In [18]:
remove_stopwords = lambda sent : [remove_stopword(r.split()) for r in sent]


In [19]:
df['news']=df['news'].map(remove_stopwords)


# TextRank Algorithm

In [20]:
# TF-IDF approaches to create features for our sentences

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def tfidf(corpus):
    vectorizer = TfidfVectorizer(max_df = 5 , min_df = 1)
    X = vectorizer.fit_transform(corpus)
    stopwords = vectorizer.get_stop_words()
    return X
    

In [23]:
df['news_vector']=df['news'].map(tfidf)


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
def similarity(sentences):
    sim_mat = np.zeros([sentences.shape[0],sentences.shape[0]])
    for i in range(sentences.shape[0]):
          for j in range(sentences.shape[0]):
                if i != j:
                      sim_mat[i][j] = cosine_similarity(sentences[i], sentences[j])[0,0]
                        
    return sim_mat

In [26]:
df['similarity']=df['news_vector'].map(similarity)
#df['bm-similarity'] = df['news'].map(bm_similarity)

In [27]:
import networkx as nx


In [28]:
ranks=[]
for i in range(len(df)):
    nx_graph = nx.from_numpy_array(df['similarity'][i])
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[k],s) for k,s in enumerate(df['original'][i])), reverse=True)
    ranks.append((ranked_sentences[0:3]))
    

In [29]:
summaries = pd.DataFrame(ranks,columns = ['Summary1','Summary2','Summary3'])

# RAKE algorithm for keyword extraction

In [30]:
# function to remove stopwords
def remove_stopword(sen):
    sen_new = " ".join([i for i in sen if i not in stopwords and check(i)])
    return sen_new
remove_stopwords = lambda sent : [remove_stopword(r.split()) for r in sent]
def remove_stopwords_w_num(sen):
    sen_new = " ".join([i for i in sen if i not in stopwords])
    return sen_new
remove_stopwords_w_num = lambda sent : [remove_stopword(r.split()) for r in sent]

In [31]:
df['news-ws'] = df['news'].map(remove_stopwords)


In [32]:
def checkMe(x):
    try:
        if isinstance(float(x), float):
            return True
    except:
        return False
def checkNum(x):
    try:
        if isinstance(float(x), float):
            return 
    except:
        return x

def findCandidates(text):
    clusters=[]
    word1=[]
    for word in word_tokenize(text):
        if (word in stopwords) or checkMe(word):
            clusters.append(word1)
            word1 =[]
            continue
        else:
            word1.append(word)
    return [' '.join(a).replace('।','') for a in clusters if len(a)>1]
df['kW-candidates'] = df['fk'].map(findCandidates)

In [33]:
from collections import OrderedDict
def find_scores(text):
    document = [checkNum((word_tokenize(x))) for x in text]
    vocab = set([x for x in ' '.join(text).split(' ') ])
    vocab = list(vocab)
    names = vocab
    occurrences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences:
    for l in document:
        for i in range(len(l)):
            for item in l[:i] + l[i + 1:]:
                occurrences[l[i]][item] += 1
                
                
    rows = ['deg','freq','deg/freq']
    word_scores = OrderedDict((name, OrderedDict((r, 0) for r in rows)) for name in names)
    
    
    for key in word_scores.keys():
        word_scores[key]['deg'] = sum(occurrences[key].values())
        word_scores[key]['freq'] = len([x for x in occurrences[key].values() if x!=0])
        if word_scores[key]['freq']!=0:
            word_scores[key]['deg/freq'] = word_scores[key]['deg']/word_scores[key]['freq']
        else:
            word_scores[key]['deg/freq']= 0
            
            
    scoredCandidates = []
    for sent in range(len(text)):
        sumis=0
        for word in word_tokenize(text[sent]):
            sumis += word_scores[word]['deg/freq']
        scoredCandidates.append((text[sent],sumis/len(word_tokenize(text[sent]))))
        
    return sorted(list(set(scoredCandidates)), key = lambda x : x[1],reverse=True)


In [34]:
def update(text):
    keyword = []
    count = 0
    for i in text:
        if count ==0:
            keyword.append(i[0])
        
        else:
            maxlen = max(len(i[0]),len(keyword[-1]))
            if (nltk.edit_distance(i[0],keyword[-1])/maxlen)<=0.6:
                pass
            else:
            
                keyword.append(i[0])
            
        count+=1
        
    return keyword
        

In [35]:
df['scored'] = df['kW-candidates'].map(find_scores)

In [36]:
df['n-scored'] = df['scored'].map(update)
keywords = lambda text : ' '.join(set([x for a in text[:5] for x in a.split(' ')]))
df['keywords'] = df['n-scored'].map(keywords)

# Similarity Between Top Sentences and Keywords

In [37]:
from difflib import SequenceMatcher
summary_sentence = [] 
for i in range(len(df)):
    ratio=[]
    text1 = df['keywords'][i]
    for j in range(3):
        text = summaries['Summary'+str(j+1)][i][1]
        ratio.append(SequenceMatcher(None, text, text1).ratio())
        
    index = ratio.index(max(ratio))
    
    summary_sentence.append(summaries['Summary'+str(index+1)][i])
    
    
    #print(index,summary_sentence)

In [38]:
summary_sentence = [] 
for i in range(len(df)):
    ratio=[]
    text1 = df['keywords'][i]
    for j in range(3):
        text = summaries['Summary'+str(j+1)][i][1]
        ratio.append(SequenceMatcher(None, text, text1).ratio())
        
    index = ratio.index(max(ratio))
    
    summary_sentence.append(summaries['Summary'+str(index+1)][i])
    
    
    #print(index,summary_sentence)

In [39]:
print(summary_sentence[1]) 

(0.10294528052017053, ' आगामी सेप्टेम्बरसम्म नयाँ अपरेटिङ सिस्टम आइओएस १३ सब एप्पल प्रयोगकर्ता लागि उपलब्ध गराउन योजना एप्पल छ ')


# Find Accuracy with ROGUE Measure

In [40]:
def find_recall(x,y):
    '''
    x: generated summary
    y: reference summary
    '''
    similarity = set(word_tokenize(x)).intersection(set(word_tokenize(y)))
    return len(similarity)/len(set(word_tokenize(y)))

In [41]:
def find_precision(x,y):
    '''
    x: generated summary
    y: reference summary
    '''
    similarity = set(word_tokenize(x)).intersection(set(word_tokenize(y)))
    return len(similarity)/len(set(word_tokenize(x)))

In [42]:
def find_rogue_accuracy( x , y ):
    print("ROGUE Precision is, ",find_precision(x,y))
    print("ROGUE Recall is, ",find_recall(x,y))
    

In [43]:
def stem_r_stopwords(text):
    
    text = stemming(text)
    return ' '.join([x for x in word_tokenize(text) if x not in stopwords])

In [44]:
def find_acc(i):
    find_rogue_accuracy(stem_r_stopwords(summary_sentence[i][1]),stem_r_stopwords(df['headline'][i]))
    print(summary_sentence[i][1],'***',df['headline'][i])

In [52]:
find_acc(16)

IndexError: list index out of range