In [26]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
import math

In [1]:
with open('../dataset/text.txt') as f:
    text=f.read()

In [52]:
print (text[:300]) #document that we want to summarize
sentences=text.split('\n') # you can also use nltk sentence tokenizer 

Millions of gallons of crude oil that
spilled when a tanker ran aground spread across a wildlife-rich
stretch of ocean Saturday, and Alaska's chief environmental officer
criticized cleanup efforts as too slow.
   The biggest oil spill in U.S. history created a slick about
seven miles long and seven 


In [14]:
stopwords_list=list(stopwords.words()) #list of stopwords

In [21]:
def sentence_words(sent):
    return [ix for ix in word_tokenize(sent) if ix not in stopwords_list]

In [28]:
def edge_weight(words1,words2):
    rank = 0
    for w1 in words1:
        for w2 in words2:
            rank += int(w1 == w2)
    if rank == 0:
        return 0.0

    assert len(words1) > 0 and len(words2) > 0
    norm = math.log(len(words1)) + math.log(len(words2))
    if np.isclose(norm, 0.):
        # This should only happen when words1 and words2 only have a single word.
        # Thus, rank can only be 0 or 1.
        assert rank in (0, 1)
        return rank * 1.0
    else:
        return rank / norm

In [38]:
# Sentence similarity
s1=sentence_words(sentences[0])
s2=sentence_words(sentences[1])
print (s1,'||',s2) #when no words are similar 
print ('Sentence Similarity score :',edge_weight(s1,s2)) 

['Millions', 'gallons', 'crude', 'oil'] || ['spilled', 'tanker', 'ran', 'aground', 'spread', 'across', 'wildlife-rich']
Sentence Similarity score : 0.0


In [59]:
def create_matrix( document,damping=0.85):
    sentences_as_words = [sentence_words(sent) for sent in document]
    sentences_count = len(sentences_as_words)
    weights = np.zeros((sentences_count, sentences_count))

    for i, words_i in enumerate(sentences_as_words):
        for j, words_j in enumerate(sentences_as_words):
            weights[i, j] = edge_weight(words_i, words_j)
    weights /= weights.sum(axis=1)[:, np.newaxis]
    return np.full((sentences_count, sentences_count), (1.-damping) / sentences_count) \
            + damping * weights

In [60]:
C=create_matrix(sentences)

In [66]:
def power_method(matrix, epsilon=1e-4):
        transposed_matrix = matrix.T
        sentences_count = len(matrix)
        p_vector = np.array([1.0 / sentences_count] * sentences_count)
        lambda_val = 1.0

        while lambda_val > epsilon:
            next_p = np.dot(transposed_matrix, p_vector)
            lambda_val = np.linalg.norm(np.subtract(next_p, p_vector))
            p_vector = next_p

        return p_vector

In [67]:
C=create_matrix(sentences) # similar to probability matrix in page rank
ranking=power_method(C) #page rank 

In [71]:
#let's print the highest rated sentence
sentences[np.argmax(ranking)]

'Bay, Texas. However, that oil burned as well as spilled.'

In [82]:
def k_sentences(ranking,sents,k=5):
    idxs=[]
    sorted_rankings=sorted(enumerate(ranking),key=lambda x:x[1],reverse=True)
    for i in range(k):
        idxs.append(sorted_rankings[i][0])
    idxs=sorted(idxs)
    for ix in idxs:
        print (sentences[ix])

In [84]:
k_sentences(ranking,sentences,k=10)

   ``It is an enclosed body of water,'' he said. ``The only way for
shoreline will be contaminated,'' he said.
vessel's hull large enough to swim through, said Frank Iarossi,
to be done in a controlled manner,'' Wieliczkiewicz said. ``If it's
birds and pink salmon in that area,'' said Steve Goldstein, a
Sound. ``It's kind of like sailing through a zoo,'' said Jim
the world to clean up,'' said Riki Ott, chairman of the
their catches, she said.
   Previously, the largest U.S. tanker spill was the Dec. 15, 1976,
Bay, Texas. However, that oil burned as well as spilled.
