In [24]:
# Import dependencies
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import pandas as pd
import networkx as nx
import random

In [87]:
# Read text file article
def get_sentences(filetext):
    sentences = []
    
    for sentence in filetext:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split())
        
    sentences.pop()
    
    return sentences

def get_sentence_from_file(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    filedata_combined = "".join([paragraph for paragraph in filedata])
    filetext = filedata_combined.split(". ")
    sentences = get_sentences(filetext)
    return sentences

In [49]:
def get_sentence_from_dataset():
    df = pd.read_csv('./bbc-news-data.csv')
    df = df.dropna(how='any',axis=0) 
    random_no = random.randint(0, len(df) - 1)

    # Get random article text from Data Set
    article = df['content'][random_no].split(". ")

    sentences = get_sentences(article)
    return sentences

In [34]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [word.lower() for word in sent1]
    sent2 = [word.lower() for word in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [21]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    
    return similarity_matrix

In [85]:
def generate_summary(file_name='', top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text and tokenize
    # If using text from a dataset, use method below and customize it to fit your dataset
    # sentences =  get_sentence_from_dataset()
    
#     If using text from a .txt file, use method below
    sentences = get_sentence_from_file(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Output the summarize text
    print("Summarize Text: \n", ". ".join(summarize_text))

In [90]:
generate_summary("002.txt", 5)

Indexes of top ranked_sentence order are  [(0.14039469398100238, ['Dollar', 'gains', 'on', 'Greenspan', 'speech', 'The', 'dollar', 'has', 'hit', 'its', 'highest', 'level', 'against', 'the', 'euro', 'in', 'almost', 'three', 'months', 'after', 'the', 'Federal', 'Reserve', 'head', 'said', 'the', 'US', 'trade', 'deficit', 'is', 'set', 'to', 'stabilise.', 'And', 'Alan', 'Greenspan', 'highlighted', 'the', 'US', "government's", 'willingness', 'to', 'curb', 'spending', 'and', 'rising', 'household', 'savings', 'as', 'factors', 'which', 'may', 'help', 'to', 'reduce', 'it']), (0.13029461996080488, ["China's", 'currency', 'remains', 'pegged', 'to', 'the', 'dollar', 'and', 'the', 'US', "currency's", 'sharp', 'falls', 'in', 'recent', 'months', 'have', 'therefore', 'made', 'Chinese', 'export', 'prices', 'highly', 'competitive']), (0.10564213008326902, ['Market', 'concerns', 'about', 'the', 'deficit', 'has', 'hit', 'the', 'greenback', 'in', 'recent', 'months']), (0.09127819190589045, ['On', 'Friday,',