In [23]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from os import listdir
import string

In [8]:
def load_doc(filename):
    file = open(filename, encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [9]:
def split_story(doc):
    index = doc.find('@highlight')
    story, highlights = doc[:index], doc[index:].split('@highlight')
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story,highlights

In [13]:
def load_stories(directory):
    all_stories = []
    for name in listdir(directory):
        filename = directory + '/' + name
        doc = load_doc(filename)
        story,highlight = split_story(doc)
        all_stories.append({'story':story, 'highlights':highlight})
    return all_stories

In [21]:
def clean_lines(lines):
    cleaned = []
    # Prepare a tranlation table to remove punctuations
    table = str.maketrans('', '', string.punctuation)
    for line in lines:
        index = line.find('(CNN) -- ')
        # Strip source CNN office if it exists
        if index > -1:
            line = line[index+len('(CNN)'):]
        # Tokenize on white space
        line = line.split()
        # Convert to lower case
        line = [word.lower() for word in line]
        # Remove punctuation from each token
        line = [w.translate(table) for w in line]
        # Remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # Store as a string
        cleaned.append(' '.join(line))
    # Remove empty strings
    cleaned = [c for c in cleaned if len(c) > 0]
    return cleaned

In [14]:
# Load stories
directory = '/home/sunishka/Downloads/NLP/cnn_stories/cnn/stories'
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [42]:
print(stories[9]['story'])
print("------------------------------------------------")
print(stories[9]['highlights'])

['we know the devastating force of class hurricanes which have sustained winds exceeding miles per hour or meters per second like hurricane katrina', 'now imagine winds that are times faster stripping a galaxy of its future light and heat devastating doesnt begin to describe it', 'data from the new atacama large millimeter array a growing array of radio telescopes in the high desert of chile have mapped a superwind flowing out of a nearby galaxy', 'this galaxy named ngc because it is the object in the new general catalog of galaxies is a bit like our own milky way galaxy in that it has a large disk of cold gas atoms and molecules of matter out of which stars are constantly forming', 'but ngc is a galaxy on steroids it is forming stars at about times the rate of the milky way thats why its called a starburst galaxy this made it a great target to observe with alma which can see light from the gas from which those stars form', 'this light is not visible with the human eye its electromagne

In [22]:
# clean stories
for example in stories:
    example['story'] = clean_lines(example['story'].split('\n'))
    example['highlights'] = clean_lines(example['highlights'])


In [32]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # Build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # Build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [37]:
def build_similarity_matrix(sentences,stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            # Ignore if both sentences are same
            if idx1 == idx2:
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix

In [40]:
def generate_summary(sentences, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Sort the rank and pick top n sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
        summarize_text.append("".join(ranked_sentence[i][1]))

    # Offcourse, output the summarize text
    print("Summarize Text: \n", ". ".join(summarize_text))

In [41]:
# Printing an example story
sentences = stories[9]['story']
generate_summary(sentences)

Indexes of top ranked_sentence order are  [(0.044376276617326446, 'alberto bolatto of the university of maryland who led the alma study of ngc explained for the first time we can clearly see massive concentrations of cold molecular gas being jettisoned by expanding shells of intense pressure created by young stars the amount of gas we measure gives us very convincing evidence that some growing galaxies blow out more gas than they take in slowing star formation down to a crawl'), (0.044359396836348486, 'lets delve more into this mystery by looking at galaxies grow over the billion years since the big bang origin of the universe familiar building blocks of matter electrons protons and neutrons cooled and combined to form atoms then as this gas cooled further some of the atoms combined to form molecules meanwhile gravity amplified regions of high density so stars and whole galaxies formed out of the cold dense gas'), (0.04418493637455351, 'people often debate the value of astronomy it won