In [None]:
# The Python script version of this notebook is nicer in that it also uses multiprocessing
# for the data-loading step.

In [3]:
# Imports
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import regex
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import multiprocessing
import warnings

In [9]:
# Main idea behind generating stories and simplified stories is that you can do preprocessing on
# simplified_stories to help you choose sentences, but still include the original sentences in the
# summary (if you want).

# Read in data from target, breaking each story into paragraphs (and then sentences)
def load_data_as_paragraphs(file, stem=True, remove_stop_words=True, 
                            remove_punctuation=True, metaparagraph_size=5):
    simplified_stories = []
    stories = []
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
        
    # Load stories from file
    with open(file) as f:
        stories_raw = f.readlines()
        
    for story in stories_raw:
        # Split into a list of paragraphs
        paragraphs = story.split("<newline>")
        simplified_paragraphs = []
        untokenized_paragraphs = []
        par_index = 0
        
        # Loop through paragraphs
        while par_index < len(paragraphs):
            meta_paragraph = []
            
            # Combine small paragraphs into meta_paragraphs with at least some minimum number of sentences
            while par_index < len(paragraphs) and len(meta_paragraph) < metaparagraph_size:
                paragraph = paragraphs[par_index]
                
                # Split paragraph into a list of sentences
                sentences = nltk.sent_tokenize(paragraph)
                meta_paragraph += sentences
                par_index += 1
            

            
            meta_paragraph_unprocessed = meta_paragraph
            
            if remove_stop_words:
                meta_paragraph = [sentence.replace("<num>"," ") for sentence in meta_paragraph]
            
            # For the tokenized version, split each sentence into a list of words
            paragraph_tokenized = [nltk.word_tokenize(sentence) for sentence in meta_paragraph]
            # Extra preprocessing
            if remove_stop_words:
                paragraph_tokenized = [[word for word in sentence if word not in stop_words] for sentence in paragraph_tokenized]
            if remove_punctuation:
                paragraph_tokenized = [[regex.sub('[\p{P}\p{Sm}`]+', '', word) for word in sentence] for sentence in paragraph_tokenized]
                paragraph_tokenized = [[word for word in sentence if word != ""] for sentence in paragraph_tokenized]
            if stem:
                paragraph_tokenized = [[stemmer.stem(word) for word in sentence] for sentence in paragraph_tokenized]

            if len(meta_paragraph) < metaparagraph_size and len(untokenized_paragraphs) > 0:
                untokenized_paragraphs[-1] += meta_paragraph_unprocessed
                simplified_paragraphs[-1] += paragraph_tokenized
            else:
                if len(meta_paragraph) != 0:
                    untokenized_paragraphs.append(meta_paragraph_unprocessed)
                    simplified_paragraphs.append(paragraph_tokenized)
                
        stories.append(untokenized_paragraphs)
        simplified_stories.append(simplified_paragraphs)
    return stories, simplified_stories

In [14]:
# SumBasic algorithm

# Pick the best scoring sentence that optionally contains the highest probability word.
def get_best_sentence(data, document_scores, document_index, vocab, inverse_vocab, sentence_vectorizer):    
    # Create a bag-of-words-style sentence vector
    vector_sentences = sentence_vectorizer.transform(data)
    
    # Dot the sentence vector with the document tf_idf vector
    curr_doc_scores = document_scores[document_index].transpose()
    scores = vector_sentences * curr_doc_scores
    
    # Divide each sentence's score by its length. Zero length sentences will cause a warning of divide by zero
    # to occur
    lengths = 1.0 / vector_sentences.sum(axis=1)
    scores = scores.multiply(lengths)

    if scores.count_nonzero() == 0:
        return 0
        
    # Return the index of the best-scoring sentence
    best = scores.argmax(axis=0)     
    return best[0,0]

def construct_text_collection(simplified_stories, by_paragraph=False):
    # If get by paragraph, each element refers to 1 paragraph
    if by_paragraph:
        texts = [[word for sentence in paragraph for word in sentence] for story in simplified_stories for paragraph in story]
    # Otherwise each element is 1 story
    else:
        texts = [[word for paragraph in story for sentence in paragraph for word in sentence] for story in simplified_stories]
    
    return texts

def compute_all_probs(texts):
    tfidf = TfidfVectorizer(analyzer='word', tokenizer=lambda x: x,
                            preprocessor=lambda x: x,
                            norm='l1', use_idf=False, token_pattern=None)
    scores = tfidf.fit_transform(texts)
    return tfidf, scores
    
def compute_all_tfidfs(texts):
    probs = TfidfVectorizer(analyzer='word', tokenizer=lambda x: x,
                            preprocessor=lambda x: x, 
                            token_pattern=None, norm=None)
    scores = probs.fit_transform(texts)
    return probs, scores
    
def compute_all_scores(texts, tfidf=True):
    if tfidf:
        return compute_all_tfidfs(texts)
    else:
        return compute_all_probs(texts)   

In [10]:
stem = True
remove_stop_words = True
remove_punctuation = True
metaparagraph_size = 5

# stories is a triply nested lists (first broken by story, then by paragraph, then by sentences)
# simplified_stories is a quadruply nested list (broken by story, paragraph, sentence, word)
stories, simplified_stories = load_data_as_paragraphs("../datasets/writing_prompts/train.wp_target", stem, remove_stop_words, 
                                                      remove_punctuation, metaparagraph_size)

# Get the starting story index (i.e. starting paragraph index) for each story
lengths = [len(story) for story in stories]
story_indices = np.cumsum([0] + lengths[:-1])

In [11]:
chunksize = 10
tfidf = True
by_paragraph = True

texts = construct_text_collection(simplified_stories, by_paragraph=by_paragraph)
vectorizer, scores = compute_all_scores(texts, tfidf=tfidf)
feature_names = vectorizer.get_feature_names()
sentence_vectorizer = CountVectorizer(analyzer='word', tokenizer=lambda x: x, preprocessor=lambda x:x, 
                                      vocabulary=feature_names, token_pattern=None)
    
def summarize_story(inputs):
    story, simplified_story, story_index = inputs
    summary = []

    # Loop through paragraphs, adding one sentence per paragraph to the summary.
    for paragraph_index, (paragraph, simplified_paragraph) in enumerate(zip(story, simplified_story)):
        # indexing is done in a bit of a stupid way because csr matrices don't support indexing like
        # A[x][y] and instead require A[x,y].
        document_index = paragraph_index + story_index if by_paragraph else story_index
        
        # Choose sentence with best score
        next_sentence_index = get_best_sentence(simplified_paragraph, scores, document_index, vectorizer.vocabulary_, feature_names, sentence_vectorizer)

        # Add it to summary
        summary.append(paragraph[next_sentence_index])
    # Join sentences into a summary
    summary_string = " <newline> ".join(summary)
    return summary_string

def generate_summaries():
    # Don't print warnings about dividing by zero.
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    p = multiprocessing.Pool(multiprocessing.cpu_count())
    inputs = zip(stories, simplified_stories, story_indices)
    with open('summaries_train_first_half.txt', 'w') as f:
        for summary in p.imap(summarize_story, inputs, chunksize=chunksize):
            f.write(summary + "\n")

In [15]:
%%time
# Run the whole summarisation pass (writes the summaries file) and time the cell.
generate_summaries()

   <num>  world wars have passed , and now they feel like a simple sickeness that would pass by every so often , i could no longer evaluate the individual human as a being of its own , the importance of mortals is merely the same as the importance of my skin cells ; they are a part of a mechanism so much more advanced , a mechanism that is so dear to my fallen heart a mechanism that i have seen fall and rise so many times , a mechanism that when lost all of which it had , had me loosing my will to live , for the first time in all of my thousands years of existence . <newline>  i ca n't feel my legs , i have walked for days , just to hear the sound of gravel , crushed bones , crushed buildings and crushed civilizations under my steps to keep my sanity.. until i remembered , the day in my far past . <newline>   " i 'm ready to obey " i answered .
almost . <newline>   " evening , dale , " i say . <newline> and stuff . " <newline>   " whatever you say , wittell , " he says , not unkindly .