In [1]:
# Imports
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import regex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import numpy as np

In [10]:
# The main idea behind generating both `stories` and `simplified_stories` is that you can run
# preprocessing on `simplified_stories` to help choose sentences, while still including the
# original (unprocessed) sentences from `stories` in the summary, if you want.

# Read in data from target, breaking each story into paragraphs (and then sentences)
def load_data_as_paragraphs(file, stem=True, remove_stop_words=True, 
                            remove_punctuation=True, metaparagraph_size=5,
                            max_stories=2000):
    """Load stories from `file` and group their sentences into meta-paragraphs.

    Each line of `file` is treated as one story whose paragraphs are separated
    by the literal marker "<newline>". Consecutive paragraphs are merged into
    "meta-paragraphs" until each holds at least `metaparagraph_size` sentences;
    a short trailing remainder is folded into the previous meta-paragraph.

    Parameters
    ----------
    file : path of the text file to read (one story per line).
    stem : if true, Porter-stem every token of the simplified output.
    remove_stop_words : if true, drop NLTK English stop words from the
        simplified output. NOTE: this flag also controls whether the "<num>"
        placeholder is blanked out before tokenization; that coupling is kept
        as-is from the original implementation.
    remove_punctuation : if true, strip punctuation / math symbols / backticks
        from tokens and drop tokens that become empty.
    metaparagraph_size : minimum number of sentences per meta-paragraph.
    max_stories : maximum number of stories (lines) to keep; `None` keeps all.
        Defaults to 2000, the previously hard-coded limit.

    Returns
    -------
    (stories, simplified_stories)
        stories: story -> meta-paragraph -> original sentence string.
        simplified_stories: story -> meta-paragraph -> sentence -> token,
        aligned index-for-index with `stories`.
    """
    simplified_stories = []
    stories = []
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    # Raw string: "\p" is not a valid Python string escape, so the non-raw form
    # triggers an invalid-escape warning. Compiled once instead of per word.
    punctuation_pattern = regex.compile(r'[\p{P}\p{Sm}`]+')

    # Load stories from file
    with open(file) as f:
        stories_raw = f.readlines()[0:max_stories]

    for story in stories_raw:
        # Split into a list of paragraphs
        paragraphs = story.split("<newline>")
        simplified_paragraphs = []
        untokenized_paragraphs = []
        par_index = 0

        # Loop through paragraphs
        while par_index < len(paragraphs):
            meta_paragraph = []

            # Combine small paragraphs into meta_paragraphs with at least some minimum number of sentences
            while par_index < len(paragraphs) and len(meta_paragraph) < metaparagraph_size:
                # Split paragraph into a list of sentences
                meta_paragraph += nltk.sent_tokenize(paragraphs[par_index])
                par_index += 1

            # Keep the original sentences; the list below is rebound (not
            # mutated) when "<num>" is stripped, so this reference stays intact.
            meta_paragraph_unprocessed = meta_paragraph

            if remove_stop_words:
                meta_paragraph = [sentence.replace("<num>", " ") for sentence in meta_paragraph]

            # For the tokenized version, split each sentence into a list of words
            paragraph_tokenized = [nltk.word_tokenize(sentence) for sentence in meta_paragraph]
            # Extra preprocessing
            if remove_stop_words:
                paragraph_tokenized = [[word for word in sentence if word not in stop_words]
                                       for sentence in paragraph_tokenized]
            if remove_punctuation:
                paragraph_tokenized = [[punctuation_pattern.sub('', word) for word in sentence]
                                       for sentence in paragraph_tokenized]
                paragraph_tokenized = [[word for word in sentence if word != ""]
                                       for sentence in paragraph_tokenized]
            if stem:
                paragraph_tokenized = [[stemmer.stem(word) for word in sentence]
                                       for sentence in paragraph_tokenized]

            # A too-short leftover is merged into the previous meta-paragraph
            # (when there is one) rather than standing alone.
            if len(meta_paragraph) < metaparagraph_size and len(untokenized_paragraphs) > 0:
                untokenized_paragraphs[-1] += meta_paragraph_unprocessed
                simplified_paragraphs[-1] += paragraph_tokenized
            else:
                untokenized_paragraphs.append(meta_paragraph_unprocessed)
                simplified_paragraphs.append(paragraph_tokenized)

        stories.append(untokenized_paragraphs)
        simplified_stories.append(simplified_paragraphs)
    return stories, simplified_stories

In [11]:
# SumBasic algorithm

# Pick the best scoring sentence that optionally contains the highest probability word.
def get_best_sentence(data, document_scores, document_index, vocab, inverse_vocab, sentence_vectorizer, include_best=True):
    """Return the index of the sentence in `data` with the highest mean word score.

    Parameters
    ----------
    data : list of sentences, each a list of tokens.
    document_scores : sparse matrix of per-document word scores; row
        `document_index` is used.
    document_index : row of `document_scores` to score against.
    vocab, inverse_vocab : unused here; kept for signature parity with
        `get_best_sentence2`.
    sentence_vectorizer : object whose `transform(list_of_strings)` returns a
        sparse sentence-by-vocabulary count matrix.
    include_best : currently ignored -- the masking that would restrict the
        choice to sentences containing the top-scoring word is commented out
        below.

    Returns
    -------
    int : index of the best-scoring sentence, or 0 when every score is zero.
    """
    # Concatenate tokens into strings
    strings = [" ".join(sentence) for sentence in data]

    # Create a bag-of-words-style sentence vector
    vector_sentences = sentence_vectorizer.transform(strings)

    # Dot the sentence vector with the document tf_idf vector
    curr_doc_scores = document_scores[document_index].transpose()
    scores = vector_sentences * curr_doc_scores

    # Divide each sentence's score by its length. Sentences with no counted
    # tokens have a zero score already, so their divisor is set to 1 to avoid
    # a divide-by-zero warning and infinite weights.
    token_counts = np.asarray(vector_sentences.sum(axis=1), dtype=float)
    token_counts[token_counts == 0] = 1.0
    scores = scores.multiply(1.0 / token_counts)

    # If we have to include the best word, mask invalid sentences
    # If we aren't including best, might as well leave this commented out.
#     if include_best:
#         highest_score_word = inverse_vocab[document_scores[document_index].argmax()]
#         highest_score_mask = [1 if highest_score_word in sentence else 0 for sentence in data]
#         highest_sparse = csr_matrix(highest_score_mask).transpose()
#         scores = scores.multiply(highest_sparse)

    if scores.count_nonzero() == 0:
        return 0

    # Return the index of the best-scoring sentence (as a plain int, matching
    # the all-zero branch above)
    best = scores.argmax(axis=0)
    return int(best[0, 0])

def get_best_sentence2(data, document_scores, document_index, vocab, inverse_vocab, whatever, include_best=True):
    """Return the index of the sentence in `data` with the highest mean word score.

    Parameters
    ----------
    data : list of sentences, each a list of tokens.
    document_scores : 2-D score container indexable as `[row]` and `[row, col]`.
    document_index : row of `document_scores` to score against.
    vocab : mapping from token to column index in `document_scores`.
    inverse_vocab : sequence mapping column index back to token; only
        consulted when `include_best` is true.
    whatever : unused; keeps the signature interchangeable with
        `get_best_sentence`.
    include_best : if true, only sentences containing the document's
        top-scoring word are eligible.

    Returns
    -------
    Index of the best eligible sentence (first one wins ties); 0 if no
    sentence is eligible. Empty sentences score 0.
    """
    # The top-scoring word is only needed for the include_best filter, so the
    # argmax over the document row is skipped otherwise.
    highest_score_word = None
    if include_best:
        highest_score_word = inverse_vocab[document_scores[document_index].argmax()]

    best_sentence_index = 0
    best_score = -1

    for index, sentence in enumerate(data):
        if include_best and highest_score_word not in sentence:
            continue

        if len(sentence) == 0:
            score = 0
        else:
            score = sum(document_scores[document_index, vocab[word]] for word in sentence) / len(sentence)

        if score > best_score:
            best_score = score
            best_sentence_index = index

    return best_sentence_index

# Square the score of each word in the chosen sentence. Not currently used, but could be in the future.
def update_probs(document_scores, vocab, sentence):
    """Square, in place, the score of every distinct word in `sentence`.

    `vocab` maps each word to its position in `document_scores`. The same
    (mutated) `document_scores` object is returned.
    """
    distinct_words = set(sentence)
    for token in distinct_words:
        position = vocab[token]
        document_scores[position] **= 2
    return document_scores

def construct_text_collection(simplified_stories, by_paragraph=False):
    """Flatten tokenized stories into a list of token lists ("documents").

    `simplified_stories` is nested as story -> paragraph -> sentence -> word.
    With `by_paragraph` true, each paragraph becomes one document; otherwise
    each story becomes one document.
    """
    texts = []
    for story in simplified_stories:
        if by_paragraph:
            # One document per paragraph
            for paragraph in story:
                texts.append([word for sentence in paragraph for word in sentence])
        else:
            # One document per story
            story_tokens = []
            for paragraph in story:
                for sentence in paragraph:
                    story_tokens.extend(sentence)
            texts.append(story_tokens)
    return texts

def compute_all_probs(texts):
    """Fit an l1-normalised, IDF-free TfidfVectorizer on `texts`.

    `texts` is passed through untouched by the tokenizer and preprocessor, so
    each document is expected to already be a sequence of tokens. Returns the
    fitted vectorizer together with the document-by-term score matrix.
    """
    def _passthrough(tokens):
        # Documents are used exactly as given.
        return tokens

    options = dict(
        analyzer='word',
        tokenizer=_passthrough,
        preprocessor=_passthrough,
        norm='l1',
        use_idf=False,
        token_pattern=r"(?u)\b[^\s]+\b",
    )
    prob_vectorizer = TfidfVectorizer(**options)
    prob_matrix = prob_vectorizer.fit_transform(texts)
    return prob_vectorizer, prob_matrix
    

def compute_all_tfidfs(texts):
    """Fit a TfidfVectorizer (library-default weighting and norm) on `texts`.

    `texts` is passed through untouched by the tokenizer and preprocessor, so
    each document is expected to already be a sequence of tokens. Returns the
    fitted vectorizer together with the document-by-term score matrix.
    """
    def _passthrough(tokens):
        # Documents are used exactly as given.
        return tokens

    options = dict(
        analyzer='word',
        tokenizer=_passthrough,
        preprocessor=_passthrough,
        token_pattern=r"(?u)\b[^\s]+\b",
    )
    tfidf_vectorizer = TfidfVectorizer(**options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    return tfidf_vectorizer, tfidf_matrix
    
def compute_all_scores(texts, tfidf=True):
    """Score `texts` with `compute_all_tfidfs` when `tfidf` is truthy, else `compute_all_probs`.

    Returns whatever the selected function returns: (fitted vectorizer, score matrix).
    """
    scorer = compute_all_tfidfs if tfidf else compute_all_probs
    return scorer(texts)
        

In [12]:
# Actually run everything: preprocessing options for this run
stem = True
remove_stop_words = True
remove_punctuation = True
metaparagraph_size = 5

# `stories` is a triply nested list (story -> paragraph -> sentence).
# `simplified_stories` is a quadruply nested list (story -> paragraph -> sentence -> word).
stories, simplified_stories = load_data_as_paragraphs(
    "../datasets/writing_prompts/valid.wp_target",
    stem=stem,
    remove_stop_words=remove_stop_words,
    remove_punctuation=remove_punctuation,
    metaparagraph_size=metaparagraph_size,
)

# TODO: If necessary, introduce further cleaning steps:
# - Deal with unmatched parentheses

In [13]:
def outlines():
    """Build a one-sentence-per-paragraph outline of every story and write them to disk.

    Reads the module-level `stories` and `simplified_stories`, scores words per
    paragraph (tf-idf), picks the best-scoring sentence of each meta-paragraph
    with `get_best_sentence2`, joins the picks of a story with " <newline> ",
    and writes one outline per line to 'summaries_old.txt'. Returns None.
    """
    summaries = []
    tfidf = True
    by_paragraph = True
    include_best_word = False

    texts = construct_text_collection(simplified_stories, by_paragraph=by_paragraph)
    vectorizer, scores = compute_all_scores(texts, tfidf=tfidf)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    sentence_vectorizer = CountVectorizer(input='content', vocabulary=feature_names, token_pattern=r"(?u)\b[^\s]+\b") #r"(?u)\b[^\s]+\b"  #r"(?u)\b\w+\b"

    # Running index into `texts`/`scores` when documents are paragraphs.
    paragraph_index = 0

    # Loop through stories (assumes each story is a list of paragraphs, each of which are lists of sentences)
    for story_index, (story, simplified_story) in enumerate(zip(stories, simplified_stories)):
        summary = []

        # Loop through paragraphs, adding one sentence per paragraph to the summary.
        for paragraph, simplified_paragraph in zip(story, simplified_story):
            # indexing is done in a bit of a stupid way because csr matrices don't support indexing like
            # A[x][y] and instead require A[x,y].
            document_index = paragraph_index if by_paragraph else story_index
            # Always advance, so the index stays aligned with `texts` even
            # when the paragraph is skipped below.
            paragraph_index += 1

            # A paragraph without sentences (e.g. from a blank story line)
            # has nothing to contribute and would fail the lookup below.
            if not paragraph:
                continue

            # Choose sentence with best score
            # (get_best_sentence takes the same arguments and can be swapped in here.)
            next_sentence_index = get_best_sentence2(simplified_paragraph, scores, document_index, vectorizer.vocabulary_, feature_names, sentence_vectorizer, include_best=include_best_word)
            # Add it to summary
            summary.append(paragraph[next_sentence_index])
        # Join sentences into a summary
        summary_string = " <newline> ".join(summary)
        summaries.append(summary_string)

    with open('summaries_old.txt', 'w') as f:
        for summary in summaries:
            f.write(summary + "\n")
        
#     print("done")
        

In [None]:
# WITH include_best
# orig: 67 ms
# new: 161 ms

# WITHOUT include_best
# orig: 127 ms
# new: 92 ms

# Original (before any improvements): 229 ms

In [14]:
# Time a single end-to-end call of outlines() using timeit's default timer.
from timeit import default_timer as timer
start = timer()
outlines()
end = timer()
# Difference between the two timer readings taken around the call.
print("total", end - start)

total 13.531407639966346


In [7]:
# Repeated timing of outlines() via the IPython %timeit magic.
# Note: every repetition rewrites 'summaries_old.txt'.
%timeit outlines()

1.43 s ± 1.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Line-by-line profile of one outlines() run.
from line_profiler import LineProfiler

lp = LineProfiler()
# Register both sentence selectors: outlines() currently calls
# get_best_sentence2, so profiling only get_best_sentence would report no
# per-line hits for the function that actually runs.
lp.add_function(get_best_sentence)
lp.add_function(get_best_sentence2)
lp_wrapper = lp(outlines)
lp_wrapper()
lp.print_stats()