In [None]:
# Imports
import re
import nltk
from nltk.tokenize.moses import MosesDetokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.text import Text, TextCollection

In [None]:
# Main idea behind generating stories and simplified stories is that you can do preprocessing on
# simplified_stories to help you choose sentences, but still include the original sentences in the
# summary (if you want).

# Read in data from target, breaking each story into paragraphs (and then sentences)
def load_data_as_paragraphs(file, stem=False, remove_stop_words=False, lowercase=False, metaparagraph_size=1, max_stories=10):
    """Load stories from *file* and split them into meta-paragraphs of sentences.

    Every line of the file is one story whose paragraphs are separated by the
    literal marker ``<newline>``.  Consecutive paragraphs are merged into a
    "meta-paragraph" until it holds at least ``metaparagraph_size`` sentences
    (note: the size counts sentences, not paragraphs) or the story runs out
    of paragraphs.  Meta-paragraphs that contain no sentence at all (for
    example a trailing blank paragraph) are skipped in both outputs.

    Args:
        file: Path of the text file to read, one story per line.
        stem: If true, Porter-stem every token of the simplified version.
        remove_stop_words: If true, drop NLTK English stop words from the
            simplified version.  Matching ignores case so that capitalised
            stop words ("The", "And") are removed as well.
        lowercase: If true, lowercase every token of the simplified version.
        metaparagraph_size: Minimum number of sentences per meta-paragraph;
            must be at least 1.
        max_stories: Maximum number of stories (lines) to read.  Defaults to
            10, the limit used while testing; pass ``None`` to read them all.

    Returns:
        A ``(stories, simplified_stories)`` tuple.  ``stories`` is nested as
        story -> meta-paragraph -> sentence string.  ``simplified_stories``
        has the same outer shape, but each sentence is a list of
        preprocessed tokens.

    Raises:
        ValueError: If ``metaparagraph_size`` is smaller than 1 (the grouping
            loop could never make progress).
    """
    from itertools import islice

    # Validate first: with a size below 1 the inner loop never consumes a
    # paragraph and the outer loop would spin forever.
    if metaparagraph_size < 1:
        raise ValueError("metaparagraph_size must be at least 1")

    simplified_stories = []
    stories = []
    stemmer = PorterStemmer()
    # Only touch the stop-word corpus when it is actually needed.
    stop_words = set(stopwords.words('english')) if remove_stop_words else frozenset()

    # Load stories from file (at most max_stories lines; None means all)
    with open(file) as f:
        stories_raw = list(islice(f, max_stories))

    for story in stories_raw:
        # Split into a list of paragraphs
        paragraphs = story.split("<newline>")
        simplified_paragraphs = []
        untokenized_paragraphs = []
        par_index = 0

        # Loop through paragraphs
        while par_index < len(paragraphs):
            meta_paragraph = []

            # Combine paragraphs until enough sentences have been collected
            while par_index < len(paragraphs) and len(meta_paragraph) < metaparagraph_size:
                # Split paragraph into a list of sentences
                meta_paragraph += nltk.sent_tokenize(paragraphs[par_index])
                par_index += 1

            # Nothing but blank paragraphs were left: there is no sentence to
            # pick from, so do not emit an empty meta-paragraph.
            if not meta_paragraph:
                continue

            untokenized_paragraphs.append(meta_paragraph)
            # For the tokenized version, split each sentence into a list of words
            paragraph_tokenized = [nltk.word_tokenize(sentence) for sentence in meta_paragraph]
            # Extra preprocessing
            if remove_stop_words:
                # The stop-word list is lowercase, so compare case-insensitively.
                paragraph_tokenized = [[word for word in sentence if word.lower() not in stop_words]
                                       for sentence in paragraph_tokenized]
            if stem:
                paragraph_tokenized = [[stemmer.stem(word) for word in sentence] for sentence in paragraph_tokenized]
            if lowercase:
                paragraph_tokenized = [[word.lower() for word in sentence] for sentence in paragraph_tokenized]

            simplified_paragraphs.append(paragraph_tokenized)
        stories.append(untokenized_paragraphs)
        simplified_stories.append(simplified_paragraphs)
    return stories, simplified_stories



In [None]:
# SumBasic algorithm


# Compute word probabilities
def get_probs(data, tfidf=False, text=None, collection=None):
    """Score every distinct word that occurs in *data*.

    Args:
        data: A list of sentences, each sentence being a list of words.
        tfidf: If false, a word's score is its relative frequency in *data*
            (occurrences divided by the total number of words).  If true, the
            score is ``collection.tf_idf(word, text)``.
        text: The text passed as second argument to ``tf_idf``; only used
            when ``tfidf`` is true.
        collection: Object providing ``tf_idf(word, text)``.  When omitted,
            the module-level ``all_tokens`` is used, which must already have
            been defined before a tf-idf call is made.

    Returns:
        A dict mapping each word to its score, keyed in order of first
        occurrence.  Empty when *data* contains no words.
    """
    from collections import Counter

    if not tfidf:
        # Word frequency counts, keyed in order of first occurrence
        counts = Counter(word for sentence in data for word in sentence)
        total = float(sum(counts.values()))
        # Each word's score is word_count/total_word_count
        return {word: count / total for word, count in counts.items()}

    # Otherwise each word's score is its tf_idf
    # NOTE: tf_idf is calculated over the whole collection, not just this data.
    if collection is None:
        collection = all_tokens
    probs = {}
    for sentence in data:
        for word in sentence:
            # The score only depends on (word, text): compute it once per
            # distinct word instead of once per occurrence.
            if word not in probs:
                probs[word] = collection.tf_idf(word, text)
    return probs

# Pick the best scoring sentence that contains the highest probability word.
def get_best_sentence(data, probs, include_best=True, verbose=True):
    """Return the index of the sentence with the highest average word score.

    Args:
        data: A list of sentences, each sentence being a list of words.
        probs: Dict mapping every word of *data* to its score.
        include_best: If true, only sentences containing the single
            highest-scoring word are considered; if false, all are.
        verbose: If true (the default), print the highest-scoring word.
            Useful while testing to see which words are marked as most
            important.

    Returns:
        The index into *data* of the best sentence, or ``-1`` when no
        candidate sentence has an average score strictly greater than 0
        (including when *data* or *probs* is empty).  Empty sentences are
        never candidates.
    """
    # Without any scored word there is nothing to rank.
    if not probs:
        return -1

    highest_prob_word = max(probs, key=probs.get)
    if verbose:
        print(highest_prob_word)

    best_sentence_index = -1
    best_score = 0.0
    for index, sentence in enumerate(data):
        # A sentence can be empty after preprocessing (e.g. stop-word
        # removal); it has no average score, so skip it instead of dividing
        # by zero.
        if not sentence:
            continue
        if include_best and highest_prob_word not in sentence:
            continue
        score = sum(probs[word] for word in sentence) / len(sentence)
        if score > best_score:
            best_score = score
            best_sentence_index = index
    return best_sentence_index


# Square the probability of each word in the chosen sentence
def update_probs(probs, sentence):
    """Square, in place, the score of every distinct word in *sentence*.

    A word that occurs several times in the sentence is squared only once.
    The same (mutated) ``probs`` dict is returned.
    """
    distinct_words = set(sentence)
    for token in distinct_words:
        squared = probs[token] ** 2
        probs[token] = squared
    return probs


# Count sentences in text
def get_length(text):
    """Return ``len(text)``, i.e. the number of items the text holds."""
    item_count = len(text)
    return item_count

# Get tf_idf across all stories
def get_tfidf(data, by_paragraph=False):
    """Build the NLTK texts and the collection used for tf-idf scoring.

    Args:
        data: Stories nested as story -> paragraph -> sentence -> word.
        by_paragraph: If true, every paragraph becomes its own ``Text``; the
            resulting list is flat, ordered by story and then by paragraph
            within the story.  If false, every story becomes one ``Text``.

    Returns:
        A ``(text_list, story_tokens)`` tuple: the list of ``Text`` objects
        and a ``TextCollection`` built from that list.
    """
    # If get by paragraph, each text refers to 1 paragraph
    if by_paragraph:
        text_list = [
            Text([word for sentence in paragraph for word in sentence])
            for story in data
            for paragraph in story
        ]
    # Otherwise each text is 1 story
    else:
        text_list = [
            Text([word for paragraph in story for sentence in paragraph for word in sentence])
            for story in data
        ]
    story_tokens = TextCollection(text_list)
    return text_list, story_tokens

In [None]:
# Actually run everything
# Preprocessing switches handed to load_data_as_paragraphs below
lowercase = True
remove_stop_words = True
stem = True
metaparagraph_size = 5

# stories is a triply nested list (first broken by story, then by paragraph, then by sentence)
# simplified_stories is a quadruply nested list (broken by story, paragraph, sentence, word)
stories, simplified_stories = load_data_as_paragraphs(
    "writingPromptsData/examples.wp_target",
    stem=stem,
    remove_stop_words=remove_stop_words,
    lowercase=lowercase,
    metaparagraph_size=metaparagraph_size,
)

#TODO: If necessary, introduce other cleaning things:
# - Clean up quotes
# - Deal with parens unmatched
# - remove punctuation?

In [None]:
summaries = []
detokenizer = MosesDetokenizer()
tfidf = True
by_paragraph = True
include_best_word = False

texts, all_tokens = get_tfidf(simplified_stories, by_paragraph)

# get_tfidf returns one Text per paragraph (flattened across all stories, in
# story order) when by_paragraph is set, and one Text per story otherwise.
# Walk the list with a single iterator so that every paragraph is scored
# against the Text it actually belongs to, instead of pairing story i with
# the i-th entry of the flattened list.
text_iter = iter(texts)

# Loop through stories (assumes each story is a list of paragraphs, each of which are lists of sentences)
for story, simplified_story in zip(stories, simplified_stories):
    summary = []
    if not by_paragraph:
        story_text = next(text_iter)
    # Loop through paragraphs, adding one sentence per paragraph to the summary.
    for paragraph, simplified_paragraph in zip(story, simplified_story):
        # Always consume the paragraph's Text so the iterator stays aligned,
        # even when the paragraph is skipped below.
        text = next(text_iter) if by_paragraph else story_text
        # A paragraph without sentences has nothing to contribute
        if not simplified_paragraph:
            continue
        # Get word probabilities
        probs = get_probs(simplified_paragraph, tfidf, text)
        # Choose sentence with best score
        next_sentence_index = get_best_sentence(simplified_paragraph, probs, include_best=include_best_word)
        # -1 means "no sentence qualified"; using it as an index would
        # silently pick the last sentence of the paragraph.
        if next_sentence_index < 0:
            continue
        # Add it to summary
        summary.append(paragraph[next_sentence_index])
    # Join sentences into a summary
    summary_string = "<newline>".join(summary)
    print(summary_string)
    print(" ===== ")
    summaries.append(summary_string)
