In [None]:
# Imports
import re
import nltk
from nltk.tokenize.moses import MosesDetokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.text import Text, TextCollection

In [None]:
# # Read in data, source and target

# def load_data(file_source, file_target):
#     source_stories = []
#     target_stories = []

#     with open(file_source) as f:
#         source_raw = f.readlines()
#     with open(file_target) as f:
#         target_raw = f.readlines()

#     for source, target in zip(source_raw, target_raw):
#         # If the story starts with [ CONTEST ], remove it altogether
#         if source[:11] == "[ CONTEST ]":
#             print(source)
#             print(target)
#             pass

#         # Remove prefix [ XX ] and ( XX ) from the phrase (this is typically at the beginning or end of the string)
#         source = re.sub("(\[|\() [A-Za-z][A-Za-z] (\]|\))", '', source)
#         source_stories.append(nltk.word_tokenize(source))
#         target_stories.append(nltk.word_tokenize(target))
#     print(len(stories))

In [None]:
# # Read in data from source

# def load_data(file_source):
#     stories = []

#     with open(file_source) as f:
#         stories_raw = f.readlines()

#     for story in stories_raw:
#         # If the story starts with [ CONTEST ], remove it altogether
#         if story[:11] == "[ CONTEST ]":
#             pass

#         # Remove prefix [ XX ] and ( XX ) from the phrase (this is typically at the beginning or end of the string)
#         story = re.sub("(\[|\() [A-Za-z][A-Za-z] (\]|\))", '', story)
#         stories.append(nltk.word_tokenize(story))
#     return stories

In [None]:
# # Read in data from target

# def load_data(file, stem=False, remove_stop_words=False):
#     stories = []
#     stemmer = PorterStemmer()
#     stop_words = set(stopwords.words('english'))
    
#     with open(file) as f:
#         stories_raw = f.readlines()[:10]
#     for story in stories_raw:
#         story = re.sub("< newLine >", "\n", story)
#         sentences = nltk.sent_tokenize(story)
#         story_tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
#         if remove_stop_words:
#             story_tokenized = [[word for word in sentence if word not in stop_words] for sentence in story_tokenized]
#         if stem:
#             story_tokenized = [[stemmer.stem(word) for word in sentence] for sentence in story_tokenized]
        
#         stories.append(story_tokenized)
#     return stories

# Read in data from target

def load_data(file, stem=False, remove_stop_words=False, max_stories=10):
    """Load stories from a text file and split them into paragraphs and sentences.

    Every line of ``file`` is treated as one story. A story is split into
    paragraphs on the marker ``"< newLine > < newLine >"``; any remaining
    ``"< newLine >"`` marker is turned into a newline character before the
    paragraph is split into sentences with ``nltk.sent_tokenize``.

    Args:
        file: Path of the file to read (one story per line).
        stem: If true, every token is replaced by its Porter stem.
        remove_stop_words: If true, tokens contained in NLTK's English
            stop-word list are dropped.
        max_stories: Maximum number of stories (lines) to load. Defaults to
            10, the limit that used to be hard-coded; pass ``None`` to load
            every line.

    Returns:
        A tuple ``(stories, simplified_stories)`` with parallel structure:
        ``stories[i][j]`` is the list of sentence strings of paragraph ``j``
        of story ``i``; ``simplified_stories[i][j][k]`` is the token list of
        sentence ``k`` of that paragraph after the optional stop-word
        removal and stemming. A simplified sentence may be empty when all
        of its tokens were removed.
    """
    simplified_stories = []
    stories = []

    # Build the NLTK helpers only when the matching option is on, so their
    # setup (and the resources they need) only matters when actually used.
    stemmer = PorterStemmer() if stem else None
    stop_words = set(stopwords.words('english')) if remove_stop_words else None

    # Stream the file and stop early instead of reading every line into
    # memory just to keep the first few.
    stories_raw = []
    with open(file) as f:
        for line_number, line in enumerate(f):
            if max_stories is not None and line_number >= max_stories:
                break
            stories_raw.append(line)

    for story in stories_raw:
        paragraphs = story.split("< newLine > < newLine >")
        simplified_paragraphs = []
        untokenized_paragraphs = []
        for paragraph in paragraphs:
            paragraph = re.sub("< newLine >", "\n", paragraph)
            sentences = nltk.sent_tokenize(paragraph)
            untokenized_paragraphs.append(sentences)
            paragraph_tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
            if remove_stop_words:
                # NOTE(review): the membership test is case-sensitive; if the
                # stop-word list is lowercase, capitalised forms such as
                # "The" are kept -- confirm whether lowercasing is wanted.
                paragraph_tokenized = [
                    [word for word in sentence if word not in stop_words]
                    for sentence in paragraph_tokenized
                ]
            if stem:
                paragraph_tokenized = [
                    [stemmer.stem(word) for word in sentence]
                    for sentence in paragraph_tokenized
                ]

            simplified_paragraphs.append(paragraph_tokenized)
        stories.append(untokenized_paragraphs)
        simplified_stories.append(simplified_paragraphs)
    return stories, simplified_stories



In [None]:
# Tokenize
# NOTE: load_data returns a (stories, simplified_stories) tuple, so `s` holds
# both lists; stemming and stop-word removal are left at their defaults (off).
s = load_data("writingPromptsData/train.wp_target")


In [None]:

# SumBasic algorithm
def get_probs(data, tfidf=False, text=None, collection=None):
    """Compute a weight for every word that occurs in ``data``.

    Args:
        data: List of sentences, each sentence being a list of word tokens.
        tfidf: If false, a word's weight is its relative frequency within
            ``data`` (occurrences divided by the total number of tokens).
            If true, the weight is ``collection.tf_idf(word, text)``.
        text: Document handed to ``tf_idf``; only used when ``tfidf`` is true.
        collection: Object exposing ``tf_idf(word, text)``. When ``None``,
            the module-level ``all_tokens`` is used, which keeps existing
            three-argument calls working.

    Returns:
        A dict mapping each distinct word of ``data`` to its weight. The dict
        is empty when ``data`` contains no tokens.
    """
    probs = {}
    if not tfidf:
        for sentence in data:
            for word in sentence:
                probs[word] = probs.get(word, 0) + 1
        # The counts add up to the total number of tokens in data.
        total = float(sum(probs.values()))
        for word in probs:
            probs[word] /= total
        return probs

    if collection is None:
        # Backward-compatible fallback to the notebook-level collection.
        collection = all_tokens
    for sentence in data:
        for word in sentence:
            # Score each distinct word once; repeated words would yield the
            # same value again.
            if word not in probs:
                probs[word] = collection.tf_idf(word, text)
    return probs
            

def get_best_sentence(data, probs):
    """Return the index of the sentence with the highest average word weight.

    Args:
        data: List of sentences, each sentence being a list of word tokens.
        probs: Dict mapping every word in ``data`` to its weight.

    Returns:
        The index into ``data`` of the best-scoring sentence; ties go to the
        earliest sentence. Returns -1 when no sentence scores above zero,
        which includes an empty ``data`` and data made only of empty
        sentences.

    Raises:
        KeyError: If a word of ``data`` is missing from ``probs``.
    """
    best_sentence_index = -1
    best_score = 0.0
    for index, sentence in enumerate(data):
        if not sentence:
            # An empty sentence (e.g. every token was removed as a stop word)
            # has no average weight; skip it instead of dividing by zero.
            continue
        score = sum(probs[word] for word in sentence) / len(sentence)
        if score > best_score:
            best_score = score
            best_sentence_index = index
    return best_sentence_index


def get_best_word():
    """Placeholder that is not implemented yet; it always returns ``None``."""
    return None

def update_probs(probs, sentence):
    """Square the weight of every distinct word of ``sentence``.

    ``probs`` is modified in place and the same dict is returned. A word that
    occurs several times in ``sentence`` is squared only once.
    """
    distinct_words = set(sentence)
    for token in distinct_words:
        weight = probs[token]
        probs[token] = weight ** 2
    return probs

# Count words in text
# def get_length(text):
#     return sum([len(sentence) for sentence in text])

# Count sentences in text
def get_length(text):
    """Return the size of ``text``, i.e. the number of items (sentences) it holds."""
    sentence_count = len(text)
    return sentence_count

def get_tfidf(data):
    """Build one NLTK ``Text`` per story and a ``TextCollection`` over all of them.

    Args:
        data: List of stories. Each story is a (possibly nested) list whose
            leaves are word tokens, e.g. sentences of words, or paragraphs
            of sentences of words as returned by ``load_data``.

    Returns:
        A tuple ``(text_list, story_tokens)``: ``text_list[i]`` is the
        ``Text`` built from all word tokens of story ``i`` in order, and
        ``story_tokens`` is the ``TextCollection`` wrapping ``text_list``.
    """

    def flatten(nested):
        # Collect the leaf tokens of a nested list/tuple structure in order.
        tokens = []
        for item in nested:
            if isinstance(item, (list, tuple)):
                tokens.extend(flatten(item))
            else:
                tokens.append(item)
        return tokens

    # get_probs asks the collection for tf_idf of individual words, so every
    # Text has to hold word tokens. Flattening only one level would leave
    # whole sentence lists as tokens when stories are split into paragraphs.
    text_list = [Text(flatten(story)) for story in data]
    story_tokens = TextCollection(text_list)
    return text_list, story_tokens

In [None]:
# Actually run everything: load the stories with stemming and stop-word
# removal both switched on.
stem, remove_stop_words = True, True
stories, simplified_stories = load_data(
    "writingPromptsData/train.wp_target",
    stem=stem,
    remove_stop_words=remove_stop_words,
)

#TODO: If necessary, introduce other cleaning steps:
# - Clean up quotes in the detokenized story (or better yet, never tokenize at all)
# - Deal with unmatched parens


In [None]:
# Sanity check: show the first story in its raw and simplified form, one per line.
print(stories[0], simplified_stories[0], sep="\n")

In [None]:
summaries = []
detokenizer = MosesDetokenizer()
tfidf = True
max_length = 1

texts, all_tokens = get_tfidf(simplified_stories)

# Paragraph-wise summary: take the best-scoring sentence of every paragraph
# and join the picks into one summary string per story.
for story, simplified_story, text in zip(stories, simplified_stories, texts):
    print("NEW STORY", len(story))
    summary = []
    for paragraph, simplified_paragraph in zip(story, simplified_story):
        if not paragraph:
            # A paragraph without sentences has nothing to contribute, and
            # indexing into it would raise IndexError.
            continue
        # Weights are rebuilt for every paragraph and only one sentence is
        # taken from each, so no update_probs step is needed here.
        probs = get_probs(simplified_paragraph, tfidf, text)
        next_sentence_index = get_best_sentence(simplified_paragraph, probs)
        # NOTE: get_best_sentence returns -1 when no sentence scores above
        # zero; indexing with -1 then selects the paragraph's last sentence.
        summary.append(paragraph[next_sentence_index])
    summary_string = " ".join(summary)
    print(summary_string)
    print(" ")
    summaries.append(summary_string)


In [None]:
# Original sumBasic over whole stories. The algorithm works on a flat list of
# sentences, while load_data returns paragraphs of sentences, so each story is
# flattened first.
# NOTE: the results are appended to the same `summaries` list as the
# paragraph-wise summaries produced in the previous cell.
for story, simplified_story, text in zip(stories, simplified_stories, texts):
    story_sentences = [sentence for paragraph in story for sentence in paragraph]
    simplified_sentences = [sentence for paragraph in simplified_story for sentence in paragraph]
    summary = []
    probs = get_probs(simplified_sentences, tfidf, text)
    # A story without sentences has nothing to pick; indexing it would raise
    # IndexError, so its summary stays empty.
    while story_sentences and get_length(summary) < max_length:
        # NOTE: an index of -1 (no sentence scores above zero) selects the
        # story's last sentence.
        next_sentence_index = get_best_sentence(simplified_sentences, probs)
        summary.append(story_sentences[next_sentence_index])
        # Down-weight the words just used so later picks favour new content.
        probs = update_probs(probs, simplified_sentences[next_sentence_index])
#     summary_string = " ".join([detokenizer.detokenize(sentence, return_str=True) for sentence in summary])
    print(summary)
    summary_string = " ".join(summary)
    print(summary_string)
    print(" ")
    summaries.append(summary_string)