In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import model
import clean_blinkist
import format_books
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from nltk import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from scoring import find_ngrams, rouge_score

# How does the reference summary compare to the entire book?
1. Get a clean reference summary + clean book text
    - Moonwalking with Einstein
    - 4 Hour Workweek
    - Sapiens
    - Mindset
2. Comparison
    - Get document vector of reference summary
    - Get document vector of model-produced summary
    - Get document vector of entire book
    

In [3]:
ref_summary = model.load_data('blinkistsummarytxt/blinkistsapiens.txt')
reference_summary_cleaned = model.clean_line(ref_summary)
reference_summary = model.format_summary(reference_summary_cleaned)

In [4]:
book_txt = ref_summary = model.load_data('booktxt/Sapiens.txt')
sliced_book = format_books.get_sections(book_txt)
kinda_clean_book = [format_books.get_rid_of_weird_characters(section) for section in sliced_book]
more_clean_book = format_books.chapter_paragraph_tag(kinda_clean_book)
combined = format_books.combine_strings_split_on_chapter(more_clean_book) #A list of chapters.
split = format_books.split_by_section(combined)

formatted_sentence = format_books.format_sentences(split)

total_book = ' '.join(kinda_clean_book)
sentences_in_book = sent_tokenize(total_book)

joined_summary = " ".join(["".join(string) for string in reference_summary])
tokenized_summary = sent_tokenize(joined_summary)

len(tokenized_summary)

191

In [5]:
def doc2vec_EDA(ref_summary, book):
#     joined_chapter = " ".join([" ".join(string) for string in chapter])
#     tokenized_chapter = sent_tokenize(joined_chapter)

    class LabeledLineSentence(object):
        '''
        Create generator of reference summary & entire book to create vectors
        '''
        def __init__(self, summary, book):
            self.summary = summary
            self.book = book
        def __iter__(self):
            yield LabeledSentence(words= ' '.join(self.summary).split(), tags=['REF SUMMARY'])
            yield LabeledSentence(words = ' '.join(self.book).split(), tags = ['WHOLE BOOK'])
            for uid, line in enumerate(self.book):
                yield LabeledSentence(words=line.split(), tags=[int(uid)])
                

    x = LabeledLineSentence(ref_summary, book)

    model = Doc2Vec()
    model.build_vocab(x)
    model.train(x)
    similar_sentence_vectors = np.array(model.docvecs.most_similar('REF SUMMARY', topn = 229))
    return similar_sentence_vectors

len(sentences_in_book)

7119

In [6]:
similar_sentences = doc2vec_EDA(tokenized_summary, sentences_in_book)

similar_sentences

array([[  1.24200000e+03,   9.99595761e-01],
       [  6.24000000e+02,   9.99467909e-01],
       [  1.79200000e+03,   9.99463797e-01],
       [  1.20500000e+03,   9.99446869e-01],
       [  8.35000000e+02,   9.99424279e-01],
       [  1.22300000e+03,   9.99423027e-01],
       [  1.80000000e+01,   9.99353051e-01],
       [  3.44900000e+03,   9.99339104e-01],
       [  1.56500000e+03,   9.99334097e-01],
       [  2.13200000e+03,   9.99320567e-01],
       [  5.07800000e+03,   9.99292612e-01],
       [  5.22000000e+02,   9.99272108e-01],
       [  1.51000000e+02,   9.99259949e-01],
       [  3.05300000e+03,   9.99194860e-01],
       [  1.60900000e+03,   9.99192119e-01],
       [  1.08400000e+03,   9.99122381e-01],
       [  8.71000000e+02,   9.99097466e-01],
       [  4.27100000e+03,   9.99097168e-01],
       [  8.34000000e+02,   9.99074817e-01],
       [  2.68800000e+03,   9.99066889e-01],
       [  1.37400000e+03,   9.99053240e-01],
       [  5.78300000e+03,   9.99015689e-01],
       [  

In [7]:
vector_index = [int(vector[0]) for vector in similar_sentences[1:]]
sim_sentence = [sentences_in_book[index] for index in vector_index[:229]]
total_sentence = ' '.join(sim_sentence)

In [11]:
rouge_ref_summary = ' '.join(reference_summary)
rouge_score(total_sentence, rouge_ref_summary, n=2)

0.0004248088360237893

# Notes:
- Looks like the highest ROUGE score with the most optimal sentence selection would be 0.29. At random, it hovers at around 0.13 to 0.14.
- I need more books in here to get a better number.

In [12]:
len(total_sentence)

53807