In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("components/summarizer/pointer-generator")
import components.summarizer.summarizer_utils as sutils
import components.summarizer.story_converter as sconv
import pickle
import nltk.tokenize as tokenize
import os
from nltk.tokenize.moses import MosesDetokenizer

## Get some articles to summarize!

In [2]:
articles = [
    "http://www.bbc.com/news/business-43967923",
    "https://www.theguardian.com/technology/2018/may/02/tesla-loss-model-3-elon-musk",
    "https://www.theguardian.com/world/2018/may/03/japan-robot-dogs-get-solemn-buddhist-send-off-at-funerals"
]

print("Downloading articles...")
story_data = sutils.fetch_and_pickle_stories(articles, 'data/pickles/raw_stories.pickle', 'data/stories/', False)
print("Downloading articles DONE")

Downloading articles...
Downloading articles DONE


In [3]:
story_data['stories']

['Image copyright Getty Images Image caption Slash of Guns \'n\' Roses plays a Gibson guitar as James Brown watches onGibson, whose electric guitars have been played by stars including Elvis, Keith Richards and Jimmy Page, has filed for bankruptcy protection as it struggles with $500m (£367m) of debt.Lenders will take control of the Nashville-based company, which was founded in 1902.It made an ill-fated acquisition of Philips\' consumer audio division four years ago for $135m.Gibson will pull out of that business, and focus on musical instruments.Chief executive Henry Juszkiewicz, who bought the company in 1986, said the Chapter 11 filing would assure the company\'s "long-term stability and financial health".Image copyright Getty Images Image caption Keith Richards of the Rolling Stones plays a Gibson guitarUnder the bankruptcy measures, Gibson will wind down its consumer audio business, whose brands include KRK, Cerwin Vega and Stanton headphones, loudspeakers and turntables.Mandolin 

## Convert articles to the format read by the TensorFlow model:

In [4]:
sconv.process_and_save_to_disk(story_data['stories'], "test.bin", "data/converted_articles")

## Run TensorFlow model in decoder mode:

In [5]:
DATA_DIR = 'data/'
summarizer_internal_pickle = f"{DATA_DIR}pickles/decoded_stories.pickle"
data_path = f"{DATA_DIR}converted_articles/chunked/test_*"
vocab_path = f"{DATA_DIR}summarizer_training_data/finished_files/vocab"
log_root = f"{DATA_DIR}summarizer_models"
exp_name = "more_coverage"
#exp_name = "no_coverage"
#exp_name = "some_coverage"

sutils.run_summarization_model_decoder(summarizer_internal_pickle, data_path = data_path,
           vocab_path = vocab_path, log_root = log_root, exp_name = exp_name)

Starting TensorFlow Decoder...
INFO:tensorflow:Starting seq2seq_attention in asdfdsaf decode mode...
INFO:tensorflow:Current folder /Users/arturs/gpu-projects/Zentropy








max_size of vocab was specified as 50000; we now have 50000 words. Stopping reading.
Finished constructing vocabulary of 50000 total words. Last word added: farina
INFO:tensorflow:Building graph...
example_generator completed reading all datafiles. No more data.
INFO:tensorflow:The example generator for this example queue filling thread has exhausted data.
INFO:tensorflow:single_pass mode is on, so we've finished reading dataset. This thread is stopping.
INFO:tensorflow:Adding attention_decoder timestep 0 of 1
INFO:tensorflow:Time to build graph: 0 seconds
INFO:tensorflow:Loading checkpoint data/summarizer_models/more_coverage/train/model.ckpt-363378
INFO:tensorflow:Restoring parameters from data/summarizer_models/more_coverage/train/model.ckpt-363378
INFO:tensorflow:Finished reading dataset in single_pass mode.


## Look at results:

In [6]:
summarization_output = pickle.load(open(summarizer_internal_pickle, "rb" ))

In [7]:
for s in summarization_output['summaries']:
    print(s+"\n\n")


image copyright getty images image caption slash of guns 'n' roses plays a gibson guitar as james brown watches ongibson , whose electric guitars have been played by stars including elvis , keith richards and jimmy page . it made an ill-fated acquisition of philips ' consumer audio division four years ago for $ 135m . gibson will pull out of that business , and focus on musical instruments.chief executive henry juszkiewicz , who bought the company in 1986 .


analysts : elon musk got testy with analysts amid concerns over company 's future . tesla factory to be investigated over safety concerns read morebut tesla investors gave a rare rebuke to musk . tesla factory to be investigated over safety concerns read morebut tesla investors gave a rare rebuke to musk .


irreparable aibo robotic dogs are marked in much the same way as that of humans . the demise of irreparable aibo robotic dogs is marked in much the same way as that of humans . the firm stopped repairing malfunctioning aibo in

## All lower case - Named entity detector will complain!

In [8]:
tokenized_summaries = sutils.try_fix_upper_case_for_summaries(story_data['stories'], summarization_output['summaries_tokens'])

detokenizer = MosesDetokenizer()

detokenized_summaries = []

for s in tokenized_summaries:
    s_detok = detokenizer.detokenize(s, return_str=True)
    detokenized_summaries.append(s_detok)
    print(s_detok+"\n\n")

Image copyright Getty Images Image caption Slash of Guns'n' Roses plays a Gibson guitar as James Brown watches onGibson, whose electric guitars have been played by stars including Elvis, Keith Richards and Jimmy Page. it made an ill-fated acquisition of Philips' consumer audio division four years ago for $135m. Gibson will pull out of that business, and focus on musical instruments.Chief executive Henry Juszkiewicz, who bought the company in 1986.


analysts: Elon Musk got testy with analysts amid concerns over company's future. Tesla factory to be investigated over safety concerns Read moreBut Tesla investors gave a rare rebuke to Musk. Tesla factory to be investigated over safety concerns Read moreBut Tesla investors gave a rare rebuke to Musk.


irreparable Aibo robotic dogs are marked in much the same way as that of humans. the demise of irreparable Aibo robotic dogs is marked in much the same way as that of humans. the firm stopped repairing malfunctioning Aibo in 2014, leaving ow

## Much better! Next let's look at our baseline summaries:

In [9]:
print("Extractive summaries:\n")
for s1 in story_data['summaries_extractive']:
    print(s1+"\n\n")

print("3 sentence summaries:\n")
for s2 in story_data['summaries_3sent']:
    print(s2+"\n\n")

Extractive summaries:

Mandolin originsThe firm makes its electric guitars in Nashville and Memphis, while its acoustic guitars are manufactured in Bozeman, Montana.Image copyright Getty ImagesThe guitar which defined the brandPerhaps the most famous of Gibson's guitars has been the Gibson Les Paul.Jimmy Page of Led Zeppelin switched from the Fender Telecaster to the Gibson Les Paul - owning several which he had modified to help produce his distinctive sound.And while Bob Marley is not remembered primarily for his guitar skill, much of his electric work involved a modified Gibson Les Paul Junior.Noel Gallagher owns the black Gibson Les Paul which Johnny Marr played on seminal The Smiths album, The Queen is Dead.


Tesla stock was little changed after the earnings announcement but fell during a conference call with analysts, when Musk began cutting analysts’ questions short, costing Tesla over $2bn in market capitalization.Facebook Twitter Pinterest A Tesla Model 3 at an auto show in Ch

## Load NER library:

In [10]:
import components.ner.NERutils as ner

## Showtime:

In [None]:
all_orgs = []

for story in detokenized_summaries:
    storyCombined = story.replace('\n', ' ')

    print('RUNNING TOKENIZER')
    storyTokenized = tokenize.word_tokenize(storyCombined)

    print('SPLITTING SENTENCES LINE BY LINE')
    split = ner.sentenceSplitter(storyTokenized)

    inputFile = open(r'components/ner/input.txt','w')
    ner.writeArticle(split,inputFile)
    inputFile.close()

    print('RUNNING MODEL')
    os.system('python2.7 components/ner/tagger-master/tagger.py --model components/ner/tagger-master/models/english/ --input components/ner/input.txt --output components/ner/output.txt')

    with open(r'components/ner/output.txt','r') as namedStory:
        namedStory=namedStory.read().replace('\n', ' ')

    print('NAMED ENTITIES:')
    orgs  = ner.findNamedEntities(namedStory.split(' '))
    all_orgs.append(orgs)
    print(orgs)


RUNNING TOKENIZER
SPLITTING SENTENCES LINE BY LINE
RUNNING MODEL


In [None]:
all_orgs

In [None]:
detokenized_summaries