In [6]:
%load_ext autoreload
%autoreload 2

import summarizer_utils as sutils
import story_converter as sconv
import pickle
import nltk.tokenize as tokenize
import os
from nltk.tokenize.moses import MosesDetokenizer
import pandas as pd
import re

# Root of the data directory, relative to the notebook's working directory;
# the CSV, pickle and model paths used further down are built from it.
DATA_PATH = '../../data/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get the test data:

In [2]:
# Read the raw test articles. The same load is wrapped in load_test_data()
# further down in this cell.
article_df = pd.read_csv(DATA_PATH + 'test_data.csv')

def remove_date_title_and_id(s):
    """Drop the first three newline-separated lines of ``s``.

    The remaining lines are concatenated with no separator. Going by the
    function name, the three dropped lines are presumably the date, title
    and id header of the raw article.
    """
    body_lines = s.split('\n')[3:]
    return ''.join(body_lines)


# Replace a period followed directly by a capital letter with period, space, capital.
def fix_periods(s):
    """Insert a space after every period that is immediately followed by A-Z.

    Args:
        s: text to repair.

    Returns:
        A copy of ``s`` where each ``.X`` (X an ASCII capital) becomes ``. X``.
        Periods followed by anything else (digits, lower case, spaces) are untouched.
    """
    # Raw strings: '\.' and '\g' are not valid Python string escapes, so the
    # non-raw form triggers invalid-escape warnings at compile time.
    return re.sub(r'\.([A-Z])', r'. \g<1>', s)

# Clean every article: drop the three header lines, then repair periods
# that are missing their trailing space.
articles = (
    article_df['full_text']
    .apply(remove_date_title_and_id)
    .apply(fix_periods)
)

#date_and_id_regex = '^[0-9]{8}[\n][A-Z0-9]{32}[\n]'
ids = article_df['docno_x']


def load_test_data(csv_file):
    """Read the test-set CSV and return ``(articles, ids)``.

    ``articles`` is the ``full_text`` column with remove_date_title_and_id and
    then fix_periods applied to every entry; ``ids`` is the ``docno_x`` column
    as read from the file.
    """
    frame = pd.read_csv(csv_file)
    cleaned = (
        frame['full_text']
        .apply(remove_date_title_and_id)
        .apply(fix_periods)
    )
    return cleaned, frame['docno_x']

# Load through the helper, then spot-check one article together with its id.
articles, ids = load_test_data(DATA_PATH + 'test_data.csv')

print(articles[5], ids[5], sep='\n')

08:45 31 January 2017Ravender SembhyAround 400 staff are expected to transfer following the sale by Royal Dutch Shell of a package of assets in the North Sea to Chrysaor in a deal valued at up to £3bn. Photo: Yui Mok/PA WireRoyal Dutch Shell is to sell off a package of North Sea assets for up to 3.8bn US dollars (£3bn) to smaller rival Chrysaor as it continues its divestment drive. The oil giant’ s interests in Buzzard, Beryl, Bressay, Elgin-Franklin, J-Block, the Greater Armada cluster, Everest, Lomond and Erskine, plus a 10% stake in Schiehallion, will all be offloaded as part of the deal. Around 400 staff are expected to transfer to Chrysaor on their existing terms and conditions of employment. Shell will pocket an initial 3bn US dollars, followed by a payment of up to 600m US dollars between 2018 and 2021, subject to commodity prices, with potential further payments of up to 180m US dollars for future discoveries. The company said it would record an “accounting gain” on the sale of

In [3]:
# Tokenize only the first 26 articles as a quick check; the cell displays the returned value.
sconv.tokenize_stories(articles[:26], verbose = True)

Processing story number: 1
Processing story number: 26


["Net management fees fell by 9pc over the course of the year to $ 691m while performance fees dived by almost two-thirds to $ 112m . As a result the firm 's shares dropped by 10pc in early trading , but recovered later in the day to a more modest 1.8 pc loss . In part that was due to a rise in overall assets under management which increased by 3pc to $ 80.9 bn . The company prefers to focus on adjusted pre-tax profits , which came in at $ 205m , down from 2015 's profit of $ 400m . Chief executive Luke Ellis , who took the reins in the middle of 2016 , said Man Group has `` made real progress in repositioning the firm for the future '' and `` continued to control our cost base '' . `` Looking forward to 2017 , we have started the year with a good pipeline of interest from clients and encouraging performance across most of our strategies as the new global political environment has created many alpha opportunities , but it remains early days in an uncertain market , '' he said .",
 "Tue

## Convert articles to the format read by the TensorFlow model:

In [4]:
# Convert all articles and save them under the name "test.bin".
# NOTE(review): the decoder cell reads DATA_PATH + "converted_articles/chunked/test_*";
# confirm that is where process_and_save_to_disk puts its output.
sconv.process_and_save_to_disk(articles, "test.bin", verbose = True)

Processing story number: 1
Processing story number: 26
Processing story number: 51
Processing story number: 76
Processing story number: 101
Processing story number: 126
Processing story number: 151
Processing story number: 176
Processing story number: 201
Processing story number: 226
Processing story number: 251
Processing story number: 276
Processing story number: 301
Processing story number: 326
Processing story number: 351
Processing story number: 376
Processing story number: 401
Processing story number: 426
Processing story number: 451
Processing story number: 476
Processing story number: 501
Processing story number: 526
Processing story number: 551
Processing story number: 576
Processing story number: 601
Processing story number: 626
Processing story number: 651
Processing story number: 676
Processing story number: 701
Processing story number: 726
Processing story number: 751
Processing story number: 776
Processing story number: 801
Processing story number: 826
Processing story nu

## Run TensorFlow model in decoder mode:

In [7]:
# Pickle handed to the decoder; it is read back in the "Look at results" section.
summarizer_internal_pickle = DATA_PATH + "pickles/decoded_stories_more_coverage.pickle"

# Earlier invocations, kept for reference:
#sutils.run_summarization_model_decoder(summarizer_internal_pickle, "pretrained_model_tf1.2.1_original")
#sutils.run_summarization_model_decoder(summarizer_internal_pickle, "coverage_trained")
sutils.run_summarization_model_decoder(
    summarizer_internal_pickle,
    data_path=DATA_PATH + "converted_articles/chunked/test_*",
    vocab_path=DATA_PATH + "summarizer_training_data/finished_files/vocab",
    log_root=DATA_PATH + "summarizer_models",
    exp_name="more_coverage",
)

Starting TensorFlow Decoder...
INFO:tensorflow:Starting seq2seq_attention in asdfdsaf decode mode...
INFO:tensorflow:Current folder /Users/arturs/gpu-projects/Zentropy/components/summarizer








max_size of vocab was specified as 50000; we now have 50000 words. Stopping reading.
Finished constructing vocabulary of 50000 total words. Last word added: farina
INFO:tensorflow:Building graph...
INFO:tensorflow:Adding attention_decoder timestep 0 of 1
INFO:tensorflow:Time to build graph: 1 seconds
INFO:tensorflow:Loading checkpoint ../../data/summarizer_models/more_coverage/train/model.ckpt-363378
INFO:tensorflow:Restoring parameters from ../../data/summarizer_models/more_coverage/train/model.ckpt-363378
25 articles summarized
50 articles summarized


KeyboardInterrupt: 

## Look at results:

In [7]:
# Load the decoder results. The context manager closes the file handle,
# which the bare pickle.load(open(...)) form left open.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
with open(summarizer_internal_pickle, "rb") as pickle_file:
    summarization_output = pickle.load(pickle_file)

In [8]:
# Print every decoded summary, each followed by blank lines for readability.
for summary in summarization_output['summaries']:
    print(summary + "\n\n")


net management fees fell by 9pc over the year to $ 691m while performance fees dived by almost two-thirds to $ 112m . chief executive luke ellis , who took the reins in the middle of 2016 , said man group has `` made real progress in repositioning the firm for the future '' chief executive luke ellis , who took the reins in 2016 , said man group has `` made real progress in repositioning the firm for the future ''


british american tobacco took the plastic film off another industry consolidation effort . the ftse 100 company said the deal has been unanimously approved by the transaction committee of independent reynolds established to evaluate the bat offer . reynolds shareholders will receive $ 29.44 in cash and 0.5260 bat ordinary shares for each reynolds share .


glencore plc rode a wave of surging commodity prices in 2016 to return to a profit of $ 1.4 billion . chief executive ivan glasenberg now faces a tough decision : start splurging on new mergers or acquisitions , or return

## The summaries are all lower case, so the named entity detector will complain!

In [9]:
# Try to restore upper case in the decoded summary tokens
# (sutils.try_fix_upper_case_for_summaries); only the first 10 articles are passed in.
tokenized_summaries = sutils.try_fix_upper_case_for_summaries(
    articles[:10],
    summarization_output['summaries_tokens'],
)

detokenizer = MosesDetokenizer()
detokenized_summaries = []

# Join each token list back into a plain string, collecting and printing as we go.
for summary_tokens in tokenized_summaries:
    detokenized_summaries.append(detokenizer.detokenize(summary_tokens, return_str=True))
    print(detokenized_summaries[-1] + "\n\n")

Net management fees fell by 9pc over the year to $691m while performance fees dived by almost two-thirds to $112m. Chief executive Luke Ellis, who took the reins in the middle of 2016, said Man Group has ``made real progress in repositioning the firm for the future ''Chief executive Luke Ellis, who took the reins in 2016, said Man Group has`` made real progress in repositioning the firm for the future''


British American tobacco took the plastic film off another industry consolidation effort. the FTSE 100 company said the deal has been unanimously approved by the transaction committee of independent Reynolds established to evaluate the BAT offer. Reynolds shareholders will receive $29.44 in cash and 0.5260 BAT ordinary shares for each Reynolds share.


Glencore PLC rode a wave of surging commodity prices in 2016 to return to a profit of $1.4 billion. Chief executive Ivan Glasenberg now faces a tough decision: start splurging on new mergers or acquisitions, or return the rewards to sha

In [1]:
# Number of detokenized summaries. This needs the detokenization cell above to have
# been run in the current kernel; the recorded NameError comes from running it first.
len(detokenized_summaries)

NameError: name 'detokenized_summaries' is not defined

## Much better! Next let's look at our baseline summaries:

In [8]:
# Baseline summaries for all articles, built by sutils.get_3_sentence_summaries.
summaries_3sent = sutils.get_3_sentence_summaries(articles)


## Send data to named entity extractor:

In [32]:
# Bundle what the named-entity step needs and persist it as a pickle.
# The path is built from DATA_PATH, like the other data paths in this notebook.
summarizer_output_pickle = DATA_PATH + "pickles/summarizer_output.pickle"

summarizer_output = {
    'ids' : ids,
    'stories': articles,
     #'summaries': detokenized_summaries,
    'summaries_3sent': summaries_3sent
}

# Context manager so the file is flushed and closed even if pickling fails;
# pickle.dump(..., open(...)) left the handle open.
with open(summarizer_output_pickle, "wb") as pickle_file:
    pickle.dump(summarizer_output, pickle_file)

In [37]:
# Sanity check: how many stories went into the pickle.
len(summarizer_output['stories'])

3986

## Load NER library:

In [16]:
import sys
sys.path.append("../ner")

import NERutils as ner


In [15]:
# NOTE(review): scratch variable, not referenced anywhere in the visible notebook — candidate for removal.
foo_bar = 3

## Showtime:

In [23]:
def extract_orgs(stories):
    """Run the external NER tagger over ``stories`` and return the entities found.

    Each story is flattened to a single line, word-tokenized, passed through
    ``ner.sentenceSplitter`` and written to ``../ner/input.txt`` with
    ``ner.writeArticle``. The tagger script is then run on that file and its
    output (``../ner/output.txt``) is handed to ``ner.findNamedEntities``.

    Args:
        stories: iterable of story/summary strings.

    Returns:
        The result of ``ner.findNamedEntities`` for the tagged text. (The
        previous version computed this value but returned ``stories`` instead.)

    Raises:
        RuntimeError: if the tagger command exits with a non-zero status.
    """
    # 'with' guarantees the input file is closed even if tokenizing a story fails.
    with open('../ner/input.txt','w') as inputFile:
        print('RUNNING TOKENIZER')
        print('SPLITTING SENTENCES LINE BY LINE')
        # NOTE(review): the recorded run of this cell stopped with an IndexError
        # before 'RUNNING MODEL' was printed, i.e. somewhere in this loop —
        # possibly an empty story; confirm inside NERutils.
        for story in stories:
            storyCombined = story.replace('\n', ' ')
            storyTokenized = tokenize.word_tokenize(storyCombined)
            split = ner.sentenceSplitter(storyTokenized)
            ner.writeArticle(split, inputFile)

    print('RUNNING MODEL')
    # The tagger is launched as a separate python2.7 process.
    status = os.system('python2.7 ../ner/tagger-master/tagger.py --model ../ner/tagger-master/models/english/ --input ../ner/input.txt --output ../ner/output.txt')
    if status != 0:
        # Without this check a failed run would silently reuse a stale output.txt.
        raise RuntimeError('NER tagger exited with status %s' % status)

    with open(r'../ner/output.txt','r') as taggedFile:
        namedStory = taggedFile.read().replace('\n', ' ')

    # Get named entities:
    orgs = ner.findNamedEntities(namedStory.split(' '))

    return orgs

ner_articles = extract_orgs(summarizer_output['stories'])
# Fixed the 'summarizer_woutput' typo (NameError). The 'summaries' entry is
# commented out where summarizer_output is built, so only tag it when present.
if 'summaries' in summarizer_output:
    ner_summaries = extract_orgs(summarizer_output['summaries'])
else:
    ner_summaries = None
ner_summaries_3sent = extract_orgs(summarizer_output['summaries_3sent'])

RUNNING TOKENIZER
SPLITTING SENTENCES LINE BY LINE


IndexError: list index out of range

In [29]:
# Step-by-step copy of the extract_orgs() body, run at top level for debugging.
stories = summarizer_output['stories']
# 'with' guarantees the input file is closed even if tokenizing a story fails.
with open('../ner/input.txt','w') as inputFile:
    print('RUNNING TOKENIZER')
    print('SPLITTING SENTENCES LINE BY LINE')
    # NOTE(review): the recorded run stopped with an AttributeError before
    # 'RUNNING MODEL' was printed, i.e. somewhere in this loop — confirm inside NERutils.
    for story in stories:
        storyCombined = story.replace('\n', ' ')
        storyTokenized = tokenize.word_tokenize(storyCombined)
        split = ner.sentenceSplitter(storyTokenized)
        ner.writeArticle(split, inputFile)

print('RUNNING MODEL')
# The tagger is launched as a separate python2.7 process.
status = os.system('python2.7 ../ner/tagger-master/tagger.py --model ../ner/tagger-master/models/english/ --input ../ner/input.txt --output ../ner/output.txt')
if status != 0:
    # Without this check a failed run would silently reuse a stale output.txt.
    raise RuntimeError('NER tagger exited with status %s' % status)

# Separate names for the file handle and the text read from it.
with open(r'../ner/output.txt','r') as taggedFile:
    namedStory = taggedFile.read().replace('\n', ' ')

# Get named entities:
orgs = ner.findNamedEntities(namedStory.split(' '))

RUNNING TOKENIZER
SPLITTING SENTENCES LINE BY LINE


AttributeError: 'list' object has no attribute 'split'

In [28]:
# Inspect the first story to see what is being fed to the NER step.
summarizer_output['stories'][0]

'Net management fees fell by 9pc over the course of the year to $691m while performance fees dived by almost two-thirds to $112m. As a result the firm’s shares dropped by 10pc in early trading, but recovered later in the day to a more modest 1.8pc loss. In part that was due to a rise in overall assets under management which increased by 3pc to $80.9bn. The company prefers to focus on adjusted pre-tax profits, which came in at $205m, down from 2015’s profit of $400m. Chief executive Luke Ellis, who took the reins in the middle of 2016, said Man Group has “made real progress in repositioning the firm for the future” and “continued to control our cost base”.“Looking forward to 2017, we have started the year with a good pipeline of interest from clients and encouraging performance across most of our strategies as the new global political environment has created many alpha opportunities, but it remains early days in an uncertain market,” he said.'