In [0]:
# Import libraries
import gc
import glob
import hashlib
import itertools
import json
import os
from os.path import join as pjoin
import random
import re
import shutil
import time
import subprocess
from collections import Counter


import torch
from multiprocess import Pool

In [3]:
# Mount Google Drive so the project data under /content/drive is readable and writable.
# NOTE: relies on the google.colab package and asks for an authorization code when run.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Set root path: the project folder on the mounted Drive.
# tokenize() and format_to_lines() join their src/dest arguments onto this path.
root_path = "/content/drive/My Drive/News_Summarization_with_BERT"

In [0]:
# Set working directory to the Stanford CoreNLP distribution folder.
# tokenize() invokes java with the classpath '*', which is resolved against the
# current directory, so this has to run before tokenize() is called.
# The path is derived from root_path instead of repeating the absolute Drive path.
os.chdir(os.path.join(root_path, "newsroom_data", "stanford-corenlp-full-2018-10-05"))

### Sentence Splitting and Tokenization

In [0]:
def tokenize(src, dest):
  """
  Tokenizes the raw article files with Stanford CoreNLP and splits them into sentences.

  Every file in `src` whose name ends in 'p' is written to a temporary file list and
  passed to the CoreNLP `tokenize,ssplit` annotators with json output; the results
  are written to `dest`. The java classpath is given as '*', so the current working
  directory must be the CoreNLP distribution folder when this is called.

  Keyword arguments:
  src: the source path (relative to root_path) containing input files
  dest: the destination path (relative to root_path) to save output (generated json) files

  Raises:
  Exception: if CoreNLP exits with a non-zero status, or if `dest` does not contain
    the same number of files as were submitted for tokenization.
  """

  articles_dir = os.path.join(root_path, src)
  tokenized_articles_dir = os.path.join(root_path, dest)
  mapping_file = "mapping_for_corenlp.txt"

  print("Preparing to tokenize %s to %s..." % (articles_dir, tokenized_articles_dir))
  # Only files ending in 'p' are submitted; count exactly those so that the
  # progress message and the final sanity check agree with what CoreNLP receives.
  articles = [s for s in os.listdir(articles_dir) if s.endswith('p')]
  os.makedirs(tokenized_articles_dir, exist_ok=True)

  try:
    # Make list of the files to process; write to 'mapping_for_corenlp.txt'
    with open(mapping_file, "w") as f:
      for s in articles:
        f.write("%s\n" % (os.path.join(articles_dir, s)))
    print("Tokenizing %i files in %s and saving in %s..." % (len(articles), articles_dir, tokenized_articles_dir))

    # Run Stanford CoreNLP command
    # This command preprocesses (tokenizes the texts and splits them into sentences)
    # the files and writes their output to json files in the target directory.
    # An argument list (no shell) keeps paths containing spaces intact, and the
    # output directory now follows `dest` instead of a hard-coded path.
    command = [
        "java", "-mx4g", "-cp", "*", "edu.stanford.nlp.pipeline.StanfordCoreNLP",
        "-annotators", "tokenize,ssplit",
        "-ssplit.newlineIsSentenceBreak", "always",
        "-filelist", mapping_file,
        "-outputFormat", "json",
        "-outputDirectory", tokenized_articles_dir,
    ]
    # Relay CoreNLP's progress output to the cell as it is produced.
    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          universal_newlines=True) as proc:
      for line in proc.stdout:
        print(line, end="")
    if proc.returncode != 0:
      raise Exception("Stanford CoreNLP exited with status %i while tokenizing %s." % (
          proc.returncode, articles_dir))
    print("Stanford CoreNLP Tokenizer has finished.")
  finally:
    # Remove the temporary file list even when tokenization fails.
    if os.path.exists(mapping_file):
      os.remove(mapping_file)

  # Check that the tokenized stories directory contains the same number of files as were submitted
  num_orig = len(articles)
  num_tokenized = len(os.listdir(tokenized_articles_dir))
  if num_orig != num_tokenized:
      raise Exception(
          "The tokenized stories directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during tokenization?" % (
              tokenized_articles_dir, num_tokenized, articles_dir, num_orig))
  print("Successfully finished tokenizing %s to %s.\n" % (articles_dir, tokenized_articles_dir))

In [0]:
# Get src and dest paths (both are joined onto root_path inside tokenize)
src = "newsroom_data/raw_data"
dest = "newsroom_data/tokenized_data"

# Preprocess the files: tokenize every raw article and split it into sentences
tokenize(src, dest)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing file /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/raw_data/newsroom_train_2508.p ... writing to /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/tokenized_data/newsroom_train_2508.p.json
Annotating file /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/raw_data/newsroom_train_2508.p ... done [0.0 sec].
Processing file /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/raw_data/newsroom_train_2509.p ... writing to /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/tokenized_data/newsroom_train_2509.p.json
Annotating file /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/raw_data/newsroom_train_2509.p ... done [0.0 sec].
Processing file /content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/raw_data/newsroom_train_2510.p ... writing to /content/drive/My Drive/News_Summarization_with_BERT/newsroom_

### Format to Simpler Json Files

In [0]:
# Change working directory to the dataset folder
# (derived from root_path instead of repeating the absolute Drive path)
os.chdir(os.path.join(root_path, "newsroom_data"))

In [0]:
# Create mapping dictionary: escaped token forms -> the characters they stand for.
# '<s>' maps to the empty string, i.e. it is removed.
REMAP = {"-lrb-": "(", "-rrb-": ")", "-lcb-": "{", "-rcb-": "}",
         "-lsb-": "[", "-rsb-": "]", "``": '"', "''": '"', "<s>": ""}

# One alternation built from the REMAP keys, so the pattern and the table cannot drift apart.
_REMAP_PATTERN = re.compile("|".join(re.escape(key) for key in REMAP))

# Map regex matches to REMAP
def clean(x):
  """Replaces token transforms of special characters with their original forms.

  Keyword argument:
  x -- the text to clean

  Returns a copy of x in which every occurrence of a REMAP key is replaced by
  its REMAP value; all other text is left untouched.
  """
  return _REMAP_PATTERN.sub(lambda m: REMAP[m.group()], x)

# Load tokenized json files
def load_json(p, lower = True):
  """Loads a tokenized json file and returns the article and summary as token lists.

  Sentences are read in order from the file's 'sentences' list. Sentences before
  the first one whose first token is '<s>' are article sentences. From that
  sentence onwards all tokens belong to the summary, and each sentence starting
  with '<s>' opens a new summary entry.

  Keyword arguments:
  p -- path of the json file
  lower -- if True (default), lower-case every token before further processing

  Returns:
  (source, tgt) -- two lists of token lists, after clean() has been applied to each
  """

  source = []
  tgt = []
  flag = False  # becomes True once the first '<s>'-prefixed sentence is seen

  # Close the file deterministically and read it as UTF-8 regardless of locale.
  with open(p, encoding = 'utf-8') as json_file:
    sentences = json.load(json_file)['sentences']

  # for each sentence
  for sent in sentences:
    # Get every word token
    tokens = [t['word'] for t in sent['tokens']]
    if not tokens:
      # nothing to classify; also avoids an IndexError on tokens[0] below
      continue
    if (lower):
      # make tokens lower case
      tokens = [t.lower() for t in tokens]
    # a sentence prefixed by '<s>' starts a new entry in list 'tgt'
    if (tokens[0] == '<s>'):
      flag = True
      tgt.append([])
    if (flag):
      tgt[-1].extend(tokens)
    else:
      # append article sentences to list 'source'
      source.append(tokens)

  # Re-join each sentence so clean() can restore escaped characters, then re-split
  source = [clean(' '.join(sent)).split() for sent in source]
  tgt = [clean(' '.join(sent)).split() for sent in tgt]
  return source, tgt


def _format_to_lines(params):
  """Parses one tokenized json file into a record with keys 'src' (article) and 'tgt' (summary).

  Keyword argument:
  params -- path of the json file; it is printed before the file is loaded
  """

  print(params)
  # load_json returns (article sentences, summary sentences), in that order
  return dict(zip(('src', 'tgt'), load_json(params, lower = True)))


def _write_shard(records, target_dir, corpus_type, shard_index):
  """Writes one chunk of records as json to '<target_dir>.<corpus_type>.<shard_index>.json'."""
  pt_file = "{:s}.{:s}.{:d}.json".format(target_dir, corpus_type, shard_index)
  with open(pt_file, 'w') as save:
    save.write(json.dumps(records))


def format_to_lines(scr, dest, n_cpus = 2, shard_size = 2000):
  """Condenses the tokenized json files into large json chunks of article-summary pairs.

  Files are assigned to a corpus by the first match of 'valid', 'test', 'train'
  in the file name (the part before the first '.'); files matching none are skipped.
  Each corpus is written as '<dest>.<corpus>.<n>.json', where n counts up from 0.

  Keyword arguments:
  scr: the source path (relative to root_path) containing input files. The
       parameter keeps its original spelling so existing calls stay valid; it
       is now actually used instead of a global named `src`.
  dest: the destination path prefix (relative to root_path) for the output files
  n_cpus: number of worker processes used to parse the files (default 2)
  shard_size: a chunk is written as soon as it holds more than this many
       records (default 2000)
  """

  tokenized_articles_dir = pjoin(root_path, scr)
  target_dir = pjoin(root_path, dest)

  # `dest` is a file-name prefix; make sure the folder it lives in exists.
  output_parent = os.path.dirname(target_dir)
  if output_parent:
    os.makedirs(output_parent, exist_ok = True)

  # Lists of files per corpus, keyed by corpus name
  corpora = {'train': [], 'valid': [], 'test': []}
  # Loop through file names
  for f in glob.glob(pjoin(tokenized_articles_dir, '*.json')):
    real_name = os.path.basename(f).split('.')[0]
    # same precedence as the original if/elif chain: valid, then test, then train
    for corpus_type in ('valid', 'test', 'train'):
      if corpus_type in real_name:
        corpora[corpus_type].append(f)
        break

  # loop through the corpora
  for corpus_type in ['train', 'valid', 'test']:
    dataset = []
    p_ct = 0
    # parallel computing; the context manager tears the pool down even if a worker fails
    with Pool(n_cpus) as pool:
      # one dictionary of article and summary per file, in completion order
      for d in pool.imap_unordered(_format_to_lines, corpora[corpus_type]):
        dataset.append(d)
        # records are collected in large chunks and flushed to disk
        if (len(dataset) > shard_size):
          _write_shard(dataset, target_dir, corpus_type, p_ct)
          p_ct += 1
          dataset = []
      pool.close()
      pool.join()

    # write whatever is left over as the final chunk
    if (len(dataset) > 0):
      _write_shard(dataset, target_dir, corpus_type, p_ct)


In [0]:
# Define source and destination paths (relative to root_path).
# dest is a file-name prefix: output files are named '<dest>.<corpus>.<n>.json'.
src = "newsroom_data/tokenized_data"
dest = "newsroom_data/json_data/newsroom"

# Format to simpler json files
format_to_lines(src, dest)

### Format to PyTorch Files

In [0]:
# Change Working Directory
os.chdir("/content/drive/My Drive/News_Summarization_with_BERT/src")

!pip install pyrouge
!pip install pytorch_transformers 

Collecting pyrouge
[?25l  Downloading https://files.pythonhosted.org/packages/11/85/e522dd6b36880ca19dcf7f262b22365748f56edc6f455e7b6a37d0382c32/pyrouge-0.1.3.tar.gz (60kB)
[K     |█████▍                          | 10kB 28.5MB/s eta 0:00:01[K     |██████████▉                     | 20kB 35.3MB/s eta 0:00:01[K     |████████████████▎               | 30kB 40.8MB/s eta 0:00:01[K     |█████████████████████▋          | 40kB 42.3MB/s eta 0:00:01[K     |███████████████████████████     | 51kB 45.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 11.1MB/s 
[?25hBuilding wheels for collected packages: pyrouge
  Building wheel for pyrouge (setup.py) ... [?25l[?25hdone
  Created wheel for pyrouge: filename=pyrouge-0.1.3-cp36-none-any.whl size=191613 sha256=83def604f90222ba320a76a313dc72b1316e5a89c91ba4d9366be9c4d0f5a9d1
  Stored in directory: /root/.cache/pip/wheels/75/d3/0c/e5b04e15b6b87c42e980de3931d2686e14d36e045058983599
Successfully built pyrouge
Installing collecte

In [0]:
# Convert the generated json files into the PyTorch files used as model inputs.
# -raw_path is the directory containing the json files; -save_path is the target directory for the generated binary files.
# The -log_file path is relative, so this must run with the project's src folder as the working directory.
!python preprocess.py -mode format_to_bert -raw_path "/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/json_data" -save_path "/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/bert_data"  -lower -n_cpus 1 -log_file ../newsroom_data/logs/preprocess.log

[('train', '/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/json_data/newsroom.train.0.json', Namespace(dataset='', log_file='../newsroom_data/logs/preprocess.log', lower=True, max_src_nsents=100, max_src_ntokens_per_sent=200, max_tgt_ntokens=500, min_src_nsents=3, min_src_ntokens_per_sent=5, min_tgt_ntokens=5, mode='format_to_bert', n_cpus=1, pretrained_model='bert', raw_path='/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/json_data', save_path='/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/bert_data', select_mode='greedy', shard_size=2000, use_bert_basic_tokenizer=False), '/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/bert_data/newsroom.train.0.bert.pt'), ('train', '/content/drive/My Drive/News_Summarization_with_BERT/newsroom_data/json_data/newsroom.train.1.json', Namespace(dataset='', log_file='../newsroom_data/logs/preprocess.log', lower=True, max_src_nsents=100, max_src_ntokens_per_sent=200, max_tgt_

In [0]:
# Change directory to the folder holding the generated PyTorch files
# (derived from root_path instead of repeating the absolute Drive path)
os.chdir(os.path.join(root_path, "newsroom_data", "bert_data"))

In [0]:
# torch is already imported in the first cell; pandas is only needed for this inspection step.
import pandas as pd

# Load one generated shard and view it as a DataFrame.
# The file name is relative, so the working directory must be the bert_data folder.
# NOTE(review): torch.load unpickles the file; only load shards produced by this pipeline.
df = pd.DataFrame(torch.load("newsroom.train.20.bert.pt"))

In [8]:
# Preprocessing check: display the first rows of the loaded shard
df2 = df.head()
df2

Unnamed: 0,src,tgt,src_sent_labels,segs,clss,src_txt,tgt_txt
0,"[101, 2006, 1996, 2168, 2154, 2008, 1996, 5995...","[1, 2111, 2066, 8398, 1998, 2010, 6335, 2243, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 76, 134, 185, 220, 293, 328, 339, 372, 385...",[on the same day that the depth of donald trum...,people like trump and his ilk did not have to ...
1,"[101, 1037, 10563, 2040, 2001, 2187, 1999, 103...","[1, 2019, 2324, 29624, 29100, 29624, 11614, 20...","[1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 28, 61, 79, 103]",[a teenager who was left in a coma after a qua...,an 18-year-old who was left in a coma followin...
2,"[101, 13749, 5971, 4177, 3404, 2038, 2041, 484...","[1, 2023, 4636, 2038, 2041, 4842, 29021, 2087,...","[1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 37, 61]",[ing corporate leaders trust has outperformed ...,this fund has outperformed most of its competi...
3,"[101, 24529, 2721, 1014, 29625, 22022, 1003, 2...","[1, 26060, 9693, 2003, 24501, 24270, 4341, 199...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 47, 83, 99, 126, 172, 201, 215, 235, 247, ...",[tsla 0.34 % released a lower-priced version o...,tesla motors is resuming sales of cheaper mode...
4,"[101, 1037, 4940, 2158, 2038, 2042, 5338, 2007...","[1, 1037, 2158, 2038, 2042, 5338, 2007, 3282, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 30, 85, 139, 170, 204, 242, 268, 307, 342,...",[a melbourne man has been charged with possess...,a man has been charged with gun and drug posse...
