# Mount

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
! ls /content/gdrive/MyDrive/__projects

# repo

In [1]:
! git clone https://github.com/hieutt99/multidoc-summ

Cloning into 'multidoc-summ'...
remote: Enumerating objects: 1624, done.[K
remote: Counting objects: 100% (415/415), done.[K
remote: Compressing objects: 100% (266/266), done.[K
remote: Total 1624 (delta 327), reused 229 (delta 148), pack-reused 1209[K
Receiving objects: 100% (1624/1624), 1.27 MiB | 15.45 MiB/s, done.
Resolving deltas: 100% (1228/1228), done.


In [2]:
! cp -r ./multidoc-summ/long_transformer ./long_transformer

In [3]:
%cd long_transformer

/content/long_transformer


In [4]:
%%capture 
! pip install tensorboardX
! pip install pyrouge
! pip install transformers
! pip install ruamel.yaml
! pip install pyyaml==5.4.1
! pip install pytorch_transformers

In [5]:
# https://drive.google.com/file/d/12kFikfAfP1MOWR3brSqthWwzFft4qpuo/view?usp=sharing
! gdown --id 12kFikfAfP1MOWR3brSqthWwzFft4qpuo

Downloading...
From: https://drive.google.com/uc?id=12kFikfAfP1MOWR3brSqthWwzFft4qpuo
To: /content/long_transformer/led_abs_model.pt
100% 2.12G/2.12G [00:35<00:00, 59.3MB/s]


In [6]:
! mv led_abs_model.pt ./saved_models/led_abs_model
! ls ./saved_models/led_abs_model

./saved_models/led_abs_model


# Setup

In [7]:
! pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0


In [8]:
! pip install stanza

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.4.0-py3-none-any.whl (574 kB)
[K     |████████████████████████████████| 574 kB 15.1 MB/s 
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 72.5 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl size=193022 sha256=73d596a1539ae3d861e8c8dec2410d2284364fcf5b2028df459bc3826f5e7bba
  Stored in directory: /root/.cache/pip/wheels/ec/29/4d/3cfe7452ac7d8d83b1930f8a6205c3c9649b24e80f9029fc38
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.0.0 stanza-1.4.0


In [9]:
%%capture
! pip install tqdm

In [10]:
import os, sys
import json
from tqdm.auto import tqdm 
import re
import glob
import shutil

In [11]:
from copy import deepcopy
import jsonlines

import gc 
gc.collect()
import matplotlib.pyplot as plt
%matplotlib inline

# Utils
- read_txt
- write_txt 
- write_json 
- read_json
- chunk_data

In [12]:
def read_txt(path, read_line=True):
    '''
    Read a UTF-8 text file.

    path: file to open
    read_line: when truthy, return the list produced by readlines()
               (line endings are kept); otherwise return the whole
               content as one string
    '''
    with open(path, 'r', encoding='utf-8') as handle:
        if not read_line:
            return handle.read()
        return handle.readlines()
def write_txt(data, path):
    '''
    Overwrite `path` with the string `data`, encoded as UTF-8.
    Returns True once the write has completed.
    '''
    out_file = open(path, 'w', encoding='utf-8')
    with out_file:
        out_file.write(data)
    return True

In [13]:
def write_json(data, path):
    '''
    Serialize `data` with json.dump into `path` (UTF-8; the file is overwritten).
    Returns None.
    '''
    with open(path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file)

def read_json(path):
    '''
    Parse the UTF-8 JSON file at `path` and return the decoded object.
    '''
    with open(path, mode='r', encoding='utf-8') as in_file:
        return json.load(in_file)

In [14]:
def chunk_data(l, n):
    '''
    Lazily yield consecutive slices of `l`, each of length `n`
    (the final slice may be shorter).
    '''
    yield from (l[start:start + n] for start in range(0, len(l), n))

## Set up Stanza with Stanford CoreNLP for text tokenization

In [15]:
import stanza 
corenlp_dir = '/content/corenlp'
stanza.install_corenlp(dir=corenlp_dir)

2022-08-08 09:06:10 INFO: Installing CoreNLP package into /content/corenlp


Downloading https://huggingface.co/stanfordnlp/CoreNLP/resolve/main/stanford-corenlp-latest.zip:   0%|        …



In [16]:
os.environ["CORENLP_HOME"] = corenlp_dir

In [17]:
# client.stop()

In [18]:
from stanza.server import CoreNLPClient

# Client for a local CoreNLP server. Only sentence splitting and tokenization
# are requested; the server is given 4G of memory, listens on port 9001 and
# accepts inputs of up to 500000 characters.
client = CoreNLPClient(
    annotators=['tokenize','ssplit'], 
    memory='4G', 
    endpoint='http://localhost:9001',
    max_char_length=500000,
    be_quiet=True)
print(client)

# Start the background server explicitly.
# Note that in practice this is optional, as by default the server will be started when the first annotation is performed
client.start()
# import time; time.sleep(10)

2022-08-08 09:06:35 INFO: Writing properties to tmp file: corenlp_server-02d6582822ba442a.props
2022-08-08 09:06:35 INFO: Starting server with command: java -Xmx4G -cp /content/corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-02d6582822ba442a.props -annotators tokenize,ssplit -preload -outputFormat serialized


<stanza.server.client.CoreNLPClient object at 0x7f87c973ad90>


In [19]:
# client.start()

In [20]:
# client.stop()

In [21]:
def scnlp_tokenize(text):
    '''
    Annotate `text` with the module-level CoreNLP `client` and return one
    list of token strings (each token's originalText) per sentence.
    '''
    annotation = client.annotate(text)
    sentences = []
    for sentence in annotation.sentence:
        words = []
        for tok in sentence.token:
            words.append(tok.originalText)
        sentences.append(words)
    return sentences

## PRESET

In [22]:
modes = ['train', 'valid', 'test']

## Functions for handling raw data

In [23]:
def fully_tokenize(doc):
    '''
    input: a single source document or summary in which '\n' has been
           replaced by the literal marker NEWLINE_CHAR
    output: list of sentences, each a list of word tokens

    The text is cut at every NEWLINE_CHAR, runs of spaces are collapsed to
    one, segments that are empty or a single space are skipped, and each
    remaining segment (with surrounding spaces removed) is passed to
    scnlp_tokenize.
    '''
    tokenized = []
    for segment in doc.split('NEWLINE_CHAR'):
        segment = re.sub(' +', ' ', segment)
        if segment in ('', ' '):
            continue
        tokenized.extend(scnlp_tokenize(segment.strip(' ')))
    return tokenized

In [24]:
def tokenize_cluster(src_docs, tgt_doc):
    '''
    Tokenize one cluster: the source documents and their summary.

    src_docs: one string holding all source documents, separated by '|||||'
    tgt_doc: the summary string
    Returns (tokenized source documents, tokenized summary); source
    documents that tokenize to no sentences are left out.
    '''
    tokenized_sources = []
    for raw_doc in src_docs.split('|||||'):
        sentences = fully_tokenize(raw_doc)
        if len(sentences) > 0:
            tokenized_sources.append(sentences)
    return tokenized_sources, fully_tokenize(tgt_doc)

In [25]:
def create_txt_from_tokenized(tokenized, multi_doc=True):
    '''
    Turn tokenized text back into a plain string: tokens are joined with a
    space and sentences with a newline.

    multi_doc=True: `tokenized` is a list of stories; the rendered stories
                    are joined with a '|||||' line between them.
    multi_doc=False: `tokenized` is a single story (list of sentences).
    '''
    def _render_story(story):
        return '\n'.join(' '.join(sent) for sent in story)

    if not multi_doc:
        return _render_story(tokenized)
    return '\n|||||\n'.join(_render_story(story) for story in tokenized)

# tokenize data

In [26]:
! gdown --id 186j6r1tk6_-SiaqVyZbwtpIKFZYXzl-S
! unzip multi_news_raw.zip
! mv raw multi_news_raw

Downloading...
From: https://drive.google.com/uc?id=186j6r1tk6_-SiaqVyZbwtpIKFZYXzl-S
To: /content/long_transformer/multi_news_raw.zip
100% 255M/255M [00:03<00:00, 72.1MB/s]
Archive:  multi_news_raw.zip
   creating: raw/
  inflating: raw/README.md           
  inflating: raw/test.src            
  inflating: raw/test.tgt            
  inflating: raw/train.src           
  inflating: raw/train.tgt           
  inflating: raw/val.src             
  inflating: raw/val.tgt             


In [27]:
! mkdir multi_news_tokenized

In [28]:
MULTI_NEWS_RAW_FOLDER = 'multi_news_raw'
MULTI_NEWS_TOKENIZED = 'multi_news_tokenized'

In [186]:
test_id = 4000

In [187]:
# Load one raw split: each line of <mode>.src / <mode>.tgt is read as one entry.
mode = "test" # the raw files name the validation split "val" (val.src / val.tgt), i.e. the "valid" mode
src_file = f'{mode}.src'
tgt_file = f'{mode}.tgt'
src_path = os.path.join(MULTI_NEWS_RAW_FOLDER, src_file)
tgt_path = os.path.join(MULTI_NEWS_RAW_FOLDER, tgt_file)
# read_txt defaults to readlines(), so both are lists of lines
src_data = read_txt(src_path)
tgt_data = read_txt(tgt_path)

In [188]:
src_docs = src_data[test_id]
tgt_doc = tgt_data[test_id]

In [189]:
print(src_docs)

Andrew Gillum addresses his supporters after winning the Democrat primary for governor on Tuesday, Aug. 28, 2018, in Tallahassee, Fla. (AP Photo/Steve Cannon) (Associated Press) NEWLINE_CHAR  NEWLINE_CHAR Andrew Gillum addresses his supporters after winning the Democrat primary for governor on Tuesday, Aug. 28, 2018, in Tallahassee, Fla. (AP Photo/Steve Cannon) (Associated Press) NEWLINE_CHAR  NEWLINE_CHAR TALLAHASSEE, Florida (AP) — A liberal Florida Democrat pulled off an upset victory while President Donald Trump's favored candidate cruised to an easy win Tuesday, setting up a fierce showdown for the governor's mansion in the nation's largest political battleground. NEWLINE_CHAR  NEWLINE_CHAR Tallahassee Mayor Andrew Gillum, an unabashed progressive, won the Democratic primary, moving him a step away from becoming the state's first black governor. He'll face off against Trump-backed Republican Rep. Ron DeSantis. NEWLINE_CHAR  NEWLINE_CHAR DeSantis gave Trump credit for his victory, 

In [190]:
src_docs, tgt_doc = tokenize_cluster(src_docs, tgt_doc)

In [191]:
print(src_docs)

[[['Andrew', 'Gillum', 'addresses', 'his', 'supporters', 'after', 'winning', 'the', 'Democrat', 'primary', 'for', 'governor', 'on', 'Tuesday', ',', 'Aug.', '28', ',', '2018', ',', 'in', 'Tallahassee', ',', 'Fla.', '(', 'AP', 'Photo', '/', 'Steve', 'Cannon', ')', '(', 'Associated', 'Press', ')'], ['Andrew', 'Gillum', 'addresses', 'his', 'supporters', 'after', 'winning', 'the', 'Democrat', 'primary', 'for', 'governor', 'on', 'Tuesday', ',', 'Aug.', '28', ',', '2018', ',', 'in', 'Tallahassee', ',', 'Fla.', '(', 'AP', 'Photo', '/', 'Steve', 'Cannon', ')', '(', 'Associated', 'Press', ')'], ['TALLAHASSEE', ',', 'Florida', '(', 'AP', ')', '—', 'A', 'liberal', 'Florida', 'Democrat', 'pulled', 'off', 'an', 'upset', 'victory', 'while', 'President', 'Donald', 'Trump', "'s", 'favored', 'candidate', 'cruised', 'to', 'an', 'easy', 'win', 'Tuesday', ',', 'setting', 'up', 'a', 'fierce', 'showdown', 'for', 'the', 'governor', "'s", 'mansion', 'in', 'the', 'nation', "'s", 'largest', 'political', 'battleg

# clean

In [35]:
class DataConfig:
    # Sentence-count bounds for one source document
    # (read by clean_by_nsents and by the cleaning cell).
    min_src_nsents = 5
    max_src_nsents = 200

    # Token-count bounds for one source sentence (read by clean_by_ntokens).
    min_src_ntokens = 4
    max_src_ntokens = 200

    # Token-count bounds for the target summary; not referenced by the
    # code visible in this notebook.
    min_tgt_ntokens = 5
    max_tgt_ntokens = 500

In [36]:
import string 
def clean_by_ntokens(doc, threshold=512):
    '''
    Drop short sentences and shorten over-long ones in a single document.

    input: doc - list of sentences, each a list of word tokens
           threshold - cumulative token budget; processing stops after the
                       first sentence that pushes the number of kept tokens
                       past it. A falsy value (None / 0) disables the stop.
    output: list of kept sentences, in their original order

    Rules (limits come from DataConfig):
    - longer than max_src_ntokens: punctuation tokens other than '.' and ','
      are removed, the rest is cut to max_src_ntokens tokens, and the result
      is kept only if it still has more than min_src_ntokens tokens
    - otherwise: kept if it has at least min_src_ntokens tokens
    '''
    # Built once per call rather than once per token.
    removable = frozenset(string.punctuation) - {'.', ','}
    max_ntokens = DataConfig.max_src_ntokens
    min_ntokens = DataConfig.min_src_ntokens

    cleaned = []
    total = 0
    for sent in doc:
        if len(sent) > max_ntokens:
            new_sent = [token for token in sent if token not in removable][:max_ntokens]
            if len(new_sent) > min_ntokens:
                cleaned.append(new_sent)
                total += len(new_sent)
        elif len(sent) >= min_ntokens:
            cleaned.append(sent)
            total += len(sent)

        # The sentence that crosses the budget is still included.
        if threshold and total > threshold:
            break

    return cleaned

def clean_by_nsents(docs):
    '''
    input: list of documents, each a list of word-tokenized sentences
    output: the documents that have at least DataConfig.min_src_nsents
            sentences, each truncated to its first
            DataConfig.max_src_nsents sentences
    '''
    min_nsents = DataConfig.min_src_nsents
    max_nsents = DataConfig.max_src_nsents
    # Slice to exactly max_nsents; a `max_nsents + 1` bound lets one extra
    # sentence through.
    return [doc[:max_nsents] for doc in docs if len(doc) >= min_nsents]

In [192]:
# NOTE: src_docs is rebound at every step, so the tokenized (unfiltered)
# version is lost after this cell runs.
# 1) keep only sentences with more than min_src_ntokens tokens
src_docs = [[sent for sent in doc if len(sent)>DataConfig.min_src_ntokens] for doc in src_docs]
# 2) drop documents with too few sentences and truncate long ones
src_docs = clean_by_nsents(src_docs)
# 3) per-document sentence cleaning (clean_by_ntokens also stops at its token budget)
src_docs = [clean_by_ntokens(doc) for doc in src_docs]
# 4) strict comparison: a document left with exactly min_src_nsents sentences is dropped here
src_docs = [doc for doc in src_docs if len(doc)>DataConfig.min_src_nsents]

# transformers tokenizer

In [38]:
from transformers import LEDTokenizer

In [39]:
tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

In [93]:
class SpecialTokens():
    # String forms of the special tokens; '<s>' serves as both cls and bos,
    # '</s>' as both sep and eos.
    sep_token = '</s>'
    cls_token = '<s>'
    pad_token = '<pad>'
    bos_token = '<s>' # appears as id 0 at the start of the printed src_subtoken_idxs
    eos_token = '</s>' # NOTE(review): the earlier comment here said "1" - confirm against tokenizer.eos_token_id
    # '<ss>' is appended after each sentence and '<ds>' after the last sentence
    # of each document when the source text is assembled.
    additional_special_tokens = ['<ss>', '<ds>']

    # Vocabulary ids, read from the module-level `tokenizer` at the moment
    # this class body executes.
    sep_vid = tokenizer.sep_token_id
    cls_vid = tokenizer.cls_token_id
    pad_vid = tokenizer.pad_token_id

In [94]:
special_tokens_dict = {'additional_special_tokens': SpecialTokens.additional_special_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(num_added_toks)

0


In [95]:
def bert_tokenize(doc, special_tokens_included=True):
    '''
    Encode `doc` with the module-level `tokenizer` (no special tokens are
    added automatically) and decode the ids back to text, keeping special
    tokens and without cleaning up tokenization spaces.

    special_tokens_included is accepted but not used.
    Returns (decoded_text, token_ids).
    '''
    token_ids = tokenizer.encode(doc, add_special_tokens=False)
    decode_options = dict(clean_up_tokenization_spaces=False, skip_special_tokens=False)
    round_trip_text = tokenizer.decode(token_ids, **decode_options)
    return round_trip_text, token_ids

In [152]:
def process_src_docs(src_docs):
    '''
    Build the model inputs for a cluster of tokenized source documents.

    src_docs: list of documents, each a non-empty list of sentences, each a
              list of word tokens

    The text handed to bert_tokenize is assembled as follows:
    - the last sentence of every document gets the second additional
      special token appended
    - the very first sentence is prefixed with the cls token
    - the very last sentence gets the eos token appended; every other
      sentence gets the first additional special token appended

    Returns (src_subtoken_idxs, segments_ids, glob_mask, cls_ids, src_txt):
    - src_subtoken_idxs: token ids returned by bert_tokenize
    - segments_ids: 0/1 values alternating per span ending at a sep id
      (ids after the last sep id receive no entry)
    - glob_mask: 1 where the id equals SpecialTokens.cls_vid, else 0
    - cls_ids: positions whose id equals SpecialTokens.cls_vid
    - src_txt: every sentence as a space-joined string, without markers
    '''
    src_txt = []
    marked_sents = []
    for doc in src_docs:
        doc_sents = [' '.join(sent) for sent in doc]
        src_txt.extend(doc_sents)
        # mark the end of this document
        doc_sents[-1] = doc_sents[-1] + f' {SpecialTokens.additional_special_tokens[1]}'
        marked_sents.extend(doc_sents)

    last_pos = len(marked_sents) - 1
    for pos in range(len(marked_sents)):
        piece = marked_sents[pos]
        if pos == 0:
            piece = f'{SpecialTokens.cls_token} ' + piece
        if pos == last_pos:
            piece = piece + f' {SpecialTokens.eos_token}'
        else:
            piece = piece + f' {SpecialTokens.additional_special_tokens[0]}'
        marked_sents[pos] = piece

    _, src_subtoken_idxs = bert_tokenize(' '.join(marked_sents))

    cls_ids = []
    glob_mask = []
    segments_ids = []
    prev_sep = -1
    seg_no = 0
    for pos, tok_id in enumerate(src_subtoken_idxs):
        if tok_id == SpecialTokens.cls_vid:
            cls_ids.append(pos)
            glob_mask.append(1)
        else:
            glob_mask.append(0)
        if tok_id == SpecialTokens.sep_vid:
            # span from just after the previous sep up to and including this one
            segments_ids += (pos - prev_sep) * [seg_no % 2]
            prev_sep = pos
            seg_no += 1

    return src_subtoken_idxs, segments_ids, glob_mask, cls_ids, src_txt

def process_tgt_doc(tgt_doc):
    '''
    Build the model target for a tokenized summary.

    tgt_doc: list of sentences, each a list of word tokens
    Returns (tgt_subtoken_idxs, tgt_txt, docs_ids):
    - tgt_subtoken_idxs: ids from bert_tokenize for the summary wrapped in
      bos/eos, with the first additional special token between sentences
    - tgt_txt: the sentences joined with '<q>'
    - docs_ids: a list of zeros, one per target id
    '''
    sentences = [' '.join(sent) for sent in tgt_doc]
    tgt_txt = '<q>'.join(sentences)

    sent_joiner = f' {SpecialTokens.additional_special_tokens[0]} '
    marked = f'{SpecialTokens.bos_token} ' + sent_joiner.join(sentences) + f' {SpecialTokens.eos_token}'

    _, tgt_subtoken_idxs = bert_tokenize(marked)
    docs_ids = [0] * len(tgt_subtoken_idxs)
    return tgt_subtoken_idxs, tgt_txt, docs_ids

In [206]:
src_subtoken_idxs, segments_ids, glob_mask, cls_ids, src_txt = process_src_docs(src_docs)

In [207]:
tgt_subtoken_idxs, tgt_txt, tgt_docs_ids = process_tgt_doc(tgt_doc)

In [208]:
print(src_subtoken_idxs)

[0, 19843, 6452, 783, 8480, 39, 2732, 71, 1298, 5, 3661, 2270, 13, 2318, 15, 294, 2156, 2049, 4, 971, 2156, 199, 2156, 11, 12388, 895, 281, 7048, 2156, 7178, 4, 36, 1480, 1333, 1589, 2206, 19908, 4839, 36, 1562, 977, 4839, 50265, 19843, 6452, 783, 8480, 39, 2732, 71, 1298, 5, 3661, 2270, 13, 2318, 15, 294, 2156, 2049, 4, 971, 2156, 199, 2156, 11, 12388, 895, 281, 7048, 2156, 7178, 4, 36, 1480, 1333, 1589, 2206, 19908, 4839, 36, 1562, 977, 4839, 50265, 565, 7981, 15334, 2336, 32739, 2156, 1261, 36, 1480, 4839, 93, 83, 6176, 1261, 3661, 2468, 160, 41, 4904, 1124, 150, 270, 807, 140, 128, 29, 15411, 1984, 11806, 1720, 7, 41, 1365, 339, 294, 2156, 2749, 62, 10, 11039, 13100, 13, 5, 2318, 128, 29, 15906, 11, 5, 1226, 128, 29, 1154, 559, 29783, 479, 50265, 565, 18897, 281, 7048, 3129, 2224, 6452, 783, 2156, 41, 29491, 9512, 8212, 2156, 351, 5, 1557, 2270, 2156, 1375, 123, 10, 1149, 409, 31, 1959, 5, 194, 128, 29, 78, 909, 2318, 479, 50265, 894, 128, 890, 652, 160, 136, 140, 111, 4094, 1172, 

In [209]:
print(tokenizer.decode(src_subtoken_idxs))

<s>Andrew Gillum addresses his supporters after winning the Democrat primary for governor on Tuesday, Aug. 28, 2018, in Tallahassee, Fla. ( AP Photo / Steve Cannon ) ( Associated Press ) <ss> Andrew Gillum addresses his supporters after winning the Democrat primary for governor on Tuesday, Aug. 28, 2018, in Tallahassee, Fla. ( AP Photo / Steve Cannon ) ( Associated Press ) <ss> TALLAHASSEE, Florida ( AP ) — A liberal Florida Democrat pulled off an upset victory while President Donald Trump's favored candidate cruised to an easy win Tuesday, setting up a fierce showdown for the governor's mansion in the nation's largest political battleground. <ss> Tallahassee Mayor Andrew Gillum, an unabashed progressive, won the Democratic primary, moving him a step away from becoming the state's first black governor. <ss> He 'll face off against Trump - backed Republican Rep. Ron DeSantis. <ss> DeSantis gave Trump credit for his victory, saying that with one supportive tweet, the president " kind of 

In [219]:
max_tgt_len = 1024
max_pos = 4000

In [220]:
import bisect
# Truncate the encoded example to the length limits set in the previous cell.
src = src_subtoken_idxs
# Keep the final target id so it can be re-attached after truncation.
tgt_end_id = tgt_subtoken_idxs[-1]
# At most max_tgt_len ids, always ending with the original final id.
tgt = tgt_subtoken_idxs[:max_tgt_len][:-1] + [tgt_end_id]
segs = segments_ids
clss = cls_ids

# Same idea for the source: cut to max_pos - 1 ids, then put the final id back.
end_id = [src[-1]]
src = src[:-1][:max_pos - 1] + end_id
segs = segs[:max_pos]
# Number of cls positions that fall before max_pos (clss is in ascending order).
max_sent_id = bisect.bisect_left(clss, max_pos)
clss = clss[:max_sent_id]
glob_mask = glob_mask[:max_pos]

# One label of 1 per retained cls position.
src_sent_labels = ([1]*len(cls_ids))[:max_sent_id]

# Model

In [110]:
import argparse
import glob
import os
import random
import signal
import time

import torch
from tokenizer.utils import build_tokenizer
# from tokenizer.special_tokens import SpecialTokens

import distributed
from dataloader import dataloader
from models.model_utils import build_model
from trainer import train_builder
from dataloader.dataloader import load_dataset
from trainer.loss import abs_loss
from trainer.predictor import build_predictor
from trainer.trainer import build_trainer
from others.logging import logger, init_logger
from utils.arguments import ModelConfig

In [111]:
from utils.arguments import load_config 

# Load the demo run configuration and derive the GPU / logging setup from it.
run_config = load_config("./demo.yaml")

# One rank per comma-separated entry of visible_gpus: 0, 1, ..., n-1
run_config.gpu_ranks = list(range(len(run_config.visible_gpus.split(','))))
run_config.world_size = len(run_config.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = run_config.visible_gpus

init_logger(run_config.log_file)
# visible_gpus == '-1' selects the CPU
if run_config.visible_gpus == '-1':
    device = "cpu"
    device_id = -1
else:
    device = "cuda"
    device_id = 0

In [112]:
# Load the checkpoint named in the run configuration, rebuild the model from
# the options stored inside it, and wrap it in a predictor.
device = "cpu" if run_config.visible_gpus == '-1' else "cuda"

# Both arms of the former if/else resolved to run_config.test_from, so the
# branch is collapsed; `pt` is kept as an alias of the same path.
pt = run_config.test_from
test_from = pt
if test_from == '':
    raise ValueError('run_config.test_from is empty: no checkpoint to load')
logger.info('Loading checkpoint from %s' % test_from)

# map_location returns the storage unchanged, i.e. tensors are not moved to
# another device while loading.
checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
opt = checkpoint['opt']
run_config.model_config = ModelConfig(**opt)
print(run_config.model_config)

# NOTE: this rebinds the module-level `tokenizer`, which bert_tokenize reads.
tokenizer = build_tokenizer(run_config)
run_config.model_config.vocab_size = tokenizer.vocab_size
vocab = tokenizer.get_vocab()
# Token ids handed to the predictor; 'EOQ' is the id of the first additional special token.
symbols = {'BOS': vocab[SpecialTokens.bos_token], 'EOS': vocab[SpecialTokens.eos_token],
            'PAD': vocab[SpecialTokens.pad_token], 'EOQ':vocab[SpecialTokens.additional_special_tokens[0]]}

model = build_model(run_config.model_config, device, checkpoint, tokenizer)
model.eval()

predictor = build_predictor(run_config, tokenizer, symbols, model, logger)


[2022-08-08 09:22:34,680 INFO] Loading checkpoint from ./saved_models/led_abs_model


ModelConfig(model_name='led_abs', vocab_size=50265, freeze_bert=False, d_model=768, num_heads=8, dropout=0.1, norm_first=True, num_encoder_blocks=2, num_decoder_blocks=2, layer_norm_eps=1e-06, d_ff=2048, dec_layers=4, dec_hidden_size=768, dec_heads=8, dec_ff_size=2048, dec_dropout=0.2, max_position_embeddings=4000, bert_model='allenai/led-base-16384', type_doc_size=2)


Some weights of the model checkpoint at allenai/led-base-16384 were not used when initializing LEDModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing LEDModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LEDModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [221]:
from dataloader.dataloader import Batch 
batch = Batch([(src, tgt, segs, glob_mask, clss, src_sent_labels, src_txt, tgt_txt)], device=device, is_test=True)

In [222]:
print(batch.tgt_str)

['– Andrew Gillum pulled off an upset victory in Florida on Tuesday to become the state \'s first black nominee for governor , the AP reports .<q>The Bernie Sanders - backed liberal Democrat , currently the mayor of Tallahassee , will run against President Trump - endorsed Republican Rep. Ron DeSantis , Fox News reports , results that the AP says " immediately transformed the Florida race into one of the most closely watched gubernatorial campaigns in the country . "<q>The Huffington Post says the November election will be " a referendum " on the president .<q>Trump congratulated DeSantis on Twitter after his win .<q>More from the primaries in Florida and Arizona , both closely watched political battleground states , plus a runoff election in Oklahoma on Tuesday : Florida Senate race : Current Florida Gov. Rick Scott , who is term - limited in that role , won the GOP nomination and will run against incumbent Democratic Sen. Bill Nelson , who ran unopposed in the Democratic primary and 

In [223]:
save_pred, save_gold, save_src, c, av, av_ = predictor._handle(batch)

In [224]:
print(save_pred)

– A liberal Florida Democrat pulled off an upset victory while President Trump's favored candidate cruised to an easy win Tuesday, setting up a fierce showdown for the governor's mansion in the nation's largest political battleground.<q>Tallahassee Mayor Andrew Gillum, an unabashed progressive, won the Democratic primary, moving him a step away from becoming the state's first black governor.<q>He 'll face off against Trump - backed Republican Rep. Ron DeSantis.<q>The results immediately transformed the Florida race into one of the most closely watched gubernatorial campaigns in the country.<q>Gillum's primary victory could help Democrats boost enthusiasm among minorities who often don't vote in large numbers in years when a presidential candidate isn't on the ballot.<q>Gillum is his party's third black gubernatorial nominee this year, along with Stacey Abrams in Georgia and Ben Jealous in Maryland.<q>Gillum has become the first African - American nominee for Florida governor, reports t

In [225]:
print(save_gold)

– Andrew Gillum pulled off an upset victory in Florida on Tuesday to become the state 's first black nominee for governor , the AP reports .<q>The Bernie Sanders - backed liberal Democrat , currently the mayor of Tallahassee , will run against President Trump - endorsed Republican Rep. Ron DeSantis , Fox News reports , results that the AP says " immediately transformed the Florida race into one of the most closely watched gubernatorial campaigns in the country . "<q>The Huffington Post says the November election will be " a referendum " on the president .<q>Trump congratulated DeSantis on Twitter after his win .<q>More from the primaries in Florida and Arizona , both closely watched political battleground states , plus a runoff election in Oklahoma on Tuesday : Florida Senate race : Current Florida Gov. Rick Scott , who is term - limited in that role , won the GOP nomination and will run against incumbent Democratic Sen. Bill Nelson , who ran unopposed in the Democratic primary and is 

In [226]:
for sent in save_pred.split("<q>"):
    print(sent)

– A liberal Florida Democrat pulled off an upset victory while President Trump's favored candidate cruised to an easy win Tuesday, setting up a fierce showdown for the governor's mansion in the nation's largest political battleground.
Tallahassee Mayor Andrew Gillum, an unabashed progressive, won the Democratic primary, moving him a step away from becoming the state's first black governor.
He 'll face off against Trump - backed Republican Rep. Ron DeSantis.
The results immediately transformed the Florida race into one of the most closely watched gubernatorial campaigns in the country.
Gillum's primary victory could help Democrats boost enthusiasm among minorities who often don't vote in large numbers in years when a presidential candidate isn't on the ballot.
Gillum is his party's third black gubernatorial nominee this year, along with Stacey Abrams in Georgia and Ben Jealous in Maryland.
Gillum has become the first African - American nominee for Florida governor, reports the AP.
In Ar

In [227]:
for sent in save_gold.split("<q>"):
    print(sent)

– Andrew Gillum pulled off an upset victory in Florida on Tuesday to become the state 's first black nominee for governor , the AP reports .
The Bernie Sanders - backed liberal Democrat , currently the mayor of Tallahassee , will run against President Trump - endorsed Republican Rep. Ron DeSantis , Fox News reports , results that the AP says " immediately transformed the Florida race into one of the most closely watched gubernatorial campaigns in the country . "
The Huffington Post says the November election will be " a referendum " on the president .
Trump congratulated DeSantis on Twitter after his win .
More from the primaries in Florida and Arizona , both closely watched political battleground states , plus a runoff election in Oklahoma on Tuesday : Florida Senate race : Current Florida Gov. Rick Scott , who is term - limited in that role , won the GOP nomination and will run against incumbent Democratic Sen. Bill Nelson , who ran unopposed in the Democratic primary and is seen as 