# Summarisation notebook

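This notebook fine-tunes a pretrained sequence-to-sequence language model (BART or the Longformer Encoder-Decoder, LED) for multi-document summarisation on the Multi-News dataset with PyTorch Lightning, then tests the model and runs inference on an example document.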

(More descriptions)

In [1]:
# ENSURE RUNTIME IS GPU (if on COLAB)
# This assumes that you have cloned your git repo and loaded it into your Google Drive to be used
# from google.colab import drive
# drive.mount('/content/drive')

# change to working dir
%cd drive/MyDrive/ALTA2021_tutorial/summarisation/

/content/drive/MyDrive/ALTA2021_tutorial/summarisation


In [2]:
# Optional, disabled by default: print PYTHONPATH, override it, then print it again.
# NOTE(review): %env appears to take the value literally, so the double quotes below
# would become part of PYTHONPATH if this line were enabled; confirm before use.
# !echo $PYTHONPATH
# %env PYTHONPATH="/env/python:/content/drive/MyDrive/ALTA2021_tutorial"
# !echo $PYTHONPATH

In [3]:
# One-off environment setup. Every step is disabled; uncomment the ones you need.

# Clone repo (if not done)
# !git clone https://github.com/ijauregiCMCRC/ALTA2021_tutorial

# Install requirements
# !pip install -r requirements.txt

# Download models (Load BART model from huggingface, but load Longformer from pretrained ckpt due to config issue)
# BART
# !wget https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz -P ../pretrained_lms/
# !tar -xzvf ../pretrained_lms/bart.base.tar.gz # extract the archive into the current directory
# !mv bart.base/ ../pretrained_lms/facebook-bart-base/  # move to model location
# !rm -rf ../pretrained_lms/bart.base.tar.gz  # remove the tar.gz file to free disk space

### 1. Import packages

In [4]:
import os
os.getcwd()
import random
import numpy as np
import textwrap  # for inference example

import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TestTubeLogger
from pytorch_lightning.callbacks import ModelCheckpoint
import nlp

from src.summarisation_lightning_model import LmForSummarisation

### 2. Define parameters

In [5]:
# Run configuration: sequence limits, model/tokenizer choice, optimisation settings and paths.
# The values in parentheses are the alternatives suggested for each model family.
args = dict(
    # --- Sequence lengths ---
    max_input_len=8192,  # Max number of tokens in the source documents (512 for BART-base, 2048 for LED-base)
    max_output_len=256,  # Max number of tokens in the summary
    # --- Model, tokenizer and output location ---
    save_dir='../models/summarisation_led',  # Where the model and logs are saved ('models/summarisation_bart' for BART, 'models/summarisation_led' for LED)
    tokenizer='facebook/bart-base',  # Pretrained tokenizer
    model_path='allenai/led-base-16384',  # Pretrained model ('facebook/bart-base' for BART, 'allenai/led-base-16384' for LED)
    # --- Optimisation ---
    label_smoothing=0.0,  # Label smoothing (not required)
    epochs=1,  # Number of training epochs
    batch_size=1,  # Batch size (1 for LED, 4 for BART)
    grad_accum=1,  # Gradient accumulation (4 for LED for effective batch size, 1 for BART to keep consistent)
    lr=3e-5,  # Training learning rate
    warmup=1000,  # Number of warmup steps
    # --- Hardware ---
    gpus=1,  # Number of GPUs; 0 for CPU
    precision=32,  # Double (64), full (32) or half (16) precision
    # --- Data ---
    cache_dir='../datasets/cache/',  # Dataset cache where the converted dataset is kept
    # --- Model internals ---
    attention_dropout=0.1,  # Default
    adafactor=True,  # Use the Adafactor optimizer, else Adam
    debug=False,  # Debug run
    num_workers=0,  # Number of data loader workers
    grad_ckpt=True,  # Gradient checkpointing to save memory
    attention_mode='sliding_chunks',  # Longformer attention mode
    attention_window=512,  # Longformer attention window
)

### 3. Initialize Lightning module

In [6]:
# Seed every random number generator with the same value so runs are repeatable
seed = 1234
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
# Seed all CUDA devices as well when a GPU is present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    
# Dataset size, needed to compute the number of steps for the lr scheduler.
# The value is entered by hand rather than read from the loaded dataset.
args.update(dataset_size=50594)

# Build the PyTorch Lightning module from the configuration
model = LmForSummarisation(args)

# Attach the Multi-News dataset, cached under args['cache_dir']
model.hf_datasets = nlp.load_dataset('multi_news', cache_dir=args['cache_dir'])

# Define logger: writes under <save_dir>/training
logger = TestTubeLogger(
    save_dir=args['save_dir'],
    name='training',
    version=0  # always use version=0
)

# Define checkpoint saver: stores checkpoints under <save_dir>/training/checkpoints
# NOTE(review): `period` looks deprecated/removed in newer PyTorch Lightning releases
# (replaced by `every_n_epochs`); confirm against the version pinned in requirements.txt.
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(args['save_dir'], "training", "checkpoints"),  # Dir path
    save_top_k=1,  # Maximum number of checkpoints to be saved
    verbose=True,  # Verbose
    monitor='avg_val_loss',  # Quantity used to rank checkpoints (logged as 'avg_val_loss')
    mode='min',      # Lower is better: keep the checkpoint with the smallest monitored value
    period=1         # Save every epoch
)

# Echo the final configuration so it is recorded in the notebook output
print(args)


# Define lightning trainer
# NOTE(review): the output recorded for this cell is a TypeError. Its cause is not visible
# here; it may be a keyword argument that the installed PyTorch Lightning version does not
# accept (e.g. `distributed_backend`, or `period` on ModelCheckpoint) — confirm against the
# version pinned in requirements.txt.
trainer = pl.Trainer(
    gpus=args['gpus'],
    distributed_backend='dp' if torch.cuda.is_available() else None,
    track_grad_norm=-1,  # -1 turns gradient-norm tracking off
    max_epochs=args['epochs'],
    max_steps=None,
    replace_sampler_ddp=False,
    accumulate_grad_batches=args['grad_accum'],
    gradient_clip_val=1.0,  # Max grad_norm
    val_check_interval=1.0,  # Float = fraction of a training epoch between validation runs (1.0: once per epoch)
    num_sanity_val_steps=2,  # Validation steps for sanity check
    check_val_every_n_epoch=1,  # Check validation every N epochs
    logger=logger,
    callbacks=[checkpoint_callback],  # Passed as a list, the form every Lightning 1.x release accepts
    progress_bar_refresh_rate=10,  # Progress bar for printing (updates every N)
    precision=args['precision'],
    # NOTE(review): amp_level is an Apex optimisation level; presumably unused with the
    # native AMP backend — confirm.
    amp_backend='native', amp_level='O2'
)

TypeError: ignored

### 4. Train model

In [5]:
# Train model: run the Lightning training loop on `model` with the configured trainer
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                        | Params
------------------------------------------------------
0 | model | LEDForConditionalGeneration | 161 M 
------------------------------------------------------
161 M     Trainable params
0         Non-trainable params
161 M     Total params
647.378   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"Your {mode}_dataloader has `shuffle=True`, it is best practice to turn"


{'vloss': tensor(3.1148, device='cuda:0'), 'rouge1': tensor(0.1216, device='cuda:0'), 'rouge2': tensor(0.0311, device='cuda:0'), 'rougeL': tensor(0.0811, device='cuda:0'), 'rougeLsum': tensor(0.0811, device='cuda:0')}


Training: -1it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


### 5. Test model

In [None]:
# Test model: run the Lightning test loop on `model` with the configured trainer
trainer.test(model)

### 6. Inference

In [5]:
# Restore a trained LmForSummarisation from a saved checkpoint.
# '<path_to_model>' is a placeholder: replace it with the path of a real .ckpt file
# (e.g. one written during training) before running this cell.
model = LmForSummarisation.load_from_checkpoint('../models/<path_to_model>.ckpt')

# Example from Multi-News: 3 source documents in a single string, delimited by '|||||'
document = 'If True, Building Set For Demolition Could Be Manhattan\'s Oldest October 15, 2013 5:39 PM      Preservationist Adam Woodward discovered a cellar that he believes could be the foundation of the Revolutionary War-era Bull’s Head Tavern. (credit: Adam Woodward)      NEW YORK (CBSNewYork) — A preservationist says he has found evidence that a Manhattan building is the former site of an 18th-century tavern where George Washington is believed to have enjoyed a celebratory drink during the American Revolution.      If it is indeed the home of the legendary watering hole, the discovery could mean that the building that is perhaps Manhattan’s oldest is slated to demolished.      “After the English had marched up the Bowery and out of the city (in 1783), George Washington and Governor (George) Clinton stopped at the Bull’s Head (tavern),” preservationist Adam Woodward told WCBS 880’s Alex Silverman.      play pause Preservationist Believes He\'s Found Tavern Where George Washington Visited      WCBS 880\'s Alex Silverman...                  The building at 50 Bowery, which has had many faces since, is being prepared for demolition so a hotel can be built at the site. Legend had it that “the Bull’s Head’s structure, cellar, bones” were still inside, Woodward said.      He decided to poke around and, in the basement, Woodward found what he believes are Colonial-era, hand-hewn and hand-planed joists and foundation walls.      “Found myself in what I am pretty certain is the 1750s historic tavern,” he said.      Woodward said he felt compelled to investigate in the building, which once housed a chain drugstore and the Atlantic Garden beer garden, because time was running out.      “I just realized that it would be the last chance to solve one of the great mysteries of New York City history,” he said.      
“It was pretty incredible walking back in time 250 years.”      Historian and author David Freeland told Silverman that the find “would make it very likely the oldest building remaining in Manhattan.”      That has Woodward hoping city officials will act quickly to preserve the site.      “What an incredible opportunity that the city suddenly has for this thing to re-emerge,” he said.      You May Also Be Interested In These Stories ||||| Photo      Maybe George Washington slept there, or maybe he only watered his horse and ordered stronger stuff for himself. Either way, David Freeland sounded excited as he crossed the threshold where a famous Colonial-era tavern, the Bull’s Head, once welcomed thirsty out-of-towners.      “There are treasures inside,” said Mr. Freeland, an author and a historian who researched the site for a book about a beer garden that later occupied the tavern’s place on the Bowery.      Photo      But all he saw was debris from the building’s most recent life, as a chain drugstore with a Chinese restaurant upstairs. He did not reach the treasures that thrilled local-history aficionados over the weekend — namely, some old-looking joists and foundation walls in the basement — because the steps were blocked by rubble. The site is to be cleared for a hotel.      The joists were discovered by a photographer and preservationist, Adam Woodward, who suspects that structural elements of the Colonial-era tavern were used in the construction of the much larger beer hall, the Atlantic Garden. It reigned as “one of the show places of New York” from 1858 on, The New York Times said when it finally shut down in 1911.      But what about the tavern where Washington established his temporary headquarters in November 1783 as the British withdrew?      “The whole issue of whether the Bull’s Head was buried inside the Atlantic Garden was one of the great mysteries of New York,” Mr. Woodward said.      Until, apparently, the other day, when he got a look inside. 
He saw iron work from the 19th century and I-beams from later on. And then he saw a stairway to the basement, and headed down.      “At one point there was a distinct change in the building material, from cinder block to a brick-and-stone foundation wall,” he said. “I followed that wall and found myself at the front of the building, under the sidewalk at the Bowery, and looked up and saw what looked to me like 18th-century hand-hewn and hand-planed joists and beams with extremely wide floorboards right above them.”      He said, “I was thinking, I am standing in the cellar of the Bull’s Head.”      The Bull’s Head opened around 1750 on the fringe of what was a still-young city concentrated below the Bowery. Washington and his troops marched down the Bowery and stopped there in 1783 before making “their official entrance into the city proper,” said Kerri Culhane, a historian who wrote the application that won the Bowery a place on the National Register of Historic Places.      The neighborhood “was a butchers’ district in the 18th century and the 19th century,” Ms. Culhane said. “People drove livestock down from the hinterland and the slaughterhouse was behind the Bowery. That’s where the trading took place.”      It was also a home to the ancestors of future V.I.P.’s. “The Astors started out as butchers,” she said, but began snapping up land. They even owned the Bull’s Head site.      But the tavern closed. Mr. Freeland wrote that the building became a store that sold stoves until the Atlantic Garden opened as a beer garden.      It was a popular gathering place for German immigrants in its early days, and in the 1870s and 1880s, the Atlantic Garden was raided repeatedly for selling beer on Sundays, when the city’s excise laws appeared to forbid that. Mr. 
Freeland noted that the laws did not mention beer, only “intoxicating liquors or wines.” The Atlantic Garden’s owner got off after one raid because the judge sampled the beer the police had seized and complained it was so watered down that “a man might drink by the gallon without getting drunk.”      Later still, the Atlantic Garden became “a place where Tin Pan Alley songwriters would go to plug their songs,” Mr. Freeland said. One tune that apparently got its start there in the 1890s was “Daisy Bell,” the song that turned the phrase “bicycle built for two” into a catchphrase.      Mr. Woodward said he hoped the demolition for the hotel could be delayed long enough for “a proper archaeological exploration.” (Calls to the owner were not returned on Monday.)      “I can’t think of another lot in Manhattan that has a more important history,” Mr. Woodward said, “and the fact that it might be intact, a couple of feet under the building, is an incredible opportunity to get on archaeological record.”             ||||| Elected officials and the Landmarks Preservation Commission are both doing their best to launch a thorough investigation of what may very well be the famed 18th century saloon the Bull\'s Head Tavern, but their options are limited. Photographer Adam Woodward first documented the ancient, hand-planed wood joists and stone foundation in the basement of 50 Bowery last week, and both he and historian David Freeland are convinced that these are the remnants of Bull\'s Head. If so, this would be the oldest surviving structure in Manhattan by far, and as Woodward puts it, an "incredible opportunity to get on archaeological record." But the fate of the site depends on the current owner, Alex Chu, who is demolishing the site to make way for a new hotel.      The Landmarks Commission tells The Lo-Down that they\'re "aware of the situation," but "cannot require the owner to conduct archaeology." The best they can do is give the owner a list of good archaeologists. 
Some elected officials are also getting involved, but again they\'ve got to take it up with Chu first.      · Landmarks Commission: No Jurisdiction to Mandate Historic Site Survey at 50 Bowery [TLD]      · All Coverage of Bull\'s Head Tavern [~ENY~] |||||'
# Show the example input wrapped to 100 characters per line for readability
print(textwrap.fill(document, width=100))

If True, Building Set For Demolition Could Be Manhattan's Oldest October 15, 2013 5:39 PM
Preservationist Adam Woodward discovered a cellar that he believes could be the foundation of the
Revolutionary War-era Bull’s Head Tavern. (credit: Adam Woodward)      NEW YORK (CBSNewYork) — A
preservationist says he has found evidence that a Manhattan building is the former site of an 18th-
century tavern where George Washington is believed to have enjoyed a celebratory drink during the
American Revolution.      If it is indeed the home of the legendary watering hole, the discovery
could mean that the building that is perhaps Manhattan’s oldest is slated to demolished.      “After
the English had marched up the Bowery and out of the city (in 1783), George Washington and Governor
(George) Clinton stopped at the Bull’s Head (tavern),” preservationist Adam Woodward told WCBS 880’s
Alex Silverman.      play pause Preservationist Believes He's Found Tavern Where George Washington
Visited      WCBS 8

In [6]:
# Summarise the example document with the restored model
summary = model.summarise_example(document)
# Print the first element of the result, wrapped to 100 characters per line
print(textwrap.fill(summary[0], width=100))

  doc_attention_mask = torch.nn.utils.rnn.pad_sequence([torch.tensor(attention_mask)],


If True, Building Set For Demolition Could Be Manhattan's Oldest October 15, 2013 5:39 PM   Â
Preservationist Adam Woodward discovered a cellar that he believes could be the foundation of the
Revolutionary War-era Bull’s Head Tavern. (credit: Adam Woodward) ,   NEW YORK (CBSNewYork) — A
preservationist says he has found evidence that a Manhattan building is the former site of an 18th-
century tavern where George Washington is believed to have enjoyed a celebratory drink during the
American Revolution.  ,   If it is indeed the home of the legendary watering hole, the discovery
could mean that the building that is perhaps Manhattan’’ oldest is slated to demolished. ’   “After
the English had marched up the Bowery and out of the city (in 1783), George Washington and Governor
(George) Clinton stopped at the Bull”s Head (tavern),” preservationist Adam Wright told WCBS 880’S
Alex Silverman.  …   … play slideshow 1 of 11  ”   [The Bull‘s Head’ is the home to George
Washington,�
