In [1]:
#transformers 4.6.1
#pip freeze | cut -d'=' -f1 | xargs -n1 pip install -U
#nvidia-smi
#torch.cuda.is_available()
#torch.cuda.empty_cache()
import nltk
#nltk.download('punkt')
import numpy as np
#import pandas as pd
import torch
#import argparse
import mlflow
#import azureml.core
#import logging
#import sys
#import time
import os
from datasets import load_from_disk, load_metric
from transformers.integrations import MLflowCallback, AzureMLCallback
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    set_seed,
    #AutoConfig,
    #HfArgumentParser
)

# Make sure the NLTK "punkt" tokenizer data is available, downloading it on first use.
# The original fallback wrapped the download in `FileLock(".lock")`, but `FileLock` is
# never imported in this notebook, so the branch raised NameError exactly when the data
# was missing. The download is therefore performed directly, without a lock.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

In [None]:
# download model (cache)
#model_name = "google/pegasus-xsum"
#model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir = "./cache")
#tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = "./cache")

# save model
#model.save_pretrained("../model/pegasus-xsum")
#tokenizer.save_pretrained("../model/pegasus-xsum")

In [2]:
# Load the locally saved checkpoint and its tokenizer; use the GPU when torch reports one.
model_path = "../model/pegasus-xsum/"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [3]:
# Read the dataset stored at ../data/xsum and keep four examples of its "test" split
# (row indices 1000-1003) for quick manual inspection.
data = load_from_disk("../data/xsum")
test_indices = range(1000, 1004)
test_data = data["test"].select(test_indices)
test_data[0]

{'document': 'The Welsh Economy Research report showed 79% of direct spend was retained in Wales, and associations built nearly 2,000 affordable homes.\nThis was an increase of 4% on the previous year.\nThe annual report, commissioned by Community Housing Cymru, looked at the impact of social housing in Wales.\n£1.1bn\ncontributed to the economy in 2014/15\n£872m of that was retained in Wales\n1,923 new homes built in 2014/15\n£301m on repairs/maintenance in 2014/15\n£532m on regeneration in 2014/15',
 'id': '34846955',
 'summary': 'Welsh housing associations directly contributed more than £1bn to the economy in 2014/15, an independent report has said.'}

In [17]:
# inference pipeline
from transformers import SummarizationPipeline

# `model` was moved to `device` when it was loaded, but in transformers 4.6.1 (the
# version noted at the top of this notebook) a pipeline places its inputs on the CPU
# unless a `device` index is given. Derive the index from the model itself so inputs
# and weights always end up on the same device (-1 means CPU).
pipeline_device = -1
if model.device.type == "cuda":
    pipeline_device = model.device.index if model.device.index is not None else 0

summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer, device=pipeline_device)
summarizer(test_data["document"], min_length=5, max_length=64)

[{'summary_text': 'Social housing contributed £1.1bn to the Welsh economy in 2014/15, according to a new report.'},
 {'summary_text': 'The Foreign Office is "urgently" working with the authorities in Thailand to establish whether a British national has died.'},
 {'summary_text': 'Plans for a new school campus in the Scottish Borders have moved a step closer.'},
 {'summary_text': 'A woman has been taken to hospital following a one-vehicle crash in Aberdeenshire.'}]

In [13]:
# inference wip
#from transformers import PegasusTokenizer, PegasusForConditionalGeneration

#device = "cuda" if torch.cuda.is_available() else "cpu"
#model_name = "google/pegasus-xsum"
#model = PegasusForConditionalGeneration.from_pretrained(model_name, cache_dir = "./cache").to(device)
#tokenizer = PegasusTokenizer.from_pretrained(model_name, cache_dir = "./cache")

# pegasus-xsum content hallucination
src_text = (
    "It's no secret that NVIDIA is on the verge of releasing the GeForce RTX 3070 Ti and the GeForce RTX 3080 Ti. "
    "Despite the continued supply issues with both non-Ti versions of these cards, NVIDIA’s performance train must keep moving. "
    "However, it looks as though MSI jumped the gun a bit earlier today when it updated its website to include a product category "
    "for its upcoming GeForce RTX 3080 Ti graphics card family."
)

# Tokenize the single source text and move the tensors to the device the model lives on;
# leaving them on the CPU fails with a device mismatch whenever the model is on CUDA.
inputs = tokenizer([src_text], max_length=512, truncation=True, return_tensors='pt').to(model.device)
# Pass the attention mask explicitly, matching what `summarize` does further down.
summary_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

["The MSI GeForce 3080 Ti graphics card is now available to pre-order from the company's website."]


In [9]:
# Encode `src_text`, place the encoding on `device`, generate with default settings,
# and print the first decoded summary.
batch = tokenizer(
    src_text,
    truncation=True,
    padding="longest",
    return_tensors='pt',
)
batch = batch.to(device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
print(tgt_text[0])

The MSI GeForce 3080 Ti graphics card is now available to pre-order from the company's website.


In [12]:
# Encode the source text and move the id tensor to the model's device before generating;
# without the move, generation fails with a device mismatch when the model is on CUDA.
inputs = tokenizer.encode(src_text, max_length=512, truncation=True, return_tensors='pt').to(model.device)
# Beam search with explicit length bounds.
outputs = model.generate(inputs, max_length=64, min_length=10, length_penalty=0.8, num_beams=8, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The MSI GeForce 3080 Ti graphics card is now available to pre-order from the company's website.


In [14]:
# inference comparison
def summarize(dataset, model):
    """Generate summaries for all documents in ``dataset`` using ``model``.

    The notebook-level ``tokenizer`` encodes ``dataset["document"]`` as one padded
    batch, truncated to ``max_length=512``. The encoded tensors are moved to
    ``model.device`` before ``model.generate`` is called with default settings.

    Args:
        dataset: Object whose ``"document"`` entry holds the source texts.
        model: Model providing ``device`` and ``generate``.

    Returns:
        A pair ``(outputs, summary_texts)``: the raw result of ``model.generate``
        and the texts decoded from it with special tokens skipped.
    """
    target_device = model.device
    encoded = tokenizer(
        dataset["document"],
        padding=True,
        truncation=True,
        max_length=512,  # max_source_length / encoder_max_length
        return_tensors="pt",
    )
    generated = model.generate(
        encoded["input_ids"].to(target_device),
        attention_mask=encoded["attention_mask"].to(target_device),
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return generated, decoded

#base_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

#base_summaries = summarize(test_data, base_model)[1]
# Keep only the decoded texts (second element of the pair returned by `summarize`).
finetuned_summaries = summarize(dataset=test_data, model=model)[1]

In [15]:
# Print one example's source, reference summary and generated summary side by side
# for manual comparison/evaluation.
# TODO: format as DataFrame tables
i = 1
print(f"Source:\n{test_data['document'][i]}\n")
print(f"Target:\n{test_data['summary'][i]}\n")
print(f"Fine-tuned:\n{finetuned_summaries[i]}\n")

Source:
Local reports from the southern resort island of Phuket say a British man died after turning a pistol on himself.
The Foreign Office (FO) could not confirm the reports.
An FO spokesman said: "We are urgently working with the authorities in Thailand to establish whether a British national has died in Phuket."
The Bangkok Post, quoted a taxi driver who said the man had hailed him near a local resort and asked to be taken somewhere where he could shoot. He had shown no signs of stress while in the taxi, the driver said.

Target:
The Foreign Office says it is urgently investigating reports that a British man has died at a shooting range in Thailand.

Fine-tuned:
The Foreign Office is "urgently" working with the authorities in Thailand to establish whether a British national has died.

