In [13]:
import os
cache = os.path.join(os.path.abspath(os.getcwd()), 'datasets')
os.environ['HF_DATASETS_CACHE'] = cache

from datasets import load_dataset
import datasets

OneSentData = load_dataset("xsum")
OneSentData["test"].shuffle

Found cached dataset xsum (c:/Users/19259/Documents/Python/Simplify/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)
100%|██████████| 3/3 [00:00<00:00, 62.59it/s]


<bound method Dataset.shuffle of Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 11334
})>

In [14]:
def samples(dataset, samples=3, seed=0):
    sample = dataset["train"].shuffle(seed=seed).select(range(samples))
    for i in sample:
        for l in dataset["train"].column_names:
            print(f">> {l}: {i[l]}")
        print("\n")
samples(OneSentData)

Loading cached shuffled indices for dataset at c:\Users\19259\Documents\Python\Simplify\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-193b3f554bf14a6b.arrow


>> document: The first involved a motorcycle and a BMW at about 12:10 BST between Merthyr and Nant Ddu.
The second happened 20 minutes later near Treforest, Rhondda Cynon Taff, involving a motorcycle and another vehicle.
Diversions are in place in both areas with drivers advised to expect delays. Anyone with information is asked to call South Wales Police on 101.
>> summary: Police are investigating two serious crashes on the A470 in south Wales.
>> id: 33751856


>> document: The Bank of England said the change to the Financial Services Compensation Scheme (FSCS) would happen by 30 January 2017.
The compensation limit was lowered to £75,000 in July 2015, following sterling's rise against the euro.
But since the Brexit vote, sterling has fallen more than 10% against the euro.
The amount of compensation payable is set at €100,000 across the European Union, and under normal circumstances is re-calculated in sterling every five years.
But the European directive involved allows a more freq

In [15]:
def filter_size(example):
    return (
        len(example["summary"]) >= 10
    )
OneSentData.filter(filter_size)

OneSentData["train"] = OneSentData["train"].shuffle(seed=0).select(range(len(OneSentData["train"])//10))
OneSentData["validation"] = OneSentData["validation"].shuffle(seed=0).select(range(len(OneSentData["validation"])//10))
OneSentData["test"] = OneSentData["test"].shuffle(seed=0).select(range(len(OneSentData["test"])//10))
OneSentData

Loading cached processed dataset at c:\Users\19259\Documents\Python\Simplify\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-aa02ff01c7986985.arrow
Loading cached processed dataset at c:\Users\19259\Documents\Python\Simplify\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-02dabe45a731619a.arrow
Loading cached processed dataset at c:\Users\19259\Documents\Python\Simplify\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-ac6761acabe119fb.arrow
Loading cached shuffled indices for dataset at c:\Users\19259\Documents\Python\Simplify\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-193b3f554bf14a6b.arrow
Loading cached shuffled indices for dataset at c:\Users\19259\Documents\Python\Simplify\datasets\xsum\default\1.2.0\082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71\cache-281476

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 40809
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 2266
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 2266
    })
})

In [16]:
cache = os.path.join(os.path.abspath(os.getcwd()), 'models')
os.environ['TRANSFORMERS_CACHE'] = cache

from transformers import AutoTokenizer

model_checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir= cache)


In [17]:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [0, 100, 2638, 2600, 5, 27689, 3100, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['<s>', 'I', 'Ġloved', 'Ġreading', 'Ġthe', 'ĠHunger', 'ĠGames', '!', '</s>']

In [31]:
max_input_length = 512
max_target_length = 256


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tOneSent = OneSentData.map(preprocess_function, batched=True)

                                                                   

In [32]:
import evaluate

rouge_score = evaluate.load("rouge")

In [33]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["document"]]
    return metric.compute(predictions=summaries, references=dataset["summary"])



In [34]:
import pandas as pd
import nltk
nltk.download('punkt')

score = evaluate_baseline(OneSentData["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\19259\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'rouge1': 0.1860323869039647,
 'rouge2': 0.026147468479358518,
 'rougeL': 0.12072295495837032,
 'rougeLsum': 0.14590201775179126}

In [35]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [36]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 4
# Show the training loss with every epoch
logging_steps = len(tOneSent["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"models/{model_name}-finetuned-test1",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=0,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
)

In [37]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result

In [38]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [39]:
tokenized_datasets = tOneSent.remove_columns(
    OneSentData["train"].column_names
)

In [40]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

{'input_ids': tensor([[    0,   133,    78,   963,    10, 10218,     8,    10,  8588,    23,
            59,   316,    35,   698, 28964,   227,  4213,   212,  4503,     8,
           234,   927,   211,  6588,     4, 50118,   133,   200,  1102,   291,
           728,   423,   583,  6213, 28236,     6,  8778,  2832,  6106, 43042,
           261,   255,  3707,     6,  3329,    10, 10218,     8,   277,  1155,
             4, 50118,   495, 10744,  2485,    32,    11,   317,    11,   258,
           911,    19,  2377,  5578,     7,  1057,  6091,     4,  6142,    19,
           335,    16,   553,     7,   486,   391,  5295,   522,    15,  6560,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [41]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [42]:
trainer.train()

  1%|          | 402/40816 [02:32<4:00:01,  2.81it/s]

KeyboardInterrupt: 

In [4]:
from transformers import pipeline
trained_model = "models\\bart-base-finetuned-test1\checkpoint-10000"
summarizer = pipeline("summarization", model=trained_model)


In [6]:
summarizer("""The Bank of England said the change to the Financial Services Compensation Scheme (FSCS) would happen by 30 January 2017.
The compensation limit was lowered to £75,000 in July 2015, following sterling's rise against the euro.
But since the Brexit vote, sterling has fallen more than 10% against the euro.
The amount of compensation payable is set at €100,000 across the European Union, and under normal circumstances is re-calculated in sterling every five years.
But the European directive involved allows a more frequent assessment where there are "unforeseen events, such as currency fluctuations".
The Bank of England's move suggests that it believes the fall in the value of sterling is at least semi-permanent.
The FSCS covers current and savings accounts, and cash ISAs.
The new maximum compensation - of £85,000 per person per bank - was previously in place for five years, up to July 2015.
However Andrew Tyrie, the chairman of the Treasury Select Committee, said the level had been altered seven times in the last ten years.
"The absurd situation, in which the UK is left vulnerable, at the discretion of the European Commission, to frequent changes in our deposit scheme, must be brought to an end," he said.
"Brexit should give the UK the opportunity to set its own level of protection. We should take it."
The Prudential Regulation Authority (PRA), part of the Bank of England, will now consult on the plan, although the idea has already been approved by the Treasury and the European Commission.
The consultation will close in less than a month, on 16 December.
Banks and building societies will have until next summer to change the conditions on their publicity material.""")

[{'summary_text': 'The maximum compensation payable to banks and building societies for a fall in the value of sterling is to be reduced.'}]