# Assignment 3 - Fine-Tuning a Language Model
## Text Summarization and Gradio UI
This notebook follows the Hugging Face summarization example: https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb

In [1]:
! pip install datasets transformers seqeval rouge-score nltk gradio



In [24]:
# Do all the imports

import random

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import transformers
from datasets import load_dataset, load_metric
from datasets import ClassLabel, Sequence
from IPython.display import display, HTML
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import pipeline

# Record the installed transformers version so the run can be reproduced.
print(transformers.__version__)

4.11.3


In [3]:
# Checkpoint identifier used to load both the tokenizer and the seq2seq model below.
model_checkpoint = "t5-small"


In [4]:
# Load the training dataset

# The "xsum" dataset supplies the document/summary pairs used for fine-tuning;
# the "rouge" metric is used by compute_metrics to score generated summaries.
raw_datasets = load_dataset("xsum")
metric = load_metric("rouge")

Using custom data configuration default
Reusing dataset xsum (/root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Peek at a single raw training example (index 100) to see what a record looks like.
raw_datasets['train'][100]

{'document': 'Samsung said: "Shipments of the Galaxy Note 7 are being temporarily delayed for additional quality assurance inspections."\nThere are reports in South Korea and the US of the Galaxy Note 7 "exploding" either during or just after charging.\nHowever, it is unclear whether the delay is because of these reports.\nPictures and videos shared online depict charred and burnt handsets.\nShares fell as much as 3.5% during trade in Seoul before making a partial recovery to close 2% down on the day.\nSister company Samsung SDI told Reuters that while it was a supplier of Galaxy Note 7 batteries, it had received no information to suggest the batteries were faulty.\nA YouTube user who says they live in the US uploaded a video of a Galaxy Note 7 with burnt rubber casing and damaged screen under the name Ariel Gonzalez on 29 August.\nHe said the handset "caught fire" shortly after he unplugged the official Samsung charger, less than a fortnight after purchasing it.\n"I came home after wo

## Preprocess the Data

In [6]:
# Load the tokenizer that belongs to the chosen checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [7]:
# Task prefix that preprocess_function prepends to every input document.
prefix = "summarize: "

In [8]:
# Token-count caps passed as max_length (with truncation=True) when tokenizing
# the input documents and the target summaries, respectively.
max_input_length = 768
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: a batch with a "document" entry (iterable of strings) and a
            "summary" entry, as produced by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed documents, with the summaries'
        token ids stored under the "labels" key.
    """
    prefixed_documents = [prefix + document for document in examples["document"]]
    encoded = tokenizer(
        prefixed_documents,
        max_length=max_input_length,
        truncation=True,
    )

    # Targets are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            examples["summary"],
            max_length=max_target_length,
            truncation=True,
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [9]:
# Apply preprocess_function to the dataset in batches (batched=True passes lists of examples).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/205 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-161288cdc54a7be7.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-e8282d3635ec097f.arrow


## Fine Tuning

In [10]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [16]:
# Training configuration for the fine-tuning run.
batch_size = 32  # used for both training and evaluation, per device

# Keep only the last path component of the checkpoint id for the output directory name.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-xsum",
    # Schedule
    num_train_epochs=1,
    evaluation_strategy="epoch",
    save_total_limit=3,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # Batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate on generated sequences so compute_metrics can score them with ROUGE.
    predict_with_generate=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [17]:
# Collator handed to the trainer to assemble tokenized examples into batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
# Function to compute metrics

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays, as
            handed over by the trainer during evaluation.

    Returns:
        A dict with one entry per ROUGE key (mid F-measure scaled to 0-100)
        plus "gen_len", the mean number of non-padding tokens per prediction.
        All values are rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore marker that the tokenizer cannot decode; map it
    # to the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence.
    # NOTE(review): nltk.sent_tokenize needs the NLTK sentence-tokenizer data to be
    # available in the runtime - confirm it is downloaded before evaluating.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding tokens excluded).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [19]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single trainer object.
# NOTE(review): no cell visible in this notebook calls trainer.train() or
# trainer.evaluate() - confirm whether the fine-tuning step was meant to run.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


## Gradio UI

In [29]:
import re


def clean_text(text):
    """Normalise free text before it is sent to the summarizer.

    Drops every non-ASCII character, turns newlines and tabs into spaces,
    collapses runs of spaces into a single space and strips leading/trailing
    whitespace.

    Args:
        text: the raw input string.

    Returns:
        The cleaned, single-line ASCII string.
    """
    # Characters that cannot be encoded as ASCII are silently discarded.
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
    # One pass handles both newlines and tabs; a separate "\n\n" substitution
    # could never match once every "\n" has been replaced.
    flattened = re.sub(r"[\n\t]", " ", ascii_only)
    # Collapse repeated spaces, then trim whatever whitespace remains at the ends.
    return re.sub(" +", " ", flattened).strip()

In [32]:
# Summarization pipeline used by the Gradio UI, loaded with the PyTorch backend.
# NOTE(review): this serves the pretrained facebook/bart-large-cnn checkpoint, not
# the t5-small model set up for fine-tuning earlier - confirm which one the UI
# is meant to expose.
pipeline_summ = pipeline(
    "summarization",
    model="facebook/bart-large-cnn", # switch out to "t5-small" etc if you wish
    tokenizer="facebook/bart-large-cnn", # as above
    framework="pt",
)

# Summarization function backing the Gradio interface
def fb_summarizer(text):
    """Summarize free text with the summarization pipeline.

    Args:
        text: raw user input; may be empty or None.

    Returns:
        The generated summary, or an empty string when there is nothing to
        summarize (empty input, or input that is empty after cleaning).
    """
    # Nothing to do for missing / whitespace-only input.
    if not text or not text.strip():
        return ""

    input_text = clean_text(text)
    # Cleaning can remove everything (e.g. input made only of non-ASCII characters).
    if not input_text:
        return ""

    # truncation=True cuts inputs down to the model's maximum length instead of
    # letting over-long text fail inside the model.
    results = pipeline_summ(input_text, truncation=True)
    return results[0]["summary_text"]

# Gradio interface: a text box in, the summary produced by fb_summarizer out.
summary1 = gr.Interface(
    fn=fb_summarizer,
    inputs=gr.inputs.Textbox(),
    outputs=gr.outputs.Textbox(label="Summary"),
)

loading configuration file https://huggingface.co/facebook/bart-large-cnn/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/199ab6c0f28e763098fd3ea09fd68a0928bb297d0f76b9f3375e8a1d652748f9.930264180d256e6fe8e4ba6a728dd80e969493c23d4caa0a6f943614c52d34ab
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "

In [None]:
# Start the Gradio app. With debug=True the cell keeps running so errors and logs
# stay visible; set debug=False to return control to the notebook.
summary1.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://10632.gradio.app
Interface loading below...
