# Set-up & downloading modules

In [1]:
! pip install torch
! pip install datasets transformers rouge-score nltk
! pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3.7 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3.7 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
# run 'huggingface-cli login' in terminal

In [2]:
! sudo apt-get install software-properties-common
! sudo curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh
! sudo apt-get install git-lfs
! git lfs install




software-properties-common is already the newest version (0.96.20.2-2).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
#!/bin/bash

unknown_os ()
{
  echo "Unfortunately, your operating system distribution and version are not supported by this script."
  echo
  echo "You can override the OS detection by setting os= and dist= prior to running this script."
  echo "You can find a list of supported OSes and distributions on our website: https://packagecloud.io/docs#os_distro_version"
  echo
  echo "For example, to force Ubuntu Trusty: os=ubuntu dist=trusty ./script.sh"
  echo
  echo "Please email support@packagecloud.io and let us know if you run into any issues."
  exit 1
}

gpg_check ()
{
  echo "Checking for gpg..."
  if command -v gpg > /dev/null; then
    echo "Detected gpg..."
  else
    echo "Installing gnupg for GPG verification..."
    apt-get install -y gnupg
    if [ "$?" -ne "0" ]; then
      echo "Unable to install GPG! Your base system has a problem; pleas

In [3]:
import nltk
# Download the Punkt sentence-tokenizer data; nltk.sent_tokenize (used in compute_metrics) needs it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/coder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Fine-tuning a model on a summarization task

In [4]:
# Identifier of the pretrained checkpoint; passed to from_pretrained for both the tokenizer and the model.
model_checkpoint = "google/mt5-small"

## Loading the dataset

In [5]:
import datasets
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split  # not used in this cell; kept in case other cells rely on it

# Fixed seed so the train/validation/test partition is the same on every run.
SPLIT_SEED = 42

# ROUGE metric object, used when scoring generated summaries.
metric = datasets.load_metric("rouge")

# Load the first 50,000 rows through pandas and convert them to a Dataset.
df = pd.read_csv(r'/work/danewsroom/danewsroom.csv', nrows = 50000)
# The unnamed first CSV column becomes the 'idx' column.
df = df.rename(columns={'Unnamed: 0': 'idx'})
df_small = df[['text', 'summary', 'idx']]
data = Dataset.from_pandas(df_small)

# Hold out 20% of the rows as the test split. Splits are read by key rather than
# by unpacking .values(), so the assignment does not depend on dictionary ordering.
holdout_split = data.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_d, test_d = holdout_split["train"], holdout_split["test"]

# Take 25% of the remaining rows as validation (60/20/20 overall).
validation_split = train_d.train_test_split(test_size=0.25, seed=SPLIT_SEED)
train_d, val_d = validation_split["train"], validation_split["test"]

# Bundle the three splits into one DatasetDict and display it.
dd = datasets.DatasetDict({"train":train_d,"validation":val_d,"test":test_d})
dd

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'idx'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['text', 'summary', 'idx'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'summary', 'idx'],
        num_rows: 10000
    })
})

## Preprocessing the data

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
# Task prefix that preprocess_function prepends to every input document.
# NOTE(review): "google/mtf5-small" does not match the configured checkpoint
# ("google/mt5-small"), so with the current setting the prefix is always "".
# Confirm whether the spelling is a typo or the empty prefix is intended.
prefix = "summarize: " if model_checkpoint in ["google/mtf5-small"] else ""

In [8]:
# Truncation limits (in tokens) for source documents and target summaries.
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: batch mapping with a "text" list (source documents) and a
            "summary" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents, with an extra
        "labels" entry holding the token ids of the tokenized summaries.
    """
    prefixed_docs = [prefix + document for document in examples["text"]]
    encoded = tokenizer(prefixed_docs, truncation=True, max_length=max_input_length)

    # Summaries are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["summary"],
            truncation=True,
            max_length=max_target_length,
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [9]:
# Apply preprocess_function to every split of the DatasetDict, one batch of examples at a time.
tokenized_datasets = dd.map(preprocess_function, batched=True)

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

## Fine-tuning the model

In [10]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Load the pretrained sequence-to-sequence model for the configured checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [11]:
# Per-device batch size shared by training and evaluation.
batch_size = 16
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-summariser",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
    # Optional switches, currently disabled:
    #fp16=True,
    #push_to_hub=True,
)

In [12]:
# Batch collator for seq2seq training, built from the tokenizer and the model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays handed over
            by the trainer. ``predictions`` may also arrive as a tuple whose
            first element holds the generated token ids.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100, plus
        "gen_len" (mean number of non-padding tokens per prediction); every
        value is rounded to 4 decimals.

    Side effects:
        Writes the returned dict to 'mt5_metrics.npy' on every call, replacing
        the file written by the previous call.
    """
    predictions, labels = eval_pred
    # Some configurations return a tuple; the generated token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding marker that the tokenizer cannot decode, so swap it for
    # the real pad token id in both the predictions and the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    metrics = {k: round(v, 4) for k, v in result.items()}
    # np.save stores the dict as a pickled object array, so reading it back
    # needs np.load(..., allow_pickle=True).
    np.save('mt5_metrics.npy', metrics)
    return metrics

In [14]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function together into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [15]:
# Start fine-tuning with the trainer configured above.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: idx, text, summary.
***** Running training *****
  Num examples = 30000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


In [25]:
#trainer.push_to_hub()
#! git push

fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [1]:
# NOTE: this cell needs the earlier cells (dataset, tokenizer, model, metric, numpy import)
# to have been run in the same kernel session.
test_data = dd['test']

# only use 16 test examples for notebook - DELETE LINE FOR FULL EVALUATION
test_data = test_data.select(range(16))

batch_size = 16  # change to 64 for full evaluation

# Switch to inference mode so dropout is disabled while generating.
model.eval()

def generate_summary(batch):
    """Generate a summary for every document in a batch.

    Args:
        batch: batch mapping with a "text" list of source documents.

    Returns:
        The same batch with an added "pred" list holding the decoded summaries.
    """
    # Mirror the training preprocessing: same prefix and same truncation length.
    inputs = tokenizer(
        [prefix + doc for doc in batch["text"]],
        padding=True,  # pad only to the longest document in the batch
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    )
    # Move the tensors to whichever device the model is on instead of assuming CPU.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # NOTE(review): no generation length is passed, so the model's default applies -
    # confirm it is long enough for these summaries.
    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # Special tokens (padding, end-of-sequence) are dropped when decoding.
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["text"])

pred_str = results["pred"]
label_str = results["summary"]

# `metric` is the ROUGE metric loaded in the dataset cell; the name `rouge` is never defined.
rouge_output = metric.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

np.save('mt5_results.npy', results)
np.save('mt5_rouge.npy', rouge_output)


NameError: name 'dd' is not defined