## Setup

In [1]:
!pip install datasets -qq
!pip install transformers -qq
!pip install rouge_score evaluate nltk -qq

In [2]:
import torch
import numpy as np
import datasets
import nltk
import evaluate

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from collections import Counter
import matplotlib.pyplot as plt

## Load dataset

### GLUE/QQP dataset

In [3]:
# Fetch the GLUE Quora Question Pairs task. Asking for a list of splits returns
# one Dataset per requested split, in the same order.
train_qqp, dev_qqp, test_qqp = datasets.load_dataset(
    path="glue",
    name="qqp",
    split=["train", "validation", "test"],
)

  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Keep only the rows labelled 1 so that every remaining (question1, question2)
# row is a positive pair. The test split is not filtered here.
filtered_train = train_qqp.filter(lambda example: example["label"]==1)
filtered_valid = dev_qqp.filter(lambda example: example["label"]==1)

  0%|          | 0/364 [00:00<?, ?ba/s]

  0%|          | 0/41 [00:00<?, ?ba/s]

In [5]:
# Report how many QQP pairs survive the label filter, in absolute and relative terms.
print(
    f"Number of paraphrased pairs in train_set: {len(filtered_train)}",
    f"Number of paraphrased pairs in valid_set: {len(filtered_valid)}",
    "",
    f"Proportion of paraphrased pairs in train_set: {len(filtered_train)/len(train_qqp)*100:.2f}%",
    f"Proportion of paraphrased pairs in valid_set: {len(filtered_valid)/len(dev_qqp)*100:.2f}%",
    "",
    f"Train_set length: {len(train_qqp)}",
    f"Valid_set length: {len(dev_qqp)}",
    sep="\n",
)

Number of paraphrased pairs in train_set: 134378
Number of paraphrased pairs in valid_set: 14885

Proportion of paraphrased pairs in train_set: 36.93%
Proportion of paraphrased pairs in valid_set: 36.82%

Train_set length: 363846
Valid_set length: 40430


In [6]:
# Bundle the filtered QQP splits under their conventional split names.
qqp = datasets.DatasetDict(train=filtered_train, validation=filtered_valid)

In [7]:
qqp

DatasetDict({
    train: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 134378
    })
    validation: Dataset({
        features: ['question1', 'question2', 'label', 'idx'],
        num_rows: 14885
    })
})

In [8]:
qqp["validation"][0]

{'question1': 'Is there a reason why we should travel alone?',
 'question2': 'What are some reasons to travel alone?',
 'label': 1,
 'idx': 2}

In [9]:
qqp["validation"].features

{'question1': Value(dtype='string', id=None),
 'question2': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['not_duplicate', 'duplicate'], id=None),
 'idx': Value(dtype='int32', id=None)}

### PAWS dataset

In [10]:
# Load the "labeled_final" configuration of PAWS with all of its splits.
paws = datasets.load_dataset(path="paws", name="labeled_final")

  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
print(paws)
print(paws["train"].features.type)

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
})
struct<id: int32, sentence1: string, sentence2: string, label: int64>


In [12]:
# Keep only the PAWS pairs labelled 1, for the train and validation splits alike.
train_paws, valid_paws = (
    paws[split].filter(lambda pair: pair["label"] == 1)
    for split in ("train", "validation")
)

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [13]:
# Sizes of the unfiltered PAWS splits, used as denominators for the proportions below.
paws_train_len, paws_valid_len = (
    len(paws[split]) for split in ("train", "validation")
)
# Preview the first retained training pair.
train_paws[0]

{'id': 2,
 'sentence1': 'The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .',
 'sentence2': 'The 1975 -- 76 season of the National Basketball Association was the 30th season of the NBA .',
 'label': 1}

In [14]:
# Report how many PAWS pairs survive the label filter, in absolute and relative terms.
print(
    f"Number of paraphrased pairs in PAWS train_set: {len(train_paws)}",
    f"Number of paraphrased pairs in PAWS valid_set: {len(valid_paws)}",
    "",
    f"Proportion of paraphrased pairs in PAWS train_set: {len(train_paws)/paws_train_len * 100:.2f}%",
    f"Proportion of paraphrased pairs in PAWS valid_set: {len(valid_paws)/paws_valid_len * 100:.2f}%",
    "",
    f"PAWS Train_set length: {len(paws['train'])}",
    f"PAWS Valid_set length: {len(paws['validation'])}",
    sep="\n",
)

Number of paraphrased pairs in PAWS train_set: 21829
Number of paraphrased pairs in PAWS valid_set: 3539

Proportion of paraphrased pairs in PAWS train_set: 44.19%
Proportion of paraphrased pairs in PAWS valid_set: 44.24%

PAWS Train_set length: 49401
PAWS Valid_set length: 8000


In [15]:
# Bundle the filtered PAWS splits under their split names and display the result.
filtered_paws = datasets.DatasetDict(train=train_paws, validation=valid_paws)
filtered_paws

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 21829
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 3539
    })
})

## Load pretrained BART and tokenizer

In [16]:
# Checkpoint that both the tokenizer and the seq2seq model are loaded from.
model_name = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [17]:
print(model.config)

BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_ty

In [18]:
# Pick the GPU when torch can see one, otherwise fall back to the CPU.
# `device` is used later to place the tokenized inputs for generation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the model itself is not moved here (the call below is disabled);
# presumably the Trainer places it on the accelerator during training — confirm.
# model.to(device)

In [19]:
print(f"BART base size: {np.round(model.num_parameters()/1e6, 1)} M parameters")

BART base size: 139.4 M parameters


## Tokenization

In [20]:
def batch_tokenize_preprocess(batch, tokenizer, data="qqp"):
    """Tokenize a batch of paraphrase pairs into model inputs and labels.

    Args:
        batch: Mapping of column name to a list of strings. With
            ``data="paws"`` the ``sentence1``/``sentence2`` columns are read;
            with any other value ``question1``/``question2`` are read.
        tokenizer: Callable tokenizer exposing ``pad_token_id``. It is invoked
            with ``truncation=True``, first on the sources, then on the targets.
        data: Name of the dataset the batch comes from.

    Returns:
        A dict holding everything the tokenizer returned for the source texts,
        plus ``labels``: the target token ids with each pad id replaced by -100.
    """
    if data == "paws":
        source_col, target_col = "sentence1", "sentence2"
    else:
        source_col, target_col = "question1", "question2"
    source_texts, target_texts = batch[source_col], batch[target_col]

    encoded_source = tokenizer(source_texts, truncation=True)
    encoded_target = tokenizer(target_texts, truncation=True)

    features = dict(encoded_source.items())
    # Pad positions are masked with -100 so that they are ignored in the loss.
    features["labels"] = [
        [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in ids]
        for ids in encoded_target["input_ids"]
    ]
    return features

In [21]:
def _tokenize_splits(pairs, dataset_name):
    """Tokenize every split of `pairs`, replacing the raw columns with model features."""
    return pairs.map(
        lambda batch: batch_tokenize_preprocess(batch, tokenizer, data=dataset_name),
        batched=True,
        remove_columns=pairs["train"].column_names,
    )


tokenized_qqp = _tokenize_splits(qqp, "qqp")

tokenized_paws = _tokenize_splits(filtered_paws, "paws")

  0%|          | 0/135 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [22]:
print(tokenized_qqp)
print(tokenized_paws)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 134378
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14885
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21829
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3539
    })
})


In [23]:
# Both tokenized corpora must expose the same feature schema before they are
# merged with concatenate_datasets in the next cell.
assert tokenized_qqp["train"].features.type == tokenized_paws["train"].features.type

In [24]:
# Merge QQP and PAWS split by split (QQP rows first, then PAWS rows).
tokenized_train, tokenized_valid = (
    datasets.concatenate_datasets([tokenized_qqp[split], tokenized_paws[split]])
    for split in ("train", "validation")
)

In [25]:
print(tokenized_train)
print(tokenized_valid)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 156207
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18424
})


In [26]:
# Sanity check: show the first tokenized example next to its raw text, then
# decode it back to make sure inputs and labels round-trip as expected.
# The row is indexed first: `tokenized_train["input_ids"][0]` can load the whole
# column of the dataset just to read a single element.
first_example = tokenized_train[0]
print(first_example)
print(qqp["train"][0])
print()
print(tokenizer.decode(first_example["input_ids"]))
print(tokenizer.decode(first_example["labels"]))

{'input_ids': [0, 6179, 109, 38, 797, 127, 46216, 8597, 116, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [0, 6179, 109, 47, 797, 110, 21305, 6601, 116, 2]}
{'question1': 'How do I control my horny emotions?', 'question2': 'How do you control your horniness?', 'label': 1, 'idx': 1}

<s>How do I control my horny emotions?</s>
<s>How do you control your horniness?</s>


## Training

### Metrics: ROUGE

In [27]:
# Sentence-splitting data used by nltk.sent_tokenize in postprocess_text.
# Older NLTK releases read the "punkt" package while recent ones look for
# "punkt_tab"; since NLTK is installed unpinned, both are requested.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
# ROUGE implementation used by compute_metrics.
metric = evaluate.load("rouge")

In [28]:
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, the layout rougeLSum expects.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of two lists of reformatted strings.
    """

    def _one_sentence_per_line(texts):
        stripped = [text.strip() for text in texts]
        return ["\n".join(nltk.sent_tokenize(text)) for text in stripped]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)


def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        Dict mapping each ROUGE key to its score scaled to 0-100, plus
        ``gen_len`` (mean number of non-pad tokens per prediction); every
        value is rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id and cannot be decoded. Labels always carry it
    # as the ignore index; predictions may carry it too when generated batches
    # of different lengths are padded together, so both arrays are cleaned.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and put one sentence per line before scoring.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Express the ROUGE scores as percentages.
    result = {key: value * 100 for key, value in result.items()}

    # Length of each prediction, ignoring padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {key: round(value, 4) for key, value in result.items()}

### Training arguments

In [29]:
# Collator handed to the Seq2SeqTrainer below to assemble batches of tokenized
# examples. NOTE(review): presumably pads inputs and labels per batch — confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [30]:
# Per-device batch size for both training and evaluation.
batch_size = 32
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Gradients are accumulated over 4 batches per optimizer step, i.e. an
    # effective batch of 4 * batch_size examples per device.
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.1,
    # Evaluate (and report ROUGE) every 500 optimizer steps.
    evaluation_strategy='steps',
    eval_steps=500,
    label_smoothing_factor=0.1,
    # Evaluation runs generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=100,
    save_total_limit=3,
    report_to="none"
)

# `data_collator` is the DataCollatorForSeq2Seq built in the previous cell; it
# is reused here instead of being constructed a second time.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train

In [31]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,2.714,2.603616,66.8642,44.9656,63.4389,63.6437,14.4276
1000,2.5616,2.494521,67.2612,45.6289,63.9844,64.1815,14.3188
1500,2.4694,2.439295,67.6046,46.156,64.3081,64.5119,14.3816
2000,2.4377,2.411692,67.6986,46.3203,64.4054,64.6071,14.4177


TrainOutput(global_step=2440, training_loss=2.591359935823034, metrics={'train_runtime': 3434.6522, 'train_samples_per_second': 90.959, 'train_steps_per_second': 0.71, 'total_flos': 6688496337868800.0, 'train_loss': 2.591359935823034, 'epoch': 2.0})

In [32]:
trainer.evaluate()

{'eval_loss': 2.4027907848358154,
 'eval_rouge1': 67.6975,
 'eval_rouge2': 46.3425,
 'eval_rougeL': 64.3912,
 'eval_rougeLsum': 64.6038,
 'eval_gen_len': 14.4116,
 'eval_runtime': 393.6692,
 'eval_samples_per_second': 46.801,
 'eval_steps_per_second': 1.463,
 'epoch': 2.0}

In [33]:
def generate_summary(test_samples, model, max_input_length=60):
    """Generate text with `model` for the ``sentence1`` entries of `test_samples`.

    Args:
        test_samples: Mapping whose ``"sentence1"`` value is the text (or list
            of texts) to feed to the model.
        model: Model exposing ``device`` and ``generate``.
        max_input_length: Number of tokens every input is padded or truncated
            to. This used to be read from a global that is never defined in
            the notebook, so any call raised NameError; the default of 60
            matches the limit used by ``get_response``.

    Returns:
        Tuple ``(outputs, output_str)``: the generated token ids and their
        decoded strings with special tokens removed.
    """
    inputs = tokenizer(
        test_samples["sentence1"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model weights.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

def get_response(input_text,num_return_sequences,num_beams):
    """Return paraphrase candidates for `input_text` using beam search.

    Args:
        input_text: The sentence to paraphrase (truncated to 60 tokens).
        num_return_sequences: How many candidates to return.
        num_beams: Beam width passed to ``generate``.

    Returns:
        List of decoded candidate strings with special tokens removed.
    """
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    )
    # Place the inputs wherever the model currently lives instead of trusting
    # the global `device`, which can disagree with it (the model is never
    # explicitly moved in this notebook).
    batch = batch.to(model.device)
    # No `temperature` here: it only rescales logits when sampling, and this
    # call decodes with plain beam search, so the setting had no effect.
    translated = model.generate(
        **batch,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

In [42]:
# Try the fine-tuned model on a sample passage: 10 beams, all 10 hypotheses returned.
text = "Put what you want changed in this section. Then, click the paraphrase button below. It's that easy!"
get_response(text, num_return_sequences=10, num_beams=10)

["If you want to change something in this section, click the paraphrase button below. It's that easy!",
 "What you want changed in this section. Then, click the paraphrase button below. It's that easy!",
 'What do you want changed in this section?',
 "If you want to change something in this section then click the paraphrase button below. It's that easy!",
 'In this section, put what you want changed in this section. Then, click the paraphrase button below.',
 'If you want to change something in this section, click the paraphrase button below.',
 'What do you want to change in this section?',
 'How do I change what I want in this section?',
 'What you want changed in this section?',
 'What should I change in this section?']

In [35]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echo when it is not set.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [36]:
# Publish the trained model weights and the tokenizer files to the same Hub
# repository. Requires the login performed in the previous cell.
model_repo = "harouzie/bart-paraphraser"
model.push_to_hub(model_repo)
# Last expression of the cell: its return value is what the notebook displays.
tokenizer.push_to_hub(model_repo)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/harouzie/bart-paraphraser/commit/f3006102edaf1f89b0ef9d06c9958c3a4172ad6b', commit_message='Upload tokenizer', commit_description='', oid='f3006102edaf1f89b0ef9d06c9958c3a4172ad6b', pr_url=None, pr_revision=None, pr_num=None)