## Fine Tuning
Had to switch from my lab VM to Google Colab for this since I needed a GPU.

In [None]:
# Install the libraries this notebook imports (transformers, datasets) plus extras.
# NOTE(review): py7zr and evaluate are not imported in the visible cells — confirm they are still needed.
!pip install transformers datasets evaluate transformers[torch] py7zr

### Full fine-tuning for summarization

In [None]:
## Load the pretrained checkpoint and its matching tokenizer
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

# Hub identifier of the base checkpoint that gets fine-tuned below.
MODEL = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

#### Load dataset

In [None]:
from datasets import load_dataset

# Load the "knkarthick/samsum" dataset from the Hub.
dataset = load_dataset("knkarthick/samsum")

## Clean dataset: drop the unused id column and rows without a dialogue
dataset = dataset.remove_columns(['id']).filter(
    lambda row: row['dialogue'] is not None
)

## Shrink dataset for training
# Fraction of every split to keep (1 keeps all rows).
PERCENT = 1

# Each split is shuffled with its own fixed seed, then cut to the first
# PERCENT share of its rows.
for split_name, split_seed in (("train", 42), ("test", 37), ("validation", 4)):
    rows_to_keep = int(len(dataset[split_name]) * PERCENT)
    shuffled_split = dataset[split_name].shuffle(seed=split_seed)
    dataset[split_name] = shuffled_split.select(range(rows_to_keep))

dataset

#### Test summarization of base model

In [None]:
# Use the first test example as a fixed before/after comparison case.
first_test_row = dataset["test"][0]
sample = first_test_row['dialogue']
label = first_test_row['summary']

def generate_summary(input, llm):
  """Summarize a conversation with the given seq2seq model.

  The prompt uses the same prefix and suffix as the training prompt built
  in ``tokenize_inputs`` (no extra indentation or newlines), so a
  fine-tuned model is queried in the format it was trained on.

  Relies on the module-level ``tokenizer``.

  Args:
    input: Dialogue text to summarize. The name shadows the builtin but is
      kept so existing callers keep working.
    llm: Model exposing ``generate``; if it has a ``device`` attribute the
      encoded inputs are moved there first.

  Returns:
    The generated summary decoded to a string, special tokens removed.
  """
  prompt = f"Summarize the following conversation.\n\n{input}\n\nSummary: "

  # Truncate to the tokenizer's configured maximum length so an overly long
  # dialogue cannot exceed what the model accepts.
  encoded = tokenizer(prompt, return_tensors="pt", truncation=True)

  # Keep inputs on the model's device (e.g. a GPU after training); otherwise
  # generate() fails with a device mismatch.
  device = getattr(llm, "device", None)
  if device is not None:
    encoded = encoded.to(device)

  # Pass the attention mask explicitly instead of letting generate() infer it.
  output = llm.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      max_new_tokens=200,
  )
  return tokenizer.decode(output[0], skip_special_tokens=True)

# Baseline: summarize the sample with the model before any fine-tuning.
output = generate_summary(sample, model)

# One print call, one item per line, in the same order as before.
print(
    "Sample",
    sample,
    "----------------------------------------",
    "Model Generated Summary",
    output,
    "Correct Summary",
    label,
    sep="\n",
)

#### Prepare the dataset

In [None]:
def tokenize_inputs(example):
    """Tokenize a batch of dialogues and summaries for seq2seq training.

    Relies on the module-level ``tokenizer``.

    Args:
        example: Batch dict with "dialogue" and "summary" lists of strings,
            as supplied by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenized prompts (padded/truncated to 200 tokens) with an added
        "labels" entry: the summary token ids, where every padding position
        is replaced by -100.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '

    # Tokenize inputs
    prompts = [
        f"{start_prompt}{dialogue}{end_prompt}"
        for dialogue in example["dialogue"]
    ]
    model_inputs = tokenizer(
        prompts, padding="max_length", max_length=200, truncation=True
    )

    # Tokenize labels
    targets = tokenizer(
        example["summary"],
        padding="max_length",
        max_length=200,
        truncation=True,
        return_attention_mask=True,
    )

    # Mask padding via the attention mask rather than by comparing against
    # pad_token_id: when the pad token is the same as the EOS token, an id
    # comparison would also blank out the genuine end-of-sequence label.
    model_inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    return model_inputs


# Only fall back to EOS when the tokenizer has no pad token of its own.
# Overwriting an existing pad token with EOS makes pad_token_id equal to
# eos_token_id, so the pad-based label masking in tokenize_inputs would also
# mask the real end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize every split in batches, then drop the raw text columns.
tokenized_dataset = dataset.map(tokenize_inputs, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['dialogue', 'summary'])
tokenized_dataset

#### Start training

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook.
# NOTE(review): presumably required for the later trainer.push_to_hub() call — confirm.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

# Name of the fine-tuned model; reused for the local output directory and
# the Hub repository id.
MODEL = "bart-cnn-samsum-finetuned"

training_args = TrainingArguments(
    # Where results go, locally and on the Hub.
    output_dir=f"./{MODEL}",
    hub_model_id=f"shayharding/{MODEL}",
    # Optimisation settings.
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    auto_find_batch_size=True,
    # Evaluate once per epoch and log every 10 steps.
    eval_strategy="epoch",
    logging_steps=10,
)

# Train on the "train" split and evaluate on the "validation" split.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Push the trainer's results to the Hugging Face Hub (hub_model_id is set in training_args).
trainer.push_to_hub()

#### Test the fine-tuned model

In [None]:
# Reload the fine-tuned checkpoint and its tokenizer from the Hub repository.
MODEL = "shayharding/bart-cnn-samsum-finetuned"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

# Summarize the same sample as before so the two outputs can be compared.
output = generate_summary(sample, model)

print(
    "Sample",
    sample,
    "----------------------------------------",
    "Model Generated Summary",
    output,
    "Correct Summary",
    label,
    sep="\n",
)