###### Imports

In [1]:
from transformers import AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AutoModelForSequenceClassification, TrainingArguments, Trainer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from datasets import load_dataset
import evaluate
import pandas as pd
import nltk
import numpy as np

nltk.download('punkt')
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')
warnings.filterwarnings("ignore", category=UserWarning)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kiddstudio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


###### Load data

In [2]:
# Load data
test_dataset = pd.read_csv('../data/processed/new_ds_test_dataset.csv')
train_dataset = pd.read_csv('../data/processed/new_ds_train_dataset.csv')
validation_dataset = pd.read_csv('../data/validation_set.csv')

# Ensure all documents are strings
train_dataset['document'] = train_dataset['document'][:150].astype(str)
train_dataset['summary'] = train_dataset['summary'][:150].astype(str)
test_dataset['document'] = test_dataset['document'][:50].astype(str)
test_dataset['summary'] = test_dataset['summary'][:50].astype(str)
validation_dataset['document'] = validation_dataset['document'][:50].astype(str)
validation_dataset['summary'] = validation_dataset['summary'][:50].astype(str)

# Handle missing values (remove rows with missing documents or summaries)
train_dataset.dropna(subset=['document', 'summary'], inplace=True)
test_dataset.dropna(subset=['document', 'summary'], inplace=True)
validation_dataset.dropna(subset=['document', 'summary'], inplace=True)

# Convert the pandas DataFrames to Hugging Face Datasets
train = Dataset.from_pandas(train_dataset)
test = Dataset.from_pandas(test_dataset)
validation = Dataset.from_pandas(validation_dataset)

# Reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict({
    'train': train,
    'test': test,
    'validation': validation
})

Showing the data here because the due to resource and time constraints, the entire dataset was not used nor uploaded to Github

In [3]:
train_dataset['document'][:10]

0    national archives yes its time again folks its...
1    los angeles ap  in first interview since nba b...
2    gaithersburg md ap  a small private jet crashe...
3    tucker carlson exposes his own sexism twitter ...
4    a man accused removing another mans testicle m...
5    suicide hotlines provide free confidential sup...
6    croatia swastika hosts apologise nazi pitch sy...
7    warczone collection outsideruploaded warcs con...
8    vantage energy operates natural gas drilling s...
9    photo roy hsu in lowdown world dining dashing ...
Name: document, dtype: object

In [4]:
train_dataset['summary'][:10]

0    – The unemployment rate dropped to 8.2% last m...
1    – Shelly Sterling plans "eventually" to divorc...
2    – A twin-engine Embraer jet that the FAA descr...
3    – Tucker Carlson is in deep doodoo with conser...
4    – What are the three most horrifying words in ...
5    – Calls to suicide hotlines have spiked dramat...
6    – Public apologies making headlines this week ...
7    – Education Secretary John King has a message ...
8    – A massive leak of fracking fluid poured into...
9    – Paul Gonzales' approach to dating is similar...
Name: summary, dtype: object

In [5]:
test_dataset['document'][:10]

0    gop eyes gains as voters in 11 states pick gov...
1    update 4192001 read richard metzger how i marr...
2    its golden states latest version great secessi...
3    the seed crawl list every host wayback machine...
4    after year liberals scored impressive highprof...
5    if true building set for demolition could be m...
6    a still image taken israeli defence forces idf...
7    paris ap  the pompidou centre paris hopes disp...
8    starting 1996 alexa internet donating crawl da...
9    the wounded officer crystal almeida 26 the thi...
Name: document, dtype: object

In [6]:
test_dataset['summary'][:10]

0    – It's a race for the governor's mansion in 11...
1    – It turns out Facebook is only guilty of abou...
2    – Not a big fan of Southern California? Neithe...
3    – Why did Microsoft buy Nokia's phone business...
4    – The Supreme Court is facing a docket of high...
5    – In 1783, after the British soldiers left New...
6    – Israel launched a round of airstrikes on Gaz...
7    – A Picasso painting that was found to have va...
8    – A dispute over the freshness of Wendy’s frie...
9    – A 27-year-old Dallas police officer died Wed...
Name: summary, dtype: object

In [7]:
# new_ds = load_dataset("multi_news", split="train").shuffle(seed=42).select(range(200))
# new_ds = new_ds.train_test_split(test_size=0.2)

###### Tokenzer and Preprocessing function

In [8]:
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = new_ds.map(preprocess_function, batched=True)

# Setup evaluation
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

###### Compute Metrics

In [9]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

# Load pretrained model and evaluate model after each epoch
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

###### Model training 

In [10]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=True,
    predict_with_generate=True
)

trainer =  Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.017151,0.373953,0.102003,0.197713,0.312409
2,No log,2.966034,0.370452,0.1019,0.196722,0.313146


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=20, training_loss=3.237326431274414, metrics={'train_runtime': 3794.2938, 'train_samples_per_second': 0.079, 'train_steps_per_second': 0.005, 'total_flos': 464373625651200.0, 'train_loss': 3.237326431274414, 'epoch': 2.0})