In [None]:
import torch
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load your CSV dataset
df_ = pd.read_csv("data_summary.csv")

df = df_.dropna()

# Split the dataset into training and validation sets (70% training, 30% validation)
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)

# Access the input text and target summaries for training and validation
input_texts_train = train_df["input_text"].tolist()
target_summaries_train = train_df["target_summary"].tolist()

input_texts_val = val_df["input_text"].tolist()
target_summaries_val = val_df["target_summary"].tolist()

# Load the pretrained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    logging_dir="./logs",
    load_best_model_at_end=True,
    push_to_hub=False,
    remove_unused_columns=False,
    report_to="tensorboard",
)

# Define data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model
)

# Create datasets.Dataset for training and validation
train_dataset = Dataset.from_dict({
    "input_text": input_texts_train,
    "target_summary": target_summaries_train,
})

val_dataset = Dataset.from_dict({
    "input_text": input_texts_val,
    "target_summary": target_summaries_val,
})

# Tokenize your datasets using the tokenizer with padding and truncation enabled
train_dataset = train_dataset.map(
    lambda examples: tokenizer(
        examples["input_text"],
        examples["target_summary"],
        padding="max_length",  # Enable padding
        truncation=True,        # Enable truncation
        max_length=512,         # Adjust as needed
        return_tensors="pt",   # Return PyTorch tensors
    ),
    batched=True,
)

val_dataset = val_dataset.map(
    lambda examples: tokenizer(
        examples["input_text"],
        examples["target_summary"],
        padding="max_length",  # Enable padding
        truncation=True,        # Enable truncation
        max_length=512,         # Adjust as needed
        return_tensors="pt",   # Return PyTorch tensors
    ),
    batched=True,
)


# Create a Seq2Seq Trainer for training data
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,  # Use the training data
)

# Fine-tune the model
trainer.train()

# Create a Seq2Seq Trainer for validation data
val_trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=val_dataset,  # Use the validation data
)

# Evaluate the model on the validation dataset
results = val_trainer.evaluate()

# Save the final model
trainer.save_model("./fine_tuned_model")

# Save the tokenizer
tokenizer.save_pretrained("./fine_tuned_model")

# Optionally, upload the model to the Hugging Face Model Hub
# trainer.push_to_hub()

# Optionally, test the model on some sample inputs
input_text = "Your input text here..."
input_ids = tokenizer.encode(input_text, return_tensors="pt")
summary_ids = model.generate(input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
output_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", output_summary)


In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments
from sklearn.model_selection import train_test_split

device = "mps" if torch.backends.mps.is_available () else "cpu"

# Load the CSV dataset
df = pd.read_csv("data_summary.csv")

# Remove rows with missing values
df.dropna(inplace=True)

# Split the dataset into train and validation sets (70% train, 30% validation)
input_texts, target_summaries = df["input_text"].tolist(), df["target_summary"].tolist()
input_texts_train, input_texts_val, target_summaries_train, target_summaries_val = train_test_split(
    input_texts, target_summaries, test_size=0.3, random_state=42
)

# Load the pretrained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
#tokenizer = AutoTokenizer.from_pretrained("Davlan/naija-twitter-sentiment-afriberta-large")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

model.to(device)

# Tokenize your datasets using the tokenizer with padding and truncation enabled
train_encodings = tokenizer(
    input_texts_train,
    target_summaries_train,
    padding="max_length",  # Enable padding
    truncation=True,        # Enable truncation
    max_length=512,         # Adjust as needed
    return_tensors="pt",   # Return PyTorch tensors
    return_attention_mask=True,
)

val_encodings = tokenizer(
    input_texts_val,
    target_summaries_val,
    padding="max_length",  # Enable padding
    truncation=True,        # Enable truncation
    max_length=512,         # Adjust as needed
    return_tensors="pt",   # Return PyTorch tensors
    return_attention_mask=True,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    logging_dir="./logs",
    load_best_model_at_end=True,
    push_to_hub=False,
    remove_unused_columns=False,
    report_to="tensorboard",
)

# Define data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model
)

# Create a PyTorch Dataset for training and validation
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels["input_ids"][idx]
        return item

    def __len__(self):
        return len(self.labels["input_ids"])

train_dataset = CustomDataset(train_encodings, train_encodings)
val_dataset = CustomDataset(val_encodings, val_encodings)

# Define Seq2Seq training arguments
seq2seq_training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    logging_dir="./logs",
    load_best_model_at_end=True,
    push_to_hub=False,
    remove_unused_columns=False,
    report_to="tensorboard",
    predict_with_generate=True,  # Enable text generation during evaluation
    generation_max_length=50,   # Adjust as needed for maximum output length
)

# Create a Seq2Seq Trainer for training data
trainer = Seq2SeqTrainer(
    model=model,
    args=seq2seq_training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,  # Use the training data
)

# Fine-tune the model
trainer.train()

# Create a Seq2Seq Trainer for validation data
val_trainer = Seq2SeqTrainer(
    model=model,
    args=seq2seq_training_args,
    data_collator=data_collator,
    eval_dataset=val_dataset,  # Use the validation data
)

# Evaluate the model on the validation dataset
results = val_trainer.evaluate()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible


  0%|          | 0/2394 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 42%|████▏     | 1000/2394 [42:34<45:44,  1.97s/it] 

{'loss': 1.6089, 'learning_rate': 2.9114452798663327e-05, 'epoch': 1.25}


ValueError: Trainer: evaluation requires an eval_dataset.