In [None]:
# !pip install --upgrade pip
# !pip install sentencepiece
# !pip install polars
# !pip install transformers[torch]
# !pip install "accelerate>=0.26.0"
# !pip install python-dotenv
# !pip install datasets
# !pip install protobuf


In [6]:
import polars as pl
import torch
from transformers import (AutoTokenizer,
                          AutoModelForSeq2SeqLM,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments,
                          DataCollatorForSeq2Seq)
from datasets import load_dataset, Dataset, DatasetDict
import sentencepiece
import accelerate
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read secrets (API keys) from a local .env file into the process environment
import os
from dotenv import load_dotenv

load_dotenv()

# Hugging Face access token; None when the variable is not set
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

In [12]:
# Fetch the pretrained checkpoint that is fine-tuned below: tokenizer first,
# then the seq2seq model weights, both from the same Hub repository.
base_checkpoint = "BramVanroy/ul2-small-dutch-simplification-mai-2023"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint, legacy=False)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)



In [None]:
# Read all train-*.parquet shards of the dataset from the Hub with polars
train_shards = 'hf://datasets/UWV/Leesplank_NL_wikipedia_simplifications_preprocessed/data/train-*.parquet'
df = pl.read_parquet(train_shards)

# Hand the frame over to `datasets` by way of pandas
ds = Dataset.from_pandas(df.to_pandas())

In [None]:
# Pick the best available accelerator, falling back to CPU.
# CUDA is checked first: the fp16 flag in the training arguments is also keyed
# off torch.cuda.is_available(), and ROCm builds of PyTorch report AMD GPUs
# through the torch.cuda API. MPS is the Apple-silicon (Metal) backend, not ROCm.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device.type)
model.to(device)


def tokenize_function(examples):
    """Tokenize a batch of prompt/result pairs for seq2seq training.

    Args:
        examples: Batch of rows (mapping of column name to list of values)
            containing the "prompt" and "result" text columns.

    Returns:
        The tokenizer output for the prompts, with the results passed as
        targets through ``text_target``. Sequences are truncated but not
        padded.
    """
    # Deliberately no padding here. Padding the targets to a fixed length would
    # fill the labels with the pad token id, which the loss does not ignore;
    # DataCollatorForSeq2Seq pads each batch dynamically instead and uses its
    # label padding value for the labels.
    return tokenizer(examples["prompt"], text_target=examples["result"], truncation=True)

# Tokenize the whole dataset in batches.
# Every original column is dropped so only the tokenizer outputs remain. Using
# ds.column_names avoids naming a column (such as a pandas index column) that
# may not exist in the dataset, which would make map() fail.
tokenized_datasets = ds.map(tokenize_function, batched=True, remove_columns=ds.column_names)

In [None]:
# Hold out 20% of the tokenized data for validation (80% train).
# A fixed seed keeps the split identical across kernel restarts so runs are
# comparable.
splits = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

# train_test_split names the held-out part "test"; it serves as the validation set here
train_dataset = splits["train"]
valid_dataset = splits["test"]
 


In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run
use_fp16 = torch.cuda.is_available()  # mixed precision only when torch reports a CUDA device

training_args = Seq2SeqTrainingArguments(
    # Checkpointing: write to ./results once per epoch, keep at most three
    output_dir="./results",
    save_strategy="epoch",
    save_total_limit=3,
    # Evaluation: once per epoch
    evaluation_strategy="epoch",
    predict_with_generate=True,
    per_device_eval_batch_size=8,
    # Optimisation
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=use_fp16,
)

# Wire the model, data, and settings into a Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Run fine-tuning; the cell's output is whatever train() returns
trainer.train()

In [None]:
# Authenticate with the Hugging Face Hub and upload the fine-tuned model.
# login() needs the token value held in HUGGINGFACE_TOKEN (read from the
# environment earlier), not the variable's name as a string.
login(token=HUGGINGFACE_TOKEN)

# Trainer.push_to_hub() takes a commit message as its first positional
# argument; the destination repository comes from args.hub_model_id, so the
# repo id is set there rather than passed positionally.
trainer.args.hub_model_id = "UWV/ul2-small-dutch-simplification-okt-2024"
trainer.push_to_hub()