In [None]:
# %%capture
# ! pip install huggingface_hub
# ! pip install torch
# ! pip install transformers==4.40.2
# ! pip install datasets
# ! pip install evaluate
# ! pip install sacrebleu
# ! pip install rouge_score

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the process
# environment. Without this call the import above is unused and HF_TOKEN is
# only found when it has already been exported in the shell.
load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')

In [None]:
! huggingface-cli login --token $HF_TOKEN

In [None]:
import pandas as pd

In [None]:
# Location of the training CSV.
# NOTE(review): the leading "=" looks like a typo for "datas/new_train.csv" —
# confirm against the actual directory name before changing it.
path_data = "=datas/new_train.csv"

In [None]:
# Read the CSV at path_data into a DataFrame; later cells access its
# "English" and "Twi" columns.
raw_df = pd.read_csv(path_data)

In [None]:
# Number of rows that remain after setting aside 20% of the data
# (displayed as a float).
len(raw_df) - len(raw_df) * 0.2

In [None]:
# Train on the first 80% of the rows and hold out the last 20% for validation.
# The two frames are disjoint, so validation metrics are measured on rows the
# model has not been trained on. Integer arithmetic avoids float rounding.
split_idx = len(raw_df) * 8 // 10
train_df = raw_df[:split_idx].copy()
val_df = raw_df[split_idx:].copy()

In [None]:
# Show the English and Twi entries of the row labelled 0 side by side.
print(f"{train_df.English[0]} ---> {train_df.Twi[0]}")

In [None]:
from datasets import Dataset, DatasetDict

# Convert both pandas frames into Hugging Face Dataset objects.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [None]:
# Bundle the two splits under the keys 'train' and 'val'.
dataset = DatasetDict(train=train_dataset, val=val_dataset)

# Display the split summary.
dataset

In [None]:
# Character class of punctuation (plus newline) that is replaced by a space
# during cleaning. It is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes, and "-" is escaped so that it is a literal
# hyphen rather than a range operator.
CHARS_TO_REMOVE_REGEX = r'[!"&(),\-./:;=?+\n\[\]]'
# Column names holding the source and target language text.
SRC_LANG = "English"
TRG_LANG = "Twi"

In [None]:
# Clean data (replace punctuation with spaces; casing is left unchanged)
import re
def remove_special_characters(text):
    """Replace every character matched by CHARS_TO_REMOVE_REGEX with a space.

    Returns the resulting string with leading and trailing whitespace removed.
    """
    return re.sub(CHARS_TO_REMOVE_REGEX, " ", text).strip()

def clean_text(batch):
    """Clean the source and target text stored in `batch`.

    The values under the SRC_LANG and TRG_LANG keys are overwritten with their
    cleaned versions, and the same mapping is returned.
    """
    for column in (SRC_LANG, TRG_LANG):
        batch[column] = remove_special_characters(batch[column])
    return batch

# Run clean_text over every split (non-batched map) and rebind `dataset`
# to the cleaned result.
dataset = dataset.map(clean_text)

In [None]:
# Inspect the English text of training example 32.
dataset['train'][32]['English']

In [None]:
# Inspect the Twi text of training example 32.
dataset['train'][32]['Twi']

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM , NllbTokenizerFast

# The tokenizer and the model weights are loaded from the same checkpoint.
MODEL_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# src_lang / tgt_lang are the language codes handed to the tokenizer for the
# source (English) and target (Twi) text.
tokenizer = NllbTokenizerFast.from_pretrained(
    MODEL_CHECKPOINT,
    src_lang="eng_Latn",
    tgt_lang="twi_Latn",
)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Upper bound on the tokenized sequence length.
max_length = 64


def preprocess_function(examples):
    """Tokenize source sentences together with their target sentences.

    The SRC_LANG column of `examples` is passed as the tokenizer input and the
    TRG_LANG column as `text_target`, with truncation enabled at `max_length`.
    Whatever the tokenizer returns is returned unchanged.
    """
    return tokenizer(
        examples[SRC_LANG],
        text_target=examples[TRG_LANG],
        max_length=max_length,
        truncation=True,
    )


In [None]:
# Tokenize every split in batches and drop the original columns (named after
# the train split's columns), leaving only what preprocess_function returns.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator that turns lists of tokenized examples into model-ready batches;
# it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Sanity check: collate the training examples at indices 1 and 2 and list the
# keys of the resulting batch.
sample_features = [tokenized_datasets["train"][idx] for idx in (1, 2)]
batch = data_collator(sample_features)
batch.keys()

In [None]:
# Print the label ids of the training examples at indices 1 and 2.
for idx in (1, 2):
    print(tokenized_datasets["train"][idx]["labels"])

In [None]:
import evaluate

# Load the ROUGE and sacreBLEU metrics once; compute_metrics reads these
# module-level objects on every evaluation.
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np

def _decode_token_ids(token_ids):
    """Decode a batch of token ids into whitespace-stripped strings.

    Ids the tokenizer cannot represent (negative markers such as -100, or
    values at/above ``len(tokenizer)``) are first replaced by the pad token id.
    They are then dropped together with the other special tokens instead of
    being turned into an unrelated real token.
    """
    token_ids = np.asarray(token_ids, dtype=np.int64)
    # len(tokenizer) counts added tokens as well, unlike tokenizer.vocab_size,
    # so valid added-token ids are not treated as out of range.
    undecodable = (token_ids < 0) | (token_ids >= len(tokenizer))
    token_ids = np.where(undecodable, tokenizer.pad_token_id, token_ids)
    decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    return [text.strip() for text in decoded]


def compute_metrics(eval_preds):
    """Compute ROUGE-1 and BLEU for a set of predictions.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        dict with the keys ``"rouge1"`` and ``"bleu"``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Predictions and labels go through the same decoding path.
    decoded_preds = _decode_token_ids(preds)
    decoded_labels = _decode_token_ids(labels)

    rouge_result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )
    # sacreBLEU is given a list of references per prediction.
    bleu_result = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )

    return {"rouge1": rouge_result["rouge1"], "bleu": bleu_result["score"]}



In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: evaluate once per epoch, write no intermediate
# checkpoints, and compute metrics on generated sequences.
args = Seq2SeqTrainingArguments(
    "saved_model",  # output directory
    report_to='none',
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=25,
    predict_with_generate=True,
    # Generate with the same length limit that is used for tokenization and for
    # the stand-alone trainer.evaluate(max_length=max_length) call, so the
    # per-epoch metrics are comparable with that baseline.
    generation_max_length=max_length,
    fp16=True,
    push_to_hub=False,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, data splits, collator, tokenizer and
# metric function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import random
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm, trange
from transformers.optimization import Adafactor, AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup

import gc
import numpy as np

def cleanup():
    """Free memory: run the garbage collector, then empty the CUDA cache."""
    # Collect first so that unreachable objects are gone before the CUDA
    # cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


# Start from a clean slate before evaluation and training.
cleanup()

In [None]:
# Baseline: evaluate on the validation split before any fine-tuning, passing
# max_length through as a generation argument.
trainer.evaluate(max_length=max_length)

In [None]:
# Release memory left over from the baseline evaluation before training.
cleanup()

In [None]:
# Fine-tune the model.
trainer.train()

# Save the tokenizer and the model to the output directory, then let the
# trainer save there as well.
save_dir = "saved_model"
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)
trainer.save_model(save_dir)