## Instalación de librerías

In [None]:
# Notebook dependencies: transformers (with the torch and sentencepiece extras),
# datasets, evaluate, sacremoses and sacrebleu.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
! pip install transformers[torch,sentencepiece]
! pip install datasets
! pip install evaluate
! pip install sacremoses
! pip install sacrebleu

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from datasets import Dataset
import evaluate
import numpy as np
import os

## Creación de Datasets

Definición de funciones para la creación de los datasets.
Estrategia: los pares de oraciones se recorren en bloques de 10; de cada bloque, un par se asigna al conjunto de desarrollo, otro al de prueba y los ocho restantes al de entrenamiento.

In [None]:
def read_file(file_path):
    """Read a UTF-8 text file and return its lines without surrounding whitespace.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line of the file, in file order; every
        string has been passed through ``str.strip``.
    """
    stripped_lines = []
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def _parse_index_list(field):
    """Turn a bracketed, comma-separated field such as ``[0, 1]`` into a list of ints."""
    inner = field.strip().strip('[]')
    # Empty or whitespace-only items (e.g. from "[]") produce no index.
    return [int(item) for item in inner.split(',') if item.strip()]


def read_alignments(alignment_path):
    """Parse an alignment file into (source indices, target indices) pairs.

    Every non-blank line is expected to hold two colon-separated bracketed
    lists of integer indices, optionally followed by a third field that is
    ignored, e.g. ``[0,1]:[2]:<extra>``.

    Args:
        alignment_path: Path of the UTF-8 alignment file.

    Returns:
        A list of ``(src_indices, tgt_indices)`` tuples, one per non-blank
        line, in file order. Each element is a (possibly empty) list of ints.

    Raises:
        ValueError: If a non-blank line has fewer than two colon-separated
            fields, or if an index is not an integer.
    """
    alignments = []
    with open(alignment_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (for instance a trailing empty line) carry no alignment.
                continue
            # Split at most twice so a colon inside the ignored third field
            # cannot break the parsing of the first two.
            fields = line.split(':', 2)
            if len(fields) < 2:
                raise ValueError(
                    f"Malformed alignment at line {line_number} of {alignment_path}: {line!r}"
                )
            alignments.append((_parse_index_list(fields[0]), _parse_index_list(fields[1])))
    return alignments

def create_dataset(source_path, target_path, alignment_path):
    """Build a parallel dataset from two line-based text files and an alignment file.

    For every alignment, the referenced source lines are joined with a single
    space, and likewise the referenced target lines. Indices at or beyond the
    end of the corresponding file are skipped. A pair is kept only when both
    joined texts are non-empty.

    Args:
        source_path: Path of the source-language text file.
        target_path: Path of the target-language text file.
        alignment_path: Path of the alignment file read by ``read_alignments``.

    Returns:
        A ``Dataset`` with two columns, ``'src'`` and ``'tgt'``.
    """
    source_lines = read_file(source_path)
    target_lines = read_file(target_path)

    def join_lines(lines, indices):
        # Only indices below len(lines) contribute to the joined text.
        return ' '.join(lines[i] for i in indices if i < len(lines))

    kept_pairs = []
    for src_indices, tgt_indices in read_alignments(alignment_path):
        src_text = join_lines(source_lines, src_indices)
        tgt_text = join_lines(target_lines, tgt_indices)
        if not (src_text and tgt_text):
            # Drop pairs where either side ended up empty.
            continue
        kept_pairs.append((src_text, tgt_text))

    return Dataset.from_dict({
        'src': [src for src, _ in kept_pairs],
        'tgt': [tgt for _, tgt in kept_pairs],
    })

def split_dataset(dataset):
    """Split a dataset into train, test and dev subsets by row position.

    Within every run of ten consecutive rows, the row at position 9 goes to
    the dev subset, the row at position 8 goes to the test subset and the
    remaining rows go to the training subset.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.

    Returns:
        A ``(train_dataset, test_dataset, dev_dataset)`` tuple built with
        ``dataset.select``.
    """
    train_rows, test_rows, dev_rows = [], [], []
    # Remainder modulo 10 -> destination list; anything not listed is training data.
    destination = {8: test_rows, 9: dev_rows}
    for row in range(len(dataset)):
        destination.get(row % 10, train_rows).append(row)

    return (
        dataset.select(train_rows),
        dataset.select(test_rows),
        dataset.select(dev_rows),
    )

Carga de los ficheros de texto extraídos y del fichero que contiene solo los índices de los pares alineados.

In [None]:
from google.colab import files

# Upload the input files into the Colab session; the cell output lists each saved file.
# `uploaded` keeps whatever files.upload() returns.
uploaded = files.upload()

Saving extracted_texts(Aranes)_temp_0.5_procesado.txt to extracted_texts(Aranes)_temp_0.5_procesado.txt


Se indican las rutas de los archivos cargados.
*En caso de que los archivos se hayan renombrado, hay que modificar las rutas en el código.

In [None]:
# Paths of the uploaded files; edit them if the files were renamed.
source_path = '/content/extracted_texts(Aranes)_temp_0.5_procesado.txt'
target_path = '/content/extracted_texts(Español)_temp0.5_procesado.txt'
# File holding the indices of the aligned sentence pairs.
alignment_path = '/content/indices alineaciones_aranes_español.txt'
# Build the parallel dataset with 'src' and 'tgt' columns.
dataset = create_dataset(source_path, target_path, alignment_path)

In [None]:
# Apply the split function
train_dataset, test_dataset, dev_dataset = split_dataset(dataset)

# Check the size of each subset and show the first training pair
print(f"Train size: {len(train_dataset)}")
print(f"Test size: {len(test_dataset)}")
print(f"Dev size: {len(dev_dataset)}")
print(train_dataset[0])

def save_dataset_to_txt(dataset, file_path):
    """Write every source/target pair of a dataset to a UTF-8 text file.

    Each example produces a ``source: '...'`` line, a ``target: '...'`` line
    and one blank line.

    Args:
        dataset: Iterable of mappings with ``'src'`` and ``'tgt'`` entries.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        for row in dataset:
            src, tgt = row['src'], row['tgt']
            out.writelines((f"source: '{src}'\n", f"target: '{tgt}'\n\n"))

def save_test_sources_to_txt(dataset, file_path):
    """Write only the source side of a dataset, leaving every target blank.

    Each example produces a ``source: '...'`` line, a bare ``target:`` line
    and one blank line.

    Args:
        dataset: Iterable of mappings with a ``'src'`` entry.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        for example in dataset:
            out.writelines((f"source: '{example['src']}'\n", "target:\n\n"))



# Save each subset as a text file with its source/target pairs
save_dataset_to_txt(train_dataset, '/content/train_dataset.txt')
save_dataset_to_txt(test_dataset, '/content/test_dataset.txt')
save_dataset_to_txt(dev_dataset, '/content/dev_dataset.txt')

# Save a file that contains only the source-language side of the test set
# (relative path, unlike the absolute /content paths above)
save_test_sources_to_txt(test_dataset, 'test_sources.txt')

Train size: 1109
Test size: 138
Dev size: 138
{'src': 'O Quan auia sies ans, un viatge, vedé ua magnifica imatge, en un libre sus eth Bòsc Verge que se didie «Istòries Vesucudes».', 'tgt': 'Cuando yo tenía seis años vi en un libro sobre la selva virgen que se titulaba "Historias vividas", una magnífica lámina.'}


## Traducir el conjunto de pruebas utilizando el modelo preentrenado

In [None]:
def evaluate_model(modelo, dataset=None, device=None):
    """Translate a test set with a translation pipeline and print its sacreBLEU result.

    Args:
        modelo: Model identifier or path passed as ``model`` to ``pipeline``.
        dataset: Iterable of examples with ``'src'`` and ``'tgt'`` entries.
            Defaults to the notebook-level ``test_dataset``.
        device: Device passed to ``pipeline``. Defaults to ``"cuda:0"`` when
            CUDA is available and to ``"cpu"`` otherwise, so the cell also
            runs on a runtime without GPU.

    Returns:
        None. The dictionary returned by the metric is printed.
    """
    import torch  # local import: torch is not among the notebook's visible imports

    if dataset is None:
        dataset = test_dataset
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Collect the input sentences and the references from the dataset.
    inputs = [ex['src'] for ex in dataset]
    # sacreBLEU expects a list of reference lists (one list per prediction).
    references = [[ex['tgt']] for ex in dataset]

    # Set up the translation pipeline.
    translator = pipeline("translation", model=modelo, device=device, batch_size=32)

    # Translate the sentences.
    outputs = translator(inputs)
    translated_texts = [out['translation_text'] for out in outputs]

    # Load the sacreBLEU metric and score the translations.
    metric = evaluate.load("sacrebleu")
    results = metric.compute(predictions=translated_texts, references=references)
    print(results)

    del translator



In [None]:
# Pretrained checkpoint used both for the baseline evaluation and as the starting
# point for fine-tuning (presumably Catalan -> Spanish, judging by its name).
model_name = 'Helsinki-NLP/opus-mt-ca-es'


In [None]:
# Baseline: score the pretrained checkpoint on the test set before any fine-tuning.
evaluate_model(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/281M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/811k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.15M [00:00<?, ?B/s]



Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

{'score': 3.423941420324127, 'counts': [414, 86, 28, 5], 'totals': [1572, 1434, 1297, 1164], 'precisions': [26.33587786259542, 5.99721059972106, 2.158828064764842, 0.42955326460481097], 'bp': 0.984222482004929, 'sys_len': 1572, 'ref_len': 1597}


## Preprocesamiento de los conjuntos de datos antes de su uso para el ajuste fino

El preprocesamiento implica la tokenización de las frases de los conjuntos de datos mediante el tokenizador incluido en el modelo preentrenado

In [None]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples, source, target, tokenizer):
    """Tokenize a batch of sentence pairs into model inputs with labels.

    Args:
        examples: Batch mapping column names to lists of strings.
        source: Name of the column holding the source sentences.
        target: Name of the column holding the target sentences.
        tokenizer: Tokenizer callable that accepts ``text_target``.

    Returns:
        The tokenizer output for the source sentences, with an extra
        ``"labels"`` entry holding the ``input_ids`` of the tokenized targets.
    """
    inputs = examples[source]
    targets = examples[target]
    model_inputs = tokenizer(inputs, max_length=max_input_length, padding=True, truncation=True)
    # The references must be tokenized as target-language text: the checkpoint
    # ships separate source and target SentencePiece models, and a plain
    # tokenizer(targets) call would segment them with the source one.
    # NOTE(review): padding=True fills the labels with the pad token id, so padded
    # positions are not masked with -100 for the loss; moving padding to the data
    # collator would fix that, but the metric code must then handle -100.
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint and show its configuration.
tokenizer_ara_es=AutoTokenizer.from_pretrained(model_name)
print(tokenizer_ara_es)
# Tokenize the training pairs in batches; fn_kwargs supplies the column names
# and the tokenizer that preprocess_function expects.
tokenized_train_dataset_ara_es = train_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={
        'source': 'src',
        'target': 'tgt',
        'tokenizer': tokenizer_ara_es
    }
)
# Same preprocessing for the development set.
tokenized_dev_dataset_ara_es = dev_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={
        'source': 'src',
        'target': 'tgt',
        'tokenizer': tokenizer_ara_es
    }
)

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-ca-es', vocab_size=49621, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	49620: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


Map:   0%|          | 0/1109 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

## Fine Tune del modelo en el conjunto de datos creado anteriormente
Antes de realizar el ajuste fino, hay que establecer la métrica de evaluación automática que se utilizará sobre el conjunto de desarrollo. A continuación, se ejecuta el algoritmo de entrenamiento sobre el conjunto de entrenamiento.

### Se define la métrica que se utilizará en el conjunto de desarrollo

In [None]:
import evaluate  # already imported in the imports cell near the top of the notebook

# sacreBLEU metric object used by compute_metrics below.
metric = evaluate.load("sacrebleu") # BLEU

import numpy as np  # already imported in the imports cell near the top of the notebook
def postprocess_text(preds, labels):
    """Strip whitespace from predictions and references before scoring.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        one stripped reference.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for a batch of evaluation predictions.

    Relies on the notebook-level ``tokenizer_ara_es`` and ``metric`` objects.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        A dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer_ara_es.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so it has to be
    # replaced by the pad token before decoding (and before counting lengths).
    preds = np.where(np.asarray(preds) != -100, preds, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer_ara_es.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer_ara_es.batch_decode(labels, skip_special_tokens=True)

    # Light post-processing: strip whitespace and wrap each reference in a list.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Fine-Tuning

In [None]:
batch_size=64
patience=10
def fine_tune_model(model_name, output_dir, tokenizer, compute_metrics, tokenized_train_dataset, tokenized_dev_dataset, num_train_epochs=80):
    """Fine-tune a seq2seq checkpoint and save the result in ``output_dir``.

    Training is skipped when ``output_dir`` already contains a
    ``model.safetensors`` file. Otherwise the model is trained with evaluation
    and checkpointing once per epoch, early stopping after ``patience``
    evaluations without improvement, and the best checkpoint according to the
    ``bleu`` metric is reloaded at the end and saved.

    Args:
        model_name: Checkpoint name or path to start from.
        output_dir: Directory for checkpoints and the final model.
        tokenizer: Tokenizer used by the data collator and the trainer.
        compute_metrics: Callable given to the trainer; it must report a
            ``bleu`` value.
        tokenized_train_dataset: Tokenized training set.
        tokenized_dev_dataset: Tokenized development set used for evaluation.
        num_train_epochs: Maximum number of training epochs.

    Returns:
        None.
    """
    model_file_path = os.path.join(output_dir, "model.safetensors")
    if os.path.exists(model_file_path):
        print(f"Model already fine-tuned and saved at {output_dir}. Skipping training.")
        return

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda:0") # Load in GPU 0
    # The collator's `model` argument is meant to be the model instance being
    # trained, not its name.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # NOTE(review): `evaluation_strategy` has been renamed `eval_strategy` in recent
    # transformers releases; confirm against the installed version.
    args = Seq2SeqTrainingArguments(
        output_dir = output_dir,
        evaluation_strategy = "epoch",
        save_strategy="epoch",
        # Log once per epoch so the training loss shows up next to the evaluation results.
        logging_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_train_epochs,
        predict_with_generate=True,
        fp16=True,
        metric_for_best_model="bleu",
        greater_is_better=True, # Higher BLEU is better
        load_best_model_at_end=True, # Uses metric_for_best_model to compare models
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_dev_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=patience)], # It uses metric_for_best_model
    )

    trainer.train()
    trainer.save_model()

    del trainer
    del model
    del data_collator

In [None]:
# Directory where the fine-tuned model and its checkpoints are written.
output_model_ara_path='/content/fine-tuned_ara_es'

In [None]:
# Fine-tune the pretrained checkpoint on the training set, evaluating on the dev set.
fine_tune_model(model_name, output_model_ara_path, tokenizer_ara_es, compute_metrics, tokenized_train_dataset_ara_es, tokenized_dev_dataset_ara_es)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,2.341304,4.8289,20.471
2,No log,1.476789,5.804,20.2754
3,No log,1.259487,7.1865,20.9275
4,No log,1.151066,7.4305,20.6014
5,No log,1.081284,8.0776,20.6014
6,No log,1.028562,10.5941,20.5942
7,No log,0.990961,10.9551,20.058
8,No log,0.961454,10.5349,20.3406
9,No log,0.935577,10.6743,20.4638
10,No log,0.913842,11.3288,20.558


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[49620]], 'forced_eos_token_id': 0}
