In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

## Loading GPT-2

In [None]:
# Cargar modelo y tokenizador
model_path = "./gpt2_model_trained_2"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

print("Modelo cargado correctamente.")

Modelo cargado correctamente.


## Preparing data

In [None]:
data_path = "data_3.txt"
dataset = load_dataset("text", data_files={"train": data_path})

# Ver un ejemplo del dataset
(dataset["train"][:4])

Generating train split: 0 examples [00:00, ? examples/s]

{'text': ['Título: Vaya con Dios',
  'Artista: Julio Iglesias',
  'Sentimientos: positivo, alegria, anticipacion',
  'momento llego separarnos silencio corazon dice suspira vaya dios vida vaya dios amor campanas iglesia suenan tristes parece sonar tambien dicen vaya dios vida vaya dios amor adonde vayas ire contigo sueños junto siempre estare voz escucharas dulce amor mio pensando estaras volvernos siempre ver alborada despertar dice espera corazon voy adonde quieras vaya dios vida vaya dios amor vaya dios vida vaya dios amor you might also like']}

In [None]:
def tokenize_function(examples):
    tokenized_output = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True)

## Training GPT-2

In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",  # Carpeta donde guardar los modelos entrenados
    run_name="fine_tuning_gpt2_lyrics",
    overwrite_output_dir=True,  # Sobrescribir salida previa
    num_train_epochs=3,  # Número de épocas
    per_device_train_batch_size=4,  # Tamaño del batch
    save_steps=500,  # Guardar cada 500 pasos
    save_total_limit=2,  # Guardar solo los 2 últimos checkpoints
    logging_dir="./logs", # Directorio para logs
    report_to="none",
    logging_steps=1000,  # Intervalo de logs
    do_train=True,  # Habilitar entrenamiento
    do_eval=False,  # Omitir evaluación
    warmup_steps=100,  # Número de pasos de calentamiento
    learning_rate=5e-5,  # Tasa de aprendizaje
    weight_decay=0.01,  # Decaimiento de pesos
)


In [8]:
# Crear el objeto Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Iniciar el entrenamiento
trainer.train()


Step,Training Loss
1000,0.228
2000,0.2493
3000,0.2051
4000,0.2185
5000,0.1882
6000,0.2012


TrainOutput(global_step=6093, training_loss=0.2151465235174377, metrics={'train_runtime': 4275.6445, 'train_samples_per_second': 5.7, 'train_steps_per_second': 1.425, 'total_flos': 6368209403904000.0, 'train_loss': 0.2151465235174377, 'epoch': 3.0})

In [9]:
model.save_pretrained("./gpt2_model_trained_3")
tokenizer.save_pretrained("./gpt2_model_trained_3")

('./gpt2_model_trained_3/tokenizer_config.json',
 './gpt2_model_trained_3/special_tokens_map.json',
 './gpt2_model_trained_3/vocab.json',
 './gpt2_model_trained_3/merges.txt',
 './gpt2_model_trained_3/added_tokens.json',
 './gpt2_model_trained_3/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!mkdir -p /content/drive/MyDrive/gpt2_model_trained_3
!cp -r ./gpt2_model_trained_3/ /content/drive/MyDrive/gpt2_model_trained_3

Mounted at /content/drive
