In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

## Loading GPT-2

In [None]:
# Cargar modelo y tokenizador
model_path = "./gpt2_model_trained_1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

print("Modelo cargado correctamente.")

Modelo cargado correctamente.


## Preparing data

In [None]:
data_path = "data_2.txt"
dataset = load_dataset("text", data_files={"train": data_path})

# Ver un ejemplo del dataset
(dataset["train"][:4])

Generating train split: 0 examples [00:00, ? examples/s]

{'text': ['Título: No Hieras Mi Corazón',
  'Artista: Gian Marco',
  'Sentimientos: positivo, miedo, alegria',
  'bien ahi quieres ir hagas rogar nena voy extrañar si cosas van cambiando vamos avanzando papel acabando escribir cansando solo cielo azul solo verde jardin cambies color solo quieres ir favor hieras corazon favor quiero romper reloj quiero pronto salida quiero ayude dias unica pastilla unica salidayou might also like']}

In [None]:
def tokenize_function(examples):
    tokenized_output = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True)

## Training GPT-2

In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./results",  # Carpeta donde guardar los modelos entrenados
    run_name="fine_tuning_gpt2_lyrics",
    overwrite_output_dir=True,  # Sobrescribir salida previa
    num_train_epochs=3,  # Número de épocas
    per_device_train_batch_size=4,  # Tamaño del batch
    save_steps=500,  # Guardar cada 500 pasos
    save_total_limit=2,  # Guardar solo los 2 últimos checkpoints
    logging_dir="./logs", # Directorio para logs
    report_to="none",
    logging_steps=1000,  # Intervalo de logs
    do_train=True,  # Habilitar entrenamiento
    do_eval=False,  # Omitir evaluación
    warmup_steps=100,  # Número de pasos de calentamiento
    learning_rate=5e-5,  # Tasa de aprendizaje
    weight_decay=0.01,  # Decaimiento de pesos
)


In [7]:
# Crear el objeto Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Iniciar el entrenamiento
trainer.train()


Step,Training Loss
1000,0.3277
2000,0.3317
3000,0.2831
4000,0.2922
5000,0.2503
6000,0.2852


TrainOutput(global_step=6093, training_loss=0.29489273103845154, metrics={'train_runtime': 4342.8419, 'train_samples_per_second': 5.612, 'train_steps_per_second': 1.403, 'total_flos': 6368209403904000.0, 'train_loss': 0.29489273103845154, 'epoch': 3.0})

In [8]:
model.save_pretrained("./gpt2_model_trained_2")
tokenizer.save_pretrained("./gpt2_model_trained_2")

('./gpt2_model_trained_2/tokenizer_config.json',
 './gpt2_model_trained_2/special_tokens_map.json',
 './gpt2_model_trained_2/vocab.json',
 './gpt2_model_trained_2/merges.txt',
 './gpt2_model_trained_2/added_tokens.json',
 './gpt2_model_trained_2/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!mkdir -p /content/drive/MyDrive/gpt2_model_trained_2
!cp -r ./gpt2_model_trained_2/ /content/drive/MyDrive/gpt2_model_trained_2

Mounted at /content/drive
