In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import os

# Set CUDA_LAUNCH_BLOCKING for better debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [16]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import os

# Set CUDA_LAUNCH_BLOCKING for better debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

df = pd.read_csv('filter_data.csv')
# Inicializar el modelo de GPT-2
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

#  Assign the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Crear una lista para almacenar los datos preparados
prepared_data = []

# Iterar sobre cada fila del DataFrame
for index, row in df.iterrows():
    # Concatenar letras con sentimientos
    input_text = f"{row['lyrics']} [SENTIMENTS] {row['feelings']}"
    prepared_data.append(input_text)

# Tokenizar los datos, adding padding and truncation to ensure consistent input length
tokenized_data = [tokenizer(text, return_tensors='pt', padding='max_length', max_length=512, truncation=True) for text in prepared_data]



In [17]:
# Mover el modelo a la GPU (si está disponible)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)

cuda


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-4)

In [18]:
# Configuración de parámetros
batch_size = 8  # Reduce batch size significantly
num_epochs = 3  # Ajusta según sea necesario

# Entrenamiento
for epoch in range(num_epochs):
    # Procesar datos en lotes para reducir el uso de memoria
    for i in range(0, len(tokenized_data), batch_size):
        batch = tokenized_data[i:i + batch_size]

        # Create a dictionary to store the batched tensors
        batch_tensors = {}

        # Extract and stack the relevant tensors from the batch
        for key in batch[0].keys():  # Get the keys from the first BatchEncoding in the batch
            batch_tensors[key] = torch.stack([b[key] for b in batch]).to(device)

        # Realizar entrenamiento con el lote
        # Use gradient accumulation to simulate a larger batch size
        # Accumulate gradients over 'accumulation_steps' before updating weights
        accumulation_steps = 4 # Adjust as needed

        for step in range(accumulation_steps):
            optimizer.zero_grad()  # Asegúrate de reiniciar los gradientes antes de cada batch

            # Adelante (Forward) para el lote
            outputs = model(**batch_tensors, labels=batch_tensors['input_ids']) # Pass the batched tensors as keyword arguments
            loss = outputs.loss / accumulation_steps # Normalize loss for gradient accumulation

            # Retropropagación (Backward)
            loss.backward()

            if (step + 1) % accumulation_steps == 0: # Update weights every accumulation_steps
                # Actualizar los pesos
                optimizer.step()

            print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}")

Epoch: 0, Step: 0, Loss: 1.200926661491394
Epoch: 0, Step: 1, Loss: 1.200926661491394
Epoch: 0, Step: 2, Loss: 1.200926661491394
Epoch: 0, Step: 3, Loss: 1.200926661491394
Epoch: 0, Step: 0, Loss: 1.1323869228363037
Epoch: 0, Step: 1, Loss: 1.1323869228363037
Epoch: 0, Step: 2, Loss: 1.1323869228363037
Epoch: 0, Step: 3, Loss: 1.1323869228363037
Epoch: 0, Step: 0, Loss: 1.1162346601486206
Epoch: 0, Step: 1, Loss: 1.1162346601486206
Epoch: 0, Step: 2, Loss: 1.1162346601486206
Epoch: 0, Step: 3, Loss: 1.1162346601486206
Epoch: 0, Step: 0, Loss: 1.2193489074707031
Epoch: 0, Step: 1, Loss: 1.2193489074707031
Epoch: 0, Step: 2, Loss: 1.2193489074707031
Epoch: 0, Step: 3, Loss: 1.2193489074707031
Epoch: 0, Step: 0, Loss: 1.2308796644210815
Epoch: 0, Step: 1, Loss: 1.2308796644210815
Epoch: 0, Step: 2, Loss: 1.2308796644210815
Epoch: 0, Step: 3, Loss: 1.2308796644210815
Epoch: 0, Step: 0, Loss: 1.3247015476226807
Epoch: 0, Step: 1, Loss: 1.3247015476226807
Epoch: 0, Step: 2, Loss: 1.324701547

In [19]:
model.save_pretrained('./trained_gpt2_spa_local')
tokenizer.save_pretrained('./trained_gpt2_spa_local')

('./trained_gpt2_spa_local/tokenizer_config.json',
 './trained_gpt2_spa_local/special_tokens_map.json',
 './trained_gpt2_spa_local/vocab.json',
 './trained_gpt2_spa_local/merges.txt',
 './trained_gpt2_spa_local/added_tokens.json')

In [22]:
import shutil

# Comprimir el directorio en un archivo zip
shutil.make_archive('trained_gpt2_spa_local', 'zip', './trained_gpt2_spa_local')


'/content/trained_gpt2_spa_local.zip'

In [24]:
from google.colab import files  # Solo si usas Google Colab

# Descargar el archivo ZIP
files.download('trained_gpt2_spa_local.zip')  # Asegúrate de que el archivo ZIP existe en el directorio actual

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
# prompt: dime cómo guardar un .zip en /content/drive

shutil.move('trained_gpt2_spa_local.zip', '/content/drive/MyDrive/')

'/content/drive/MyDrive/trained_gpt2_spa_local.zip'