<a href="https://colab.research.google.com/github/jcmachicaocuf/codigos_CUF_LLM_NLP/blob/main/U3__fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AdamW
import os

In [None]:
class SimpleTextDataset(Dataset):
    """Line-per-sample text dataset for causal language-model fine-tuning.

    Every line of ``text_file`` that contains non-whitespace characters
    becomes one sample. All samples are tokenized once, up front, and are
    truncated/padded to ``max_length`` tokens.

    Args:
        text_file: Path to a UTF-8 text file with one training example per line.
        tokenizer: Callable tokenizer that accepts a list of strings plus the
            ``truncation``, ``max_length``, ``padding`` and ``return_tensors``
            keywords and returns an object exposing ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Number of tokens every sample is truncated/padded to.

    Raises:
        ValueError: If the file contains no non-blank lines.
    """

    # Label value for positions that must not contribute to the loss
    # (the ignore index used by the Hugging Face LM heads / CrossEntropyLoss).
    IGNORE_INDEX = -100

    def __init__(self, text_file, tokenizer, max_length=128):
        with open(text_file, 'r', encoding='utf-8') as f:
            # Drop blank *and* whitespace-only lines; the kept lines are passed
            # to the tokenizer unchanged.
            texts = [line for line in f.read().split('\n') if line.strip()]

        if not texts:
            raise ValueError(f"No non-empty lines found in {text_file!r}")

        self.encodings = tokenizer(texts,
                                   truncation=True,
                                   max_length=max_length,
                                   padding='max_length',
                                   return_tensors='pt')

        # Labels are a *copy* of the inputs with padded positions masked out.
        # Without the mask, the loss would also be computed over every padding
        # token, which dominates short samples.
        self.labels = self.encodings.input_ids.clone()
        self.labels[self.encodings.attention_mask == 0] = self.IGNORE_INDEX

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` of sample ``idx``."""
        return {
            'input_ids': self.encodings.input_ids[idx],
            'attention_mask': self.encodings.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [None]:
def train_epoch(model, train_loader, optimizer, device):
    """Run one optimization pass over ``train_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keywords; its output must expose a ``loss`` attribute.
        train_loader: Sized iterable of batches, each a mapping holding the
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The sum of the per-batch loss values divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in train_loader:
        # Move the three tensors the model consumes onto the target device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass; the model computes the loss itself from the labels.
        loss = model(**model_inputs).loss
        batch_losses.append(loss.item())

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(train_loader)

In [None]:
def fine_tune_gpt2(text_file, output_dir="./fine_tuned_gpt2", epochs=3):
    """Fine-tune the pretrained ``gpt2`` checkpoint on a line-per-sample text file.

    Args:
        text_file: Path to the training text file (passed to ``SimpleTextDataset``).
        output_dir: Directory the fine-tuned model and tokenizer are saved to;
            created if it does not exist.
        epochs: Number of passes over the training data.

    Returns:
        A ``(model, tokenizer)`` tuple with the fine-tuned model and the
        tokenizer used for training.
    """
    # Select the device: GPU when available, CPU otherwise.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the pretrained model and tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Reuse EOS as the padding token so the dataset can pad to a fixed length,
    # then make sure the embedding matrix matches the tokenizer size.
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))

    # Move the model to the selected device.
    model.to(device)

    # Build the dataset and a shuffled loader with 4 samples per batch.
    dataset = SimpleTextDataset(text_file, tokenizer)
    train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

    # Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in
    # favor of it. `eps` and `weight_decay` are set explicitly to the defaults
    # of the transformers implementation so the optimizer hyperparameters are
    # unchanged (torch defaults are eps=1e-8, weight_decay=1e-2).
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
    )

    # Training loop: one `train_epoch` call per epoch, reporting the mean loss.
    print("Iniciando entrenamiento...")
    for epoch in range(epochs):
        avg_loss = train_epoch(model, train_loader, optimizer, device)
        print(f"Epoch {epoch + 1}/{epochs}, Average loss: {avg_loss:.4f}")

    # Save model and tokenizer; exist_ok avoids the check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Modelo guardado en {output_dir}")

    return model, tokenizer

In [None]:
def generate_text(prompt, model, tokenizer, max_length=100,
                  temperature=0.7, do_sample=False):
    """Generate a continuation of ``prompt`` with ``model``.

    Args:
        prompt: Text the generation starts from.
        model: Model exposing ``eval()``, a ``device`` attribute and
            ``generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Maximum total length (prompt + generated tokens) passed
            to ``generate``.
        temperature: Sampling temperature; only forwarded when ``do_sample``
            is true, because it has no effect on deterministic decoding.
        do_sample: When true, sample from the distribution instead of
            decoding deterministically. Defaults to ``False`` to keep the
            previous, deterministic output.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Switch the model to evaluation mode.
    model.eval()

    # Encode the prompt and move it to the model's device; leaving the tensors
    # on the CPU fails as soon as the model lives on a GPU.
    encoded = tokenizer(prompt, return_tensors='pt')
    target_device = model.device
    input_ids = encoded['input_ids'].to(target_device)
    attention_mask = encoded['attention_mask'].to(target_device)

    # Fall back to EOS when the tokenizer defines no pad token, so `generate`
    # does not have to guess it.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generation_kwargs = {
        'attention_mask': attention_mask,
        'max_length': max_length,
        'num_return_sequences': 1,
        'no_repeat_ngram_size': 2,
        'pad_token_id': pad_token_id,
        'do_sample': do_sample,
    }
    if do_sample:
        generation_kwargs['temperature'] = temperature

    # Generate without tracking gradients.
    with torch.no_grad():
        outputs = model.generate(input_ids, **generation_kwargs)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Example usage
text_file = "datos_train_v4.txt"  # You must create this file with your own texts

# Fine-tune for a single epoch
model, tokenizer = fine_tune_gpt2(text_file=text_file, epochs=1)

# Generate a sample continuation for a prompt
prompt = (
    "What is the difference between perkoars and darguiens "
    "and how they help eachother in the system?"
)
generated = generate_text(prompt=prompt, model=model, tokenizer=tokenizer)
print("\nTexto generado:", generated, sep="\n")

Using device: cpu




Iniciando entrenamiento...
Epoch 1/1, Average loss: 6.0594


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Modelo guardado en ./fine_tuned_gpt2

Texto generado:
What is the difference between perkoars and darguiens and how they help eachother in the system?

The perkars are the most important part of the game. They are a very important thing. The dkuien are very useful.
.
