In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

In [5]:
# 1. Carregar o modelo pré-treinado
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [6]:
from datasets import Dataset

# Carregar texto bruto
with open("txts/arquivo_convertido.txt", "r", encoding="utf-8") as f:
    data = f.read()

# Normalizar divisões entre parágrafos
normalized_data = data.replace("\n", "\n\n").replace("\n\n\n", "\n\n")  # Garante divisões consistentes

# Dividir em parágrafos
paragraphs = [p.strip() for p in normalized_data.split("\n\n") if p.strip()]  # Remove espaços e linhas vazias

# Criar dataset
dataset = Dataset.from_dict({"text": paragraphs})

# Verificar o dataset
print(dataset)
print(len(dataset))

Dataset({
    features: ['text'],
    num_rows: 66613
})
66613


In [7]:
print(dataset["text"][:5])

['UNIVERSIDADE FEDERAL DE SERGIPE', 'reitor', 'Prof. Dr. Angelo Roberto Antoniolli', 'vice-reitor', 'Prof. Dr. André Maurício Conceição de Souza']


In [8]:
split_datasets = dataset.train_test_split(test_size=0.3)  # 10% para validação
print(split_datasets)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 46629
    })
    test: Dataset({
        features: ['text'],
        num_rows: 19984
    })
})


In [9]:
# Função de tokenização
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Aplicar a tokenização
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Dividir o dataset tokenizado em treino e validação
split_datasets = tokenized_datasets.train_test_split(test_size=0.3)

# Verificar o dataset tokenizado
print(split_datasets["train"].column_names)

Map:   0%|          | 0/66613 [00:00<?, ? examples/s]

['text', 'input_ids', 'attention_mask', 'labels']


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,  # Avaliar menos frequentemente
    logging_dir="./logs",
    logging_steps=100,  # Logar menos frequentemente
    learning_rate=5e-5,  # Aumentar taxa de aprendizado
    num_train_epochs=3,  # Reduzir para 3 épocas
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Batch size efetivo = 4
    save_steps=1000,  # Salvar menos frequentemente
    save_total_limit=2,
    fp16=True,
    max_grad_norm=1.0,
    gradient_checkpointing=True,
    remove_unused_columns=True,
)



In [12]:
small_train = split_datasets["train"].select(range(5000))  # 10.000 exemplos
small_test = split_datasets["test"].select(range(500))  # 2.000 exemplos

In [13]:
# # Configurar o Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=split_datasets["train"],
#     eval_dataset=split_datasets["test"],
# )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
)

In [14]:
# Treinar o modelo
trainer.train()

../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [26,0,0], t

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def generate_text(prompt, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.5,  # Controle intermediário de criatividade
        top_k=50,        # Limita a seleção a 50 tokens mais prováveis
        top_p=0.9,       # Nucleus sampling
        repetition_penalty=1.2,  # Penaliza repetições
        do_sample=True   # Ativar amostragem
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Pergunta ajustada
prompt = "Pergunta: Como identificar o sujeito em uma oração?\nResposta:"
response = generate_text(prompt)
print(response)