### Importamos las librerías

In [1]:
!pip install transformers[torch] torch streamlit datasets scikit-learn matplotlib




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


### Preparar datos de ejemplo

In [3]:
texts = [
     "Los paneles solares son una fuente eficiente de energía renovable.",
     "El carbón sigue siendo una fuente importante de energía en muchos países.",
     "La energía eólica está ganando popularidad en todo el mundo.",
     "Las centrales nucleares son controversiales pero producen energía sin emisiones de CO2.",
     "La biomasa es una forma de energía renovable que utiliza materiales orgánicos.",
 ]
labels = [1, 0, 1, 0, 1]  # 1 para energía renovable, 0 para no renovable



### Crear un dataset de Hugging Face

In [4]:
dataset = Dataset.from_dict({"text": texts, "label": labels})

### Cargar el tokenizador y el modelo

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# c:\BootcampIA\bootcamp\Lib\site-packages\huggingface_hub\file_download.py:142: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\wiston.mazo\.cache\huggingface\hub\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
# To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
#   warnings.warn(message)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 ### Función de preprocesamiento

In [6]:
def tokenize_function(examples):
     return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 5/5 [00:00<00:00, 608.19 examples/s]


### Dividir el dataset

In [7]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

### Función de cálculo de métricas

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Configurar el entrenamiento

In [9]:
training_args = TrainingArguments(
     output_dir="./results",
     num_train_epochs=3,
     per_device_train_batch_size=16,
     per_device_eval_batch_size=64,
     warmup_steps=500,
     weight_decay=0.01,
     logging_dir='./logs',
     logging_steps=10,
     evaluation_strategy="epoch",
 )



### Crear el Trainer

In [10]:
trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_dataset["train"],
     eval_dataset=tokenized_dataset["test"],
     compute_metrics=compute_metrics,
 )

### Entrenar el modelo

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.101567,0.0,0.0,0.0,0.0
2,No log,1.102141,0.0,0.0,0.0,0.0
3,No log,1.103454,0.0,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=3, training_loss=0.5296884377797445, metrics={'train_runtime': 31.8668, 'train_samples_per_second': 0.377, 'train_steps_per_second': 0.094, 'total_flos': 3157332664320.0, 'train_loss': 0.5296884377797445, 'epoch': 3.0})


### Evaluar el modelo

In [12]:
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 1.1034537553787231, 'eval_accuracy': 0.0, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.7599, 'eval_samples_per_second': 1.316, 'eval_steps_per_second': 1.316, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Hacer una predicción

In [21]:
text = "La energía geotérmica aprovecha el calor de la Tierra."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
prediction = torch.argmax(outputs.logits).item()
print(f"Predicción: {'Energía Renovable' if prediction == 1 else 'Energía No Renovable'}")

Predicción: Energía Renovable
