<a href="https://colab.research.google.com/github/jeramirez169/DataScience_SGS_Classification/blob/main/models/03_tranformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clonar tu repositorio desde GitHub
!git clone https://github.com/jeramirez169/DataScience_SGS_Classification.git
%cd DataScience_SGS_Classification

# Instalar dependencias necesarias
!pip install -q pandas numpy scikit-learn unidecode spacy imbalanced-learn transformers
!python -m spacy download es_core_news_lg


fatal: destination path 'DataScience_SGS_Classification' already exists and is not an empty directory.
/content/DataScience_SGS_Classification
Collecting es-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.8.0/es_core_news_lg-3.8.0-py3-none-any.whl (568.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.0/568.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Ir a la carpeta data
%cd data

# Descomprimir el archivo ZIP
!unzip "Dataset_SGS_clean.zip" -d .

# Regresar al directorio raíz del proyecto
%cd ..


/content/DataScience_SGS_Classification/data
Archive:  Dataset_SGS_clean.zip
replace ./Dataset_SGS_clean.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: /content/DataScience_SGS_Classification


In [None]:
import inspect
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

In [None]:
# Load the cleaned dataset and keep only the two columns used for training,
# renamed to the generic names `label_text` (class name) and `text` (input).
ruta = "data/Dataset_SGS_clean.csv"
column_names = {'Oficina': 'label_text', 'texto_truncado': 'text'}
df = (
    pd.read_csv(ruta, encoding="utf-8")
    .loc[:, list(column_names)]
    .rename(columns=column_names)
)

# Build the class-name <-> integer-id mappings; ids follow the sorted order of
# the distinct class names.
sorted_labels = sorted(df['label_text'].unique())
label2id = dict(zip(sorted_labels, range(len(sorted_labels))))
id2label = dict(enumerate(sorted_labels))
df['label'] = df['label_text'].map(label2id)

print("Etiquetas y códigos:", label2id)
print("Total de muestras:", len(df))




In [None]:
# Stratified 80/20 split on the integer label, with a fixed random_state.
split_options = {
    "test_size": 0.2,
    "stratify": df['label'],
    "random_state": 42,
}
df_train, df_test = train_test_split(df, **split_options)

print(f"Tamaño entrenamiento: {len(df_train)}")
print(f"Tamaño prueba: {len(df_test)}")



In [None]:
# Tokenization
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-uncased"
MAX_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the module tokenizer.

    Sequences are truncated and padded to exactly ``MAX_LEN`` tokens.
    Returns whatever the tokenizer call returns for the batch.
    """
    encode_options = dict(
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    return tokenizer(batch["text"], **encode_options)


# Convert each split to a `Dataset` and tokenize it in batches.
train_ds = Dataset.from_pandas(df_train).map(tokenize, batched=True)
test_ds = Dataset.from_pandas(df_test).map(tokenize, batched=True)

# Expose only these columns, as torch tensors.
cols = ["input_ids", "attention_mask", "label"]
for split_ds in (train_ds, test_ds):
    split_ds.set_format("torch", columns=cols)


In [None]:
# Load the BETO model with a classification head sized to the label set.
NUM_LABELS = len(label2id)

# Label metadata handed to the model configuration.
label_config = {
    "num_labels": NUM_LABELS,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)


In [None]:
# Compute class weights (to counter class imbalance).
# Per-class sample counts of the training split, ordered by label id.
counts = df_train['label'].value_counts().sort_index().to_numpy()

# weight_c = total_samples / (NUM_LABELS * samples_in_class_c)
inverse_frequency = counts.sum() / (NUM_LABELS * counts)
class_weights = torch.tensor(inverse_frequency, dtype=torch.float)
print("Pesos de clase:", class_weights)


In [None]:
# Loss function
def custom_loss(model, inputs, return_outputs=False, num_items_in_batch=None):
    """Class-weighted cross-entropy loss for a classification batch.

    Args:
        model: Callable invoked as ``model(**inputs_without_labels)``; its
            result must expose a ``.logits`` attribute.
        inputs: Mapping of batch tensors that contains a ``"labels"`` entry.
        return_outputs: When True, return ``(loss, outputs)`` instead of only
            the loss.
        num_items_in_batch: Accepted and ignored. Recent ``Trainer`` versions
            pass this keyword to ``compute_loss``; without the parameter the
            call fails with a ``TypeError``.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` when ``return_outputs``
        is True.

    Raises:
        KeyError: If ``inputs`` has no ``"labels"`` entry.
    """
    labels = inputs["labels"]
    # Forward everything except the labels (presumably so the model does not
    # also compute its own, unweighted loss). A new dict is built instead of
    # popping so the caller's batch is left untouched.
    model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
    outputs = model(**model_inputs)
    logits = outputs.logits
    # The weights are moved to the logits' device at call time.
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
    loss = loss_fn(logits, labels)
    return (loss, outputs) if return_outputs else loss


In [None]:
# Metrics
def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and weighted-F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"macro_f1"`` and ``"weighted_f1"``.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    scores = {"accuracy": accuracy_score(labels, predicted)}
    for averaging in ("macro", "weighted"):
        scores[f"{averaging}_f1"] = f1_score(labels, predicted, average=averaging)
    return scores

In [None]:
# Training configuration for the fine-tuning run.
#
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old spelling is
# rejected by recent ones. The dependency is installed unpinned, so use
# whichever name the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="./beto_sgs_80_20",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    # Evaluate and checkpoint on the same schedule so that the best checkpoint
    # can be restored at the end of training.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    # Macro-F1 is a score: higher is better. Stated explicitly rather than
    # relying on the library's default inference.
    greater_is_better=True,
    logging_steps=100,
    warmup_ratio=0.1,
    weight_decay=0.01,
    seed=42,
    **{_eval_strategy_key: "epoch"},
)


In [None]:
# Training
class WeightedLossTrainer(Trainer):
    """Trainer that scores every batch with the notebook's ``custom_loss``.

    Overriding ``compute_loss`` in a subclass replaces the earlier
    ``trainer.compute_loss = custom_loss`` monkeypatch, which breaks on
    Trainer versions that pass ``num_items_in_batch`` to ``compute_loss``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Delegate to ``custom_loss``; ``num_items_in_batch`` is accepted but unused."""
        return custom_loss(model, inputs, return_outputs=return_outputs)


# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; use whichever this version accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# NOTE(review): `eval_dataset` is the test split, and `args` asks for the best
# checkpoint to be restored by macro-F1, so the test set also drives model
# selection. Metrics reported on it afterwards are therefore optimistic;
# consider carving a validation split out of the training data.
trainer = WeightedLossTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()



In [None]:
# Evaluation: run the trainer's evaluation on the test split and print every
# returned metric with four decimals.
metrics_test = trainer.evaluate(test_ds)
print("\nResultados en conjunto de prueba (20%):")
for metric_name, metric_value in metrics_test.items():
    print(f"{metric_name}: {metric_value:.4f}")



In [None]:
# Confusion matrix
preds = trainer.predict(test_ds)
y_true, y_pred = preds.label_ids, preds.predictions.argmax(-1)

# normalize='true' scales each row (true class) to sum to 1.
cm = confusion_matrix(y_true, y_pred, normalize='true')

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap="Blues",
    fmt=".2f",
    xticklabels=id2label.values(),
    yticklabels=id2label.values(),
    ax=ax,
)
ax.set_xlabel("Predicción")
ax.set_ylabel("Etiqueta real")
ax.set_title("Matriz de Confusión - BETO SGS (80/20)")
plt.show()

