In [5]:
!pip  install mlflow evaluate



In [6]:
import os

import mlflow
from google.colab import drive

# Mount Google Drive; the MLflow file store configured below lives under it.
drive.mount("/content/drive")

# File-based MLflow tracking store (directory is created on first use).
TRACKING_DIR = "/content/drive/MyDrive/class_runs"
os.makedirs(TRACKING_DIR, exist_ok=True)
mlflow.set_tracking_uri("file:" + TRACKING_DIR)

# Kept as the last expression so the cell displays the selected Experiment.
mlflow.set_experiment("task_rhetorical_sections_class")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<Experiment: artifact_location='file:///content/drive/MyDrive/class_runs/218580141895279385', creation_time=1772033202453, experiment_id='218580141895279385', last_update_time=1772033202453, lifecycle_stage='active', name='task_rhetorical_sections_class', tags={}, workspace='default'>

In [9]:
import pandas as pd

# Closed label set for the task; the list order defines the integer class ids.
LABELS = ["INTRO","BACK","METH","RESU","DISC","CONTR","LIM","CONC"]
label2id = {label: idx for idx, label in enumerate(LABELS)}
id2label = {idx: label for label, idx in label2id.items()}

# Adjust these paths (upload the parquet files to Colab or read them from Drive).
train_path = "/content/task1_retorica.parquet"
gold_path  = "/content/task1_gold.parquet"


def _prepare_split(frame, split_name):
    """Return a cleaned copy of `frame` restricted to LABELS, with a `label_id` column.

    Labels are converted to str, stripped of surrounding whitespace and the
    alias "RES" is normalised to "RESU" *before* filtering, so such rows are
    kept instead of being silently discarded. Rows whose label is still
    outside LABELS are dropped and the number of dropped rows is printed.
    The input frame is not modified.
    """
    cleaned = frame.copy()
    cleaned["label"] = (
        cleaned["label"].astype(str).str.strip().replace({"RES": "RESU"})
    )
    kept = cleaned[cleaned["label"].isin(LABELS)].copy()
    n_dropped = len(cleaned) - len(kept)
    if n_dropped:
        print(f"{split_name}: dropped {n_dropped} rows with labels outside LABELS")
    # Every remaining label is in label2id, so the mapping has no missing values.
    kept["label_id"] = kept["label"].map(label2id).astype(int)
    return kept


# Raw frames are kept under their own names so re-running the cleaning step
# never depends on a previously filtered frame.
train_raw = pd.read_parquet(train_path)
gold_raw  = pd.read_parquet(gold_path)

print(train_raw.columns)

train = _prepare_split(train_raw, "train")
gold  = _prepare_split(gold_raw, "gold")

print("train rows:", len(train))
print("gold rows:", len(gold))
print("train label counts:\n", train["label"].value_counts())
print("gold label counts:\n", gold["label"].value_counts())

Index(['chunk_id', 'doc_id', 'source_path', 'label', 'heading', 'n_words',
       'text'],
      dtype='object')
train rows: 17516
gold rows: 800
train label counts:
 label
INTRO    3000
CONC     3000
BACK     3000
RESU     3000
METH     3000
DISC     1524
CONTR     544
LIM       448
Name: count, dtype: int64
gold label counts:
 label
BACK     254
RESU     199
METH     145
DISC     107
INTRO     45
CONC      35
CONTR     11
LIM        4
Name: count, dtype: int64


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def train_eval_sklearn(X_train, y_train, X_eval, y_eval, labels=None, run_name="tfidf_linearsvc"):
    """Train a TF-IDF + LinearSVC pipeline, evaluate it and log it to a single MLflow run.

    Parameters
    ----------
    X_train, y_train : training texts and their labels.
    X_eval, y_eval : evaluation texts and their labels.
    labels : optional list of class labels used for the report and the
        confusion matrix. Defaults to the classes the classifier saw during
        training (in the classifier's own order).
    run_name : name given to the MLflow run.

    Returns
    -------
    The fitted scikit-learn Pipeline.

    Notes
    -----
    Parameters, metrics, evaluation artifacts and the model are all logged
    inside the same run, so they can be compared together in the MLflow UI.
    """
    import mlflow
    import mlflow.sklearn

    max_features = 200000
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=max_features)),
        ("clf", LinearSVC()),
    ])

    with mlflow.start_run(run_name=run_name):
        model.fit(X_train, y_train)
        preds = model.predict(X_eval)

        if labels is None:
            labels = model.named_steps["clf"].classes_.tolist()

        # `labels` is a required argument of log_eval_to_mlflow and the
        # parameter dict is passed through its `params` keyword.
        log_eval_to_mlflow(
            y_eval, preds,
            labels=labels,
            run_name=run_name,
            params={"model": "tfidf_linearsvc", "max_features": max_features, "ngram": "1-2"},
        )

        mlflow.sklearn.log_model(model, artifact_path="model")

    return model

In [15]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def log_eval_to_mlflow(y_true, y_pred, labels, run_name, params=None, artifacts_prefix=""):
    """Compute classification metrics and log them through the MLflow fluent API.

    Parameters
    ----------
    y_true, y_pred : true and predicted labels, in the same order.
    labels : iterable of class labels; fixes the row/column order of the
        confusion matrix and the classes included in the report.
    run_name : accepted for interface compatibility; it is not used here.
        The caller is expected to open the run (e.g. `mlflow.start_run`).
    params : optional dict logged with `mlflow.log_params`.
    artifacts_prefix : string prepended to every artifact path/file name.

    Logged items
    ------------
    - tag ``task``
    - metrics ``accuracy``, ``macro_f1``, ``weighted_f1``
    - ``per_class_f1.json`` and ``classification_report.json``
    - ``confusion_matrix.csv`` under ``<artifacts_prefix>artifacts``
    """
    import os
    import tempfile

    import mlflow

    labels = list(labels)

    report = classification_report(y_true, y_pred, labels=labels, output_dict=True, zero_division=0)
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = report["macro avg"]["f1-score"]
    weighted_f1 = report["weighted avg"]["f1-score"]

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)

    mlflow.set_tag("task", "task1_rhetorical_sections")

    if params:
        mlflow.log_params(params)

    mlflow.log_metric("accuracy", float(acc))
    mlflow.log_metric("macro_f1", float(macro_f1))
    mlflow.log_metric("weighted_f1", float(weighted_f1))

    # classification_report keys its per-class entries by str(label), so the
    # lookup must go through str() to also work with integer class ids.
    per_class_f1 = {str(lab): float(report[str(lab)]["f1-score"]) for lab in labels}
    mlflow.log_dict(per_class_f1, f"{artifacts_prefix}per_class_f1.json")

    # Write the CSV into a throw-away directory so nothing is left behind in
    # the working directory; the logged artifact keeps the same file name.
    with tempfile.TemporaryDirectory() as tmp_dir:
        cm_path = os.path.join(tmp_dir, "confusion_matrix.csv")
        cm_df.to_csv(cm_path, index=True)
        mlflow.log_artifact(cm_path, artifact_path=f"{artifacts_prefix}artifacts")

    # Optional: the full report, including the averaged rows.
    mlflow.log_dict(report, f"{artifacts_prefix}classification_report.json")

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import mlflow
import mlflow.sklearn

# Inputs: raw text (coerced to str) and the string labels of each split.
X_train = train["text"].astype(str)
y_train = train["label"]
X_gold = gold["text"].astype(str)
y_gold = gold["label"]

# Baseline pipeline: word uni/bi-gram TF-IDF features fed to a linear SVM.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=200000,
    strip_accents="unicode",
    lowercase=True,
)
model = Pipeline(steps=[("tfidf", tfidf_step), ("clf", LinearSVC())])

run_params = {"model": "tfidf_linearsvc", "max_features": 200000, "ngram": "1-2"}

# Fit, evaluate on the gold split and log metrics + model inside one run.
with mlflow.start_run(run_name="tfidf_linearsvc"):
    model.fit(X_train, y_train)
    preds = model.predict(X_gold)

    log_eval_to_mlflow(
        y_gold,
        preds,
        LABELS,
        "tfidf_linearsvc",
        params=run_params,
    )

    mlflow.sklearn.log_model(model, artifact_path="model")



In [10]:
import mlflow
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer,
    DataCollatorWithPadding
)
import evaluate

# Metric objects from the `evaluate` library, loaded once here and reused by
# compute_metrics on every evaluation call.
metric_f1 = evaluate.load("f1")
metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Convert a (logits, labels) pair into accuracy, macro-F1 and weighted-F1.

    The predicted class of each example is the argmax over the last axis of
    the logits. Returns a dict with the keys "accuracy", "macro_f1" and
    "weighted_f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # Both metric objects take the same prediction/reference pair.
    shared = {"predictions": predicted_ids, "references": labels}

    accuracy = metric_acc.compute(**shared)["accuracy"]
    macro = metric_f1.compute(average="macro", **shared)["f1"]
    weighted = metric_f1.compute(average="weighted", **shared)["f1"]

    return {"accuracy": accuracy, "macro_f1": macro, "weighted_f1": weighted}

def train_transformer(model_name, run_name, max_length=256, lr=2e-5, epochs=2, batch_size=16,
                      train_df=None, eval_df=None, seed=42):
    """Fine-tune a sequence-classification transformer and log the run to MLflow.

    Parameters
    ----------
    model_name : Hugging Face model id passed to the tokenizer/model loaders.
    run_name : MLflow run name; also used to build the local output and
        export directory names (``./out_<run_name>``, ``./export_<run_name>``).
    max_length : truncation length used when tokenizing.
    lr, epochs, batch_size : optimisation settings passed to TrainingArguments
        (the batch size is used for both training and evaluation).
    train_df, eval_df : optional DataFrames with ``text`` and ``label_id``
        columns. They default to the notebook-level ``train`` and ``gold``
        frames, which keeps existing calls working unchanged.
    seed : random seed, applied before the model is instantiated (so the
        newly created classification head is seeded too) and passed on to
        TrainingArguments.

    Returns
    -------
    The metrics dict produced by ``trainer.evaluate()``.

    Notes
    -----
    Metrics are logged before the exported model is uploaded, so a failure
    while copying the (large) model artifacts does not lose the evaluation
    results of a finished training run.
    """
    from transformers import set_seed

    train_df = train if train_df is None else train_df
    eval_df = gold if eval_df is None else eval_df

    # Seed before from_pretrained: weights missing from the checkpoint are
    # initialised at load time, i.e. before the Trainer exists.
    set_seed(seed)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def tok(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_length)

    def to_dataset(frame):
        """Build a tokenized Dataset holding only the columns the Trainer is given."""
        columns = (
            frame[["text", "label_id"]]
            .rename(columns={"label_id": "labels"})
            # Same coercion as the TF-IDF baseline: the tokenizer needs str input.
            .assign(text=lambda df: df["text"].astype(str))
        )
        dataset = Dataset.from_pandas(columns).map(tok, batched=True)
        # Keep only these columns; everything else (raw text, pandas index,
        # any other tokenizer output) is removed.
        wanted = {"input_ids", "attention_mask", "labels"}
        return dataset.remove_columns([c for c in dataset.column_names if c not in wanted])

    ds_train = to_dataset(train_df)
    ds_eval = to_dataset(eval_df)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(LABELS),
        label2id=label2id,
        id2label=id2label
    )

    export_dir = f"./export_{run_name}"

    args = TrainingArguments(
        output_dir=f"./out_{run_name}",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        save_strategy="no",
        logging_steps=50,
        seed=seed,
        report_to=[],  # we log to MLflow ourselves
    )

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params({
            "model": "transformer",
            "model_name": model_name,
            "max_length": max_length,
            "lr": lr,
            "epochs": epochs,
            "batch_size": batch_size,
            "seed": seed,
        })

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds_train,
            eval_dataset=ds_eval,
            data_collator=data_collator,
            processing_class=tokenizer,   # replaces the older `tokenizer=` argument
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()

        # Raw Trainer metrics (keys carry the "eval_" prefix).
        for key, value in metrics.items():
            if isinstance(value, (int, float)):
                mlflow.log_metric(key, float(value))

        # Same values under un-prefixed names, matching the keys logged by
        # the scikit-learn baseline so runs are comparable side by side.
        for short_name in ("accuracy", "macro_f1", "weighted_f1"):
            prefixed = f"eval_{short_name}"
            if prefixed in metrics:
                mlflow.log_metric(short_name, float(metrics[prefixed]))

        # Export and upload last: this is the slow, failure-prone step.
        trainer.save_model(export_dir)
        tokenizer.save_pretrained(export_dir)
        mlflow.log_artifacts(local_dir=export_dir, artifact_path="model")

    return metrics

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Upload an already exported model directory as the "model" artifact of a new
# MLflow run, without retraining.
export_path = "/content/bert_spanish_wwm"
run_name = "bert_spanish_wwm"

# Check the directory first so a wrong path does not leave an empty run behind.
if not os.path.isdir(export_path):
    raise FileNotFoundError(f"Export directory not found: {export_path}")

with mlflow.start_run(run_name=run_name):
    mlflow.log_artifacts(local_dir=export_path, artifact_path="model")


In [12]:
# Model A (RoBERTa BNE).
# The hyperparameters are spelled out explicitly for the record; they are the
# same values train_transformer uses by default.
train_transformer(
    model_name="Buzzeitor/roberta-base-bne",
    run_name="roberta_bne",
    max_length=256,
    lr=2e-5,
    epochs=2,
    batch_size=16
)



config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Map:   0%|          | 0/17516 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: Buzzeitor/roberta-base-bne
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.decoder.weight          | UNEXPECTED | 
lm_head.decoder.bias            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Step,Training Loss
50,1.895636
100,1.759675
150,1.668595
200,1.615359
250,1.585815
300,1.539
350,1.516781
400,1.511824
450,1.451426
500,1.397082


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Model B (BERT Spanish WWM).
# The hyperparameters are spelled out explicitly for the record; they are the
# same values train_transformer uses by default.
train_transformer(
    model_name="dccuchile/bert-base-spanish-wwm-cased",
    run_name="bert_spanish_wwm",
    max_length=256,
    lr=2e-5,
    epochs=2,
    batch_size=16
)

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

Map:   0%|          | 0/17516 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: dccuchile/bert-base-spanish-wwm-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.decoder.bias               | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
bert.pooler.dense.weight                   | MISSING    | 
bert.pooler.dense.bias                     | MISSING    | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not 

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Step,Training Loss
50,1.843277
100,1.738343
150,1.659067
200,1.620247
250,1.579425
300,1.516345
350,1.504085
400,1.520646
450,1.448704
500,1.39415


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
import os, torch

# (Optional) With more than one GPU, expose only device 0 to this process.
# NOTE(review): presumably this only has an effect if it is set before CUDA is
# first initialised in the kernel — confirm, or move it to the top of the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

if device.type == "cuda":
    gpu_props = torch.cuda.get_device_properties(0)
    print("CUDA:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))
    print("VRAM (GB):", round(gpu_props.total_memory / 1e9, 2))
    # Optional matmul speed-up. Feature-detect the API instead of wrapping the
    # call in a catch-all `except`, so genuine errors are not hidden.
    if hasattr(torch, "set_float32_matmul_precision"):
        torch.set_float32_matmul_precision("high")
# Model C (SciBETO large).
# The hyperparameters are spelled out explicitly for the record; they are the
# same values train_transformer uses by default.
train_transformer(
    model_name="Flaglab/SciBETO-large",
    run_name="scibeto_large",
    max_length=256,
    lr=2e-5,
    epochs=2,
    batch_size=16
)

Device: cuda
CUDA: 12.8
GPU: Tesla T4
VRAM (GB): 15.64


config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/378 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Map:   0%|          | 0/17516 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: Flaglab/SciBETO-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Step,Training Loss
50,1.861913
100,1.632993
150,1.516624
200,1.404205
250,1.398626
300,1.33624
350,1.314664
400,1.250153
450,1.162264
500,1.158689


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]