# ***Dependencias y bibliotecas***

## ***Dependencias***

In [1]:
!pip install codecarbon
!pip install hf_xet

Collecting codecarbon
  Downloading codecarbon-3.0.1-py3-none-any.whl.metadata (9.1 kB)
Collecting fief-client[cli] (from codecarbon)
  Downloading fief_client-0.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting questionary (from codecarbon)
  Downloading questionary-2.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting rapidfuzz (from codecarbon)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting httpx<0.28.0,>=0.21.3 (from fief-client[cli]->codecarbon)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jwcrypto<2.0.0,>=1.4 (from fief-client[cli]->codecarbon)
  Downloading jwcrypto-1.5.6-py3-none-any.whl.metadata (3.1 kB)
Collecting yaspin (from fief-client[cli]->codecarbon)
  Downloading yaspin-3.1.0-py3-none-any.whl.metadata (14 kB)
Collecting termcolor<2.4.0,>=2.2.0 (from yaspin->fief-client[cli]->codecarbon)
  Downloading termcolor-2.3.0-py3-none-any.whl.metadata (5.3 kB)
Downloading codecarbon

## ***Bibliotecas***

In [8]:
# Datasets
from datasets import load_dataset
import pandas as pd

# NLP
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer, TrainingArguments

# Math
import numpy as np 

# Metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Codecarbon
from codecarbon import EmissionsTracker

# Format
from rich.console import Console
from rich.table import Table

# Kaggle secrets 
from kaggle_secrets import UserSecretsClient

# Hugging face interface 
from huggingface_hub import HfApi, login, create_repo, upload_folder

# ***Carga de datos***

Proporciona ~2k mensajes de chat de partidas de Dota 2 con niveles de toxicidad (0=no tóxico, 1=leve, 2=alto).

In [3]:
ds = load_dataset("dffesalbon/dota-2-toxic-chat-data")
print(ds["train"][0])

train.csv:   0%|          | 0.00/43.3k [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/15.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1722 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/192 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/638 [00:00<?, ? examples/s]

{'message': 'can t win alone', 'target': 0}


# ***Cargando el modelo pre-entrenado***

In [4]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Carga del modelo RoBERTa para clasificación con 3 etiquetas
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3
)

model.config.id2label = {
    0: "non-toxic",
    1: "mild toxicity",
    2: "toxic",
}

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["message"], 
        padding="max_length",
        truncation=True,
        max_length=128
    )
    tokenized["labels"] = examples["target"]  
    return tokenized

tokenized_ds = ds.map(tokenize_function, batched=True)

tokenized_ds = tokenized_ds.remove_columns(["message", "target"])
tokenized_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])  

Map:   0%|          | 0/1722 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

# ***Definición de métricas***

In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),  # F1 promedio
        "report": classification_report(labels, predictions)         # Reporte completo
    }

# ***Entrenamiento***

In [9]:
training_args = TrainingArguments(
    output_dir="./dota2-toxicity-model",
    eval_strategy="epoch",         
    
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    
    learning_rate=2e-5,
    
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    
    num_train_epochs=7,
    
    save_strategy="epoch",          
    save_total_limit=1,
    
    logging_dir="./logs",
    report_to="none"                
)


tracker = EmissionsTracker(
    log_level="critical",  # Evita logs redundantes
    project_name="dota2_toxicity_classification",
    output_dir="."
)

try:
    # Inicia el tracking
    tracker.start()
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        compute_metrics=compute_metrics,
    )
    
    trainer.train()  # ¡Ahora debería funcionar
finally:
    # Detén el tracker y guarda el reporte
    emissions = tracker.stop()



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Report
1,No log,0.593163,0.729167,0.646792,precision recall f1-score support  0 0.90 0.91 0.90 104  1 0.52 0.54 0.53 41  2 0.52 0.49 0.51 47  accuracy 0.73 192  macro avg 0.65 0.65 0.65 192 weighted avg 0.73 0.73 0.73 192
2,No log,0.532727,0.755208,0.697756,precision recall f1-score support  0 0.94 0.87 0.90 104  1 0.54 0.73 0.62 41  2 0.62 0.53 0.57 47  accuracy 0.76 192  macro avg 0.70 0.71 0.70 192 weighted avg 0.78 0.76 0.76 192
3,No log,0.490796,0.776042,0.703246,precision recall f1-score support  0 0.92 0.93 0.93 104  1 0.57 0.73 0.64 41  2 0.65 0.47 0.54 47  accuracy 0.78 192  macro avg 0.71 0.71 0.70 192 weighted avg 0.78 0.78 0.77 192
4,No log,0.435335,0.817708,0.763088,precision recall f1-score support  0 0.95 0.93 0.94 104  1 0.65 0.68 0.67 41  2 0.68 0.68 0.68 47  accuracy 0.82 192  macro avg 0.76 0.77 0.76 192 weighted avg 0.82 0.82 0.82 192
5,No log,0.500135,0.833333,0.779991,precision recall f1-score support  0 0.94 0.95 0.95 104  1 0.68 0.68 0.68 41  2 0.72 0.70 0.71 47  accuracy 0.83 192  macro avg 0.78 0.78 0.78 192 weighted avg 0.83 0.83 0.83 192
6,No log,0.499675,0.822917,0.766731,precision recall f1-score support  0 0.95 0.93 0.94 104  1 0.71 0.59 0.64 41  2 0.66 0.79 0.72 47  accuracy 0.82 192  macro avg 0.77 0.77 0.77 192 weighted avg 0.83 0.82 0.82 192
7,No log,0.524036,0.828125,0.777669,precision recall f1-score support  0 0.95 0.93 0.94 104  1 0.68 0.68 0.68 41  2 0.69 0.72 0.71 47  accuracy 0.83 192  macro avg 0.78 0.78 0.78 192 weighted avg 0.83 0.83 0.83 192




In [10]:
df = pd.read_csv("emissions.csv")

# Extrae un solo registro (el último)
row = df.iloc[-1]

# 3) Compute additional metrics (convert kg to g, kWh to Wh)
emissions_g = row['emissions'] * 1000                      # kg CO2e -> g CO2e
energy_consumed_wh = row['energy_consumed'] * 1000         # kWh -> Wh
cpu_energy_wh = row['cpu_energy'] * 1000                   # kWh -> Wh
gpu_energy_wh = row['gpu_energy'] * 1000                   # kWh -> Wh
ram_energy_wh = row['ram_energy'] * 1000                   # kWh -> Wh

# 4) Build and display the Rich table
table = Table(title="💨 CodeCarbon Emissions & Resource Report")

table.add_column("Metric", style="cyan", no_wrap=True)
table.add_column("Value", style="magenta", justify="right")

table.add_row("Duration (s)", f"{row['duration']:.1f}")
table.add_row("CPU Energy (Wh)", f"{cpu_energy_wh:.2f}")
table.add_row("GPU Energy (Wh)", f"{gpu_energy_wh:.2f}")
table.add_row("RAM Energy (Wh)", f"{ram_energy_wh:.2f}")
table.add_row("Total Energy (Wh)", f"{energy_consumed_wh:.2f}")
table.add_row("Emissions (g CO₂e)", f"{emissions_g:.2f}")
table.add_row("Emissions Rate (kg CO₂e/kWh)", f"{row['emissions_rate']:.4f}")
table.add_row("Tracking Mode", row['tracking_mode'])
table.add_row("CPU Model", row['cpu_model'])
table.add_row("CPU Count", str(int(row['cpu_count'])))
table.add_row("GPU Model", row['gpu_model'] or "N/A")
table.add_row("GPU Count", str(int(row['gpu_count'])))
table.add_row("RAM Total (GB)", f"{row['ram_total_size'] / 1024:.2f}")
table.add_row("Region", row['region'])
table.add_row("PUE", f"{row['pue']:.2f}")

console = Console()
console.print(table)

# ***Evaluación***

In [11]:
# Evaluación final:
metrics = trainer.evaluate(tokenized_ds["test"])
console = Console()
table = Table(title="📊 Métricas de Evaluación - Toxicidad en Dota 2", show_header=True, header_style="bold magenta")

# Columnas
table.add_column("Métrica", style="cyan", width=20)
table.add_column("Valor", style="green", justify="right")

# Agregar filas
table.add_row("Pérdida (Loss)", f"{metrics['eval_loss']:.4f}")
table.add_row("Accuracy", f"{metrics['eval_accuracy']:.2%}")
table.add_row("F1 Macro", f"{metrics['eval_f1_macro']:.2%}")

# Imprimir tabla
console.print(table)

# Imprimir el reporte de clasificación con formato
console.print("\n[bold]📝 Reporte de Clasificación:[/bold]")
console.print(metrics["eval_report"])



# ***Aplicación***

In [12]:
classifier = pipeline(
    "text-classification", 
    model=model,
    tokenizer=tokenizer,
    function_to_apply="softmax",  # Para probabilidades
)

Device set to use cuda:0


In [13]:
chat_message = "You're a noob, uninstall the game!"

# Predecir
pred = classifier(chat_message)

table = Table(title=f'Resultados de Clasificación para "{chat_message}"')
table.add_column("Etiqueta", style="cyan", justify="left")
table.add_column("Probabilidad", style="magenta", justify="right")

for entry in pred:
    label = entry["label"]
    score = entry["score"]
    # Resalta la etiqueta con mayor probabilidad
    if entry == max(pred, key=lambda x: x["score"]):
        table.add_row(f"[bold]{label}[/bold]", f"[bold]{score:.4f}[/bold]")
    else:
        table.add_row(label, f"{score:.2f}")

# Mostrar tabla
console = Console()
console.print(table)

# ***App in Huggin Face***

## ***Guardamos el modelo***

In [14]:
trainer.save_model("dota2-toxic-detector")

## ***Subimos el modelo a HF***

In [15]:
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_PLN")  # Usa el nombre de tu secreto
login(token=hf_token)

In [16]:
!huggingface-cli login --token {hf_token}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `PLN_class` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `PLN_class`


In [18]:
model.push_to_hub("fwgalde/dota2-toxic-detector")
tokenizer.push_to_hub("fwgalde/dota2-toxic-detector")

README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/fwgalde/dota2-toxic-detector/commit/8f2e2eda4b93109e0027640589aed0d6ee376376', commit_message='Upload tokenizer', commit_description='', oid='8f2e2eda4b93109e0027640589aed0d6ee376376', pr_url=None, repo_url=RepoUrl('https://huggingface.co/fwgalde/dota2-toxic-detector', endpoint='https://huggingface.co', repo_type='model', repo_id='fwgalde/dota2-toxic-detector'), pr_revision=None, pr_num=None)

In [19]:
api = HfApi()
api.upload_folder(
  folder_path="dota2-toxic-detector",
  repo_id="fwgalde/dota2-toxic-detector",
  repo_type="model",
  create_pr=True
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/fwgalde/dota2-toxic-detector/commit/676b8ae3eeb46f32896a1cbafe4e82e3b577257d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='676b8ae3eeb46f32896a1cbafe4e82e3b577257d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/fwgalde/dota2-toxic-detector', endpoint='https://huggingface.co', repo_type='model', repo_id='fwgalde/dota2-toxic-detector'), pr_revision=None, pr_num=None)