In [1]:
!pip install -q transformers datasets accelerate

In [3]:
# ============================================
# 2. Importar librerías
# ============================================
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments)

In [4]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [5]:
# ============================================
# 3. Verificar GPU
# ============================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Dispositivo en uso:", device)
!nvidia-smi | grep "NVIDIA"

✅ Dispositivo en uso: cuda
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |


In [6]:
# ============================================
# 4. Load the dataset from Google Drive
# ============================================
from google.colab import drive
drive.mount('/content/drive')

# Replace with the path to your own file.
path = '/content/drive/MyDrive/dataset_concatenado.csv'

# Read the CSV as UTF-8 first; if decoding fails, retry as latin1.
# Note that the latin1 fallback also skips malformed lines (on_bad_lines='skip').
try:
    df = pd.read_csv(path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(path, encoding='latin1', on_bad_lines='skip')

print("✅ Dataset cargado. Filas:", len(df))
# Fixed: the closing parenthesis of print(...) was missing (SyntaxError).
print(df.head())


Mounted at /content/drive
✅ Dataset cargado. Filas: 59766
                                              titulo  \
0  Moreno intenta apaciguar el flanco sanitario m...   
1  La Abogacía del Estado se retira como acusació...   
2  Las promesas incumplidas de Pablo Echenique en...   
3  Sánchez defiende 'resolver el problema' de la ...   
4  Ian Gibson cierra la lista electoral de la con...   

                                               texto      clase  
0  El presidente abre la puerta a unos comicios e...  verdadera  
1  En un escrito, la abogada del Estado Rosa Marí...  verdadera  
2  Este lunes y martes la Asamblea de Madrid acog...      falsa  
3  Resulta evidente que la ley ha tenido algunos ...  verdadera  
4  El hispanista, que ya ocupó un puesto simbólic...  verdadera  


In [7]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,titulo,texto,clase
0,Moreno intenta apaciguar el flanco sanitario m...,El presidente abre la puerta a unos comicios e...,verdadera
1,La Abogacía del Estado se retira como acusació...,"En un escrito, la abogada del Estado Rosa Marí...",verdadera
2,Las promesas incumplidas de Pablo Echenique en...,Este lunes y martes la Asamblea de Madrid acog...,falsa
3,Sánchez defiende 'resolver el problema' de la ...,Resulta evidente que la ley ha tenido algunos ...,verdadera
4,Ian Gibson cierra la lista electoral de la con...,"El hispanista, que ya ocupó un puesto simbólic...",verdadera


In [13]:
# ============================================
# 5. Prepare the dataset (titulo + texto + clase)
# ============================================
title_col, text_col, label_col = 'titulo', 'texto', 'clase'

# Keep only the three columns of interest, drop rows missing any of them,
# and join headline and body (separated by a space) for more context.
df = (
    df[[title_col, text_col, label_col]]
    .dropna()
    .assign(texto_completo=lambda frame: frame[title_col] + " " + frame[text_col])
)
df.head()


Unnamed: 0,titulo,texto,clase,texto_completo
0,Moreno intenta apaciguar el flanco sanitario m...,El presidente abre la puerta a unos comicios e...,verdadera,Moreno intenta apaciguar el flanco sanitario m...
1,La Abogacía del Estado se retira como acusació...,"En un escrito, la abogada del Estado Rosa Marí...",verdadera,La Abogacía del Estado se retira como acusació...
2,Las promesas incumplidas de Pablo Echenique en...,Este lunes y martes la Asamblea de Madrid acog...,falsa,Las promesas incumplidas de Pablo Echenique en...
3,Sánchez defiende 'resolver el problema' de la ...,Resulta evidente que la ley ha tenido algunos ...,verdadera,Sánchez defiende 'resolver el problema' de la ...
4,Ian Gibson cierra la lista electoral de la con...,"El hispanista, que ya ocupó un puesto simbólic...",verdadera,Ian Gibson cierra la lista electoral de la con...


In [9]:
# Convert the string class names to integer ids. The fitted encoder is kept
# in `le` because its classes_ attribute is read later to size the model head.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df[label_col])
df['label'] = le.transform(df[label_col])


In [10]:
# Hold out 20% of the rows as the test split. The split is stratified on the
# encoded label and uses a fixed random_state.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(
    df,
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)


In [11]:
# Wrap both pandas splits as Hugging Face Dataset objects.
from datasets import Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df)
)

In [12]:
# Summary before tokenizing: rows per class, plus a preview (first 400
# characters) of the first combined title+body text.
class_counts = df[label_col].value_counts()
first_text = df['texto_completo'].iloc[0]

print("✅ Datos listos para tokenización.")
print(class_counts)
print("\nEjemplo de texto combinado:\n")
print(first_text[:400], "...")

✅ Datos listos para tokenización.
clase
verdadera    33348
falsa        23880
Name: count, dtype: int64

Ejemplo de texto combinado:

Moreno intenta apaciguar el flanco sanitario mientras enreda con la fecha de las elecciones El presidente abre la puerta a unos comicios en junio que no sean en domingo. ...


In [15]:
# Authenticate against the Hugging Face Hub. login() is interactive: in this
# notebook it renders a widget (see the cell output) asking for credentials.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# ============================================
# Tokenize the texts
# ============================================
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "bert-base-multilingual-cased"  # supports Spanish
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the combined title+body texts of a batch.

    Sequences are truncated and padded with ``padding="max_length"`` and
    ``max_length=256``.

    Args:
        batch: Mapping whose 'texto_completo' entry holds the texts to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts.
    """
    texts = batch['texto_completo']
    encoding_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(texts, **encoding_options)

# Tokenize both splits, processing 64 examples per call to tokenize().
map_options = dict(batched=True, batch_size=64)
train_tokenized = train_dataset.map(tokenize, **map_options)
test_tokenized = test_dataset.map(tokenize, **map_options)

# Expose only the model inputs and the label, in 'torch' format.
model_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format('torch', columns=model_columns)


Map:   0%|          | 0/45782 [00:00<?, ? examples/s]

Map:   0%|          | 0/11446 [00:00<?, ? examples/s]

In [17]:
# ============================================
# 7. Load the pretrained model
# ============================================
# One output per class discovered by the label encoder.
num_labels = len(le.classes_)
# Move the model to the device selected in the GPU-check cell instead of a
# hard-coded "cuda", so this cell also works on a CPU-only runtime.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
).to(device)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# ============================================
# 8. Define metrics
# ============================================
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    # Imported locally so the function works regardless of whether the
    # notebook cell that imports sklearn.metrics has been executed yet.
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted'),
        'f1': f1_score(labels, preds, average='weighted')
    }

In [20]:
# ============================================
# Configure training
# ============================================
# NOTE(review): no evaluation schedule is configured here and the recorded
# training log only shows training loss — presumably metrics are computed
# only by the explicit trainer.evaluate() call; confirm this is intended.
# NOTE(review): report_to is not set; the recorded run shows an interactive
# wandb API-key prompt when training starts — consider setting it explicitly.
training_args = TrainingArguments(
    output_dir="./fake_news_model",  # where the Trainer writes its outputs
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    learning_rate=2e-5,
    logging_dir='./logs',
)

# Bundle the model, the training configuration, both tokenized splits and the
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,  # the held-out 20% split
    compute_metrics=compute_metrics
)

In [21]:
# ============================================
# Train the model
# ============================================
# Fine-tunes with the settings in training_args; the returned TrainOutput is
# shown as the cell result.
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjazska[0m ([33mjazska-universidad-de-antioquia[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.3751
1000,0.1836
1500,0.169
2000,0.1404
2500,0.1376
3000,0.1084
3500,0.095
4000,0.0926
4500,0.0854
5000,0.0815


TrainOutput(global_step=5724, training_loss=0.13817214999142266, metrics={'train_runtime': 1225.8466, 'train_samples_per_second': 74.695, 'train_steps_per_second': 4.669, 'total_flos': 1.204575033649152e+16, 'train_loss': 0.13817214999142266, 'epoch': 2.0})

In [25]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)
import numpy as np



In [26]:
# ============================================
# Evaluate performance
# ============================================
# Evaluates on the eval_dataset passed to the Trainer. In the recorded output
# the compute_metrics values appear with an "eval_" prefix next to loss/timing.
metrics = trainer.evaluate()
print("📊 Resultados del modelo:", metrics)

📊 Resultados del modelo: {'eval_loss': 0.11220121383666992, 'eval_accuracy': 0.9721300017473353, 'eval_precision': 0.9722115829556264, 'eval_recall': 0.9721300017473353, 'eval_f1': 0.9720830187801806, 'eval_runtime': 36.0066, 'eval_samples_per_second': 317.887, 'eval_steps_per_second': 19.885, 'epoch': 2.0}


In [27]:
# Save the fine-tuned model together with its tokenizer so the directory can
# be reloaded on its own (the Trainer was not given the tokenizer, so it must
# be saved explicitly). The path is defined once and reused in the message.
model_dir = "/content/fake_news_detector_model"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"✅ Modelo guardado en {model_dir}")

✅ Modelo guardado en /content/fake_news_detector_model


In [36]:
from huggingface_hub import login, create_repo, upload_folder

# Authenticate with the Hugging Face Hub (interactive prompt).
login()

repo_id = "jazska/fake-news-detector-es"

# exist_ok=True keeps this cell re-runnable: without it create_repo raises
# an error once the repository already exists on the Hub.
create_repo(repo_id=repo_id, private=False, exist_ok=True)

# Export the model and the tokenizer into the same folder so the uploaded
# repository contains everything needed to load the classifier.
trainer.save_model("modelo_fake_news")
tokenizer.save_pretrained("modelo_fake_news")

# Push the whole folder as a single commit; the returned CommitInfo is shown
# as the cell result.
upload_folder(
    folder_path="modelo_fake_news",
    repo_id=repo_id,
    commit_message="Subida inicial del modelo de detección de fake news"
)




Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ke_news/model.safetensors:   0%|          |  555kB /  711MB            

  ...ke_news/training_args.bin:   3%|3         |   200B / 5.78kB            

CommitInfo(commit_url='https://huggingface.co/jazska/fake-news-detector-es/commit/44f006091406fb7331521a84e2a7176f880bbc12', commit_message='Subida inicial del modelo de detección de fake news', commit_description='', oid='44f006091406fb7331521a84e2a7176f880bbc12', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jazska/fake-news-detector-es', endpoint='https://huggingface.co', repo_type='model', repo_id='jazska/fake-news-detector-es'), pr_revision=None, pr_num=None)