# Bert_fake_news

Notebook para el entrenamiento del modelo BERT para la detección de noticias falsas

In [None]:
!pip install pandas numpy scikit-learn transformers torch imblearn tqdm

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import os

In [None]:
# Load the "fake or real news" CSV. Its first column is read as the index and
# then thrown away when the index is reset.
df = pd.read_csv(
    '../input/fake-or-real-news/fake_or_real_news.csv',
    index_col=[0],
)
df = df.reset_index(drop=True)
df.head()

# Alternative data source (disabled): build the frame from separate fake/true
# CSV files, labelling fake rows 0 and true rows 1, then subsample half of it.
#df_fake=pd.read_csv('/kaggle/input/fake-news-detection/fake.csv').reset_index(drop=True)
#df_fake['label'] = 0
#df_true=pd.read_csv('/kaggle/input/fake-news-detection/true.csv').reset_index(drop=True)
#df_true['label'] = 1

#df = pd.concat([df_fake,df_true], ignore_index=True)
#df = df.sample(frac = 0.5)


In [None]:
# Build the model input: headline and body joined by a newline. The two source
# columns are no longer needed afterwards.
df['all_text'] = df['title'] + "\n" + df['text']
df = df.drop(columns=['title', 'text'])

# Variant for the alternative (disabled) data source, which carries two extra
# columns to discard:
#df['all_text']=df['title']+"\n"+df['text']
#df.drop(['title','text', 'subject', 'date'], axis = 1, inplace=True)

# Replace the label column with integer class ids. The fitted encoder is kept
# so class names can be recovered later via `label_encoder.classes_`.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
df.head()

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the label so both
# splits keep the same class proportions. The seed is fixed so the same rows
# land in each split on every run, which makes reported metrics comparable
# between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['all_text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

In [None]:
# One checkpoint name drives both the tokenizer and the classifier so the two
# cannot drift apart.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

In [None]:
class Dataset(Dataset):
    """Dataset that tokenizes a single text on access and pairs it with its label.

    NOTE: the class reuses the name of the base class it inherits from, so
    after this definition ``Dataset`` refers to this class rather than to the
    imported base.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., padding=..., truncation=..., return_tensors=...)``
            whose result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token tensors and the label tensor for example ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output is flattened to 1-D per field.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Wrap both splits so examples are tokenized lazily, one at a time.
# Note that the "validation" set here is the held-out x_test/y_test split.
train_dataset = Dataset(x_train.tolist(), y_train.tolist(), tokenizer)
val_dataset = Dataset(x_test.tolist(), y_test.tolist(), tokenizer)

# Only the training batches are shuffled. Evaluation results do not depend on
# batch order, so the validation loader iterates in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 5
# Per-epoch mean losses, consumed later for plotting.
train_loss_list, val_loss_list = [], []

for epoch in range(epochs):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Labels are passed in so the model output carries the loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    mean_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1} Train Loss: {mean_train_loss}")
    train_loss_list.append(mean_train_loss)

    # ---- Validation pass: no gradients, collect loss and predictions ----
    model.eval()
    val_loss = 0
    preds, true_pred = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            # Predicted class = index of the largest logit per example.
            batch_preds = outputs.logits.argmax(dim=1)
            preds.extend(batch_preds.cpu().numpy())
            # The batch's own (un-moved) label tensor is used for the ground truth.
            true_pred.extend(batch['labels'].numpy())

    mean_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} Val Loss: {mean_val_loss}")
    val_loss_list.append(mean_val_loss)
    print(classification_report(true_pred, preds, target_names=label_encoder.classes_))
            



    

In [None]:
# Plot the per-epoch training and validation loss curves on shared axes.
epoch_axis = range(1, epochs + 1)
for loss_series, legend_name in (
    (train_loss_list, 'Training Loss'),
    (val_loss_list, 'Validation Loss'),
):
    plt.plot(epoch_axis, loss_series, label=legend_name)

plt.title('Training vs Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Class-id -> class-name lookup, taken from the encoder that produced the
# training labels so it always matches the classifier's output indices
# (the previous hard-coded three-entry sentiment mapping did not fit the
# two-label model).
id2label = dict(enumerate(label_encoder.classes_))


def pred_sent(texto, max_length=512):
    """Classify one text with the fine-tuned model and return its class id.

    Args:
        texto: Raw text to classify.
        max_length: Maximum number of tokens fed to the model; longer inputs
            are truncated. Defaults to 512, the same limit the training
            dataset class uses by default, so inference sees as much of an
            article as training did.

    Returns:
        int: Index of the largest logit. Look it up in ``id2label`` to get the
        class name.
    """
    # Do not rely on an earlier cell having left the model in eval mode:
    # dropout must be off for deterministic predictions.
    model.eval()

    # A single sequence needs no padding, so only truncation is requested.
    inputs = tokenizer(
        texto,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )

    # Move the tensors to the same device as the model (CPU or GPU).
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    return torch.argmax(logits, dim=1).item()


In [None]:
# Smoke test: classify a single headline with pred_sent and print the result.
# NOTE(review): the printed prefix says "Sentimiento", but the value is the
# integer class id returned by pred_sent, not a sentiment.
texto = "Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing"
sentimiento = pred_sent(texto)
print(f"Sentimiento: {sentimiento}")

In [None]:
# Build a small all-fake sample from a second dataset: a random 10% of the
# rows, each labelled 0, with title and body merged the same way as the
# training text.
df_test = (
    pd.read_csv('/kaggle/input/fake-news-detection/fake.csv')
    .reset_index(drop=True)
    .sample(frac=0.1)
    .reset_index(drop=True)
)
df_test = df_test.assign(
    label=0,
    all_text=df_test['title'] + "\n" + df_test['text'],
)[['all_text', 'label']]

df_test

In [None]:
# Predict a class id for every held-out text, in row order. Each value is
# coerced to str before it is handed to pred_sent.
x_test = x_test.reset_index(drop=True)
sent_list = [pred_sent(str(entry)) for entry in x_test]

In [None]:
# Confusion matrix of true labels (rows) against predicted class ids (columns).
cm = confusion_matrix(y_true=y_test, y_pred=sent_list)
disp = ConfusionMatrixDisplay(cm)
disp.plot()

In [None]:
# Output folders for the fine-tuned weights and the tokenizer files.
model_dir = "/kaggle/working/bert_fake_news/model"
tokenizer_dir = "/kaggle/working/bert_fake_news/tokenizer"
for folder in (model_dir, tokenizer_dir):
    # makedirs also creates the shared parent folder, so it needs no call of
    # its own.
    os.makedirs(folder, exist_ok=True)

# Save model and tokenizer. The former `save_embedding_model=False` argument
# was dropped: it is not a `save_pretrained` parameter.
model.save_pretrained(f'{model_dir}/bert_fake_news_model')
tokenizer.save_pretrained(f'{tokenizer_dir}/bert_tokenizer')