In [1]:
import pandas as pd
import torch
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
gossipcop_fake = pd.read_csv("../dataset/gossipcop_fake.csv")
gossipcop_fake['label'] = 1

gossipcop_real = pd.read_csv("../dataset/gossipcop_real.csv")
gossipcop_real['label'] = 0

politifact_fake = pd.read_csv("../dataset/politifact_fake.csv")
politifact_fake['label'] = 1

politifact_real = pd.read_csv("../dataset/politifact_real.csv")
politifact_real['label'] = 0

In [3]:
os.makedirs("./results/gossipcop", exist_ok=True)
os.makedirs("./results/politifact", exist_ok=True)
os.makedirs("./results/combined", exist_ok=True)
os.makedirs("./logs", exist_ok=True)

In [4]:
df_gossipcop = pd.concat([gossipcop_fake, gossipcop_real], ignore_index=True)
df_politifact = pd.concat([politifact_fake, politifact_real], ignore_index=True)
df_combined = pd.concat([gossipcop_fake, gossipcop_real, politifact_fake, politifact_real], ignore_index=True)

df_gossipcop = df_gossipcop[['title', 'label']]
df_politifact = df_politifact[['title', 'label']]
df_combined = df_combined[['title', 'label']]

# Save the combined dataset
df_gossipcop.to_csv("gossipcop_dataset.csv", index=False)
df_politifact.to_csv("politifact_dataset.csv", index=False)
df_combined.to_csv("fakenews_combined.csv", index=False)

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
from torch.utils.data import Dataset
import torch

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [5]:
def prepare_datasets(df, tokenizer):
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['title'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
    )
    
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)
    
    train_dataset = NewsDataset(train_encodings, train_labels)
    val_dataset = NewsDataset(val_encodings, val_labels)
    
    return train_dataset, val_dataset


In [8]:
def train_model(train_dataset, val_dataset, output_dir, model_name):
    # Initialize model
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    
    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir="./logs",
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        report_to="none"  # Disable reporting to avoid wandb etc.
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    
    # Train model
    print(f"Training model: {model_name}")
    trainer.train()
    
    # Save model
    model.save_pretrained(f"{output_dir}/{model_name}")
    tokenizer.save_pretrained(f"{output_dir}/{model_name}")
    
    # Evaluate model
    eval_result = trainer.evaluate()
    print(f"Evaluation results for {model_name}:")
    print(eval_result)
    
    return model, eval_result

In [9]:
# Prepare datasets
print("Preparing GossipCop datasets...")
gossipcop_train_dataset, gossipcop_val_dataset = prepare_datasets(df_gossipcop)

print("Preparing PolitiFact datasets...")
politifact_train_dataset, politifact_val_dataset = prepare_datasets(df_politifact)

print("Preparing combined datasets...")
combined_train_dataset, combined_val_dataset = prepare_datasets(df_combined)

Preparing GossipCop datasets...
Preparing PolitiFact datasets...
Preparing combined datasets...


In [10]:
gossipcop_model, gossipcop_results = train_model(
    gossipcop_train_dataset, 
    gossipcop_val_dataset, 
    "./results/gossipcop", 
    "gossipcop_model"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model: gossipcop_model


Epoch,Training Loss,Validation Loss
1,0.3429,0.381411
2,0.2736,0.476984
3,0.1719,0.553201


Evaluation results for gossipcop_model:
{'eval_loss': 0.3814108967781067, 'eval_runtime': 167.9603, 'eval_samples_per_second': 26.363, 'eval_steps_per_second': 3.298, 'epoch': 3.0}


In [11]:
politifact_model, politifact_results = train_model(
    politifact_train_dataset, 
    politifact_val_dataset, 
    "./results/politifact", 
    "politifact_model"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model: politifact_model


Epoch,Training Loss,Validation Loss
1,0.4295,0.39925
2,0.1993,0.572944
3,0.0459,0.644151


Evaluation results for politifact_model:
{'eval_loss': 0.3992496132850647, 'eval_runtime': 8.7794, 'eval_samples_per_second': 24.148, 'eval_steps_per_second': 3.075, 'epoch': 3.0}


In [12]:
combined_model, combined_results = train_model(
    combined_train_dataset, 
    combined_val_dataset, 
    "./results/combined", 
    "combined_model"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model: combined_model


Epoch,Training Loss,Validation Loss
1,0.4249,0.388743
2,0.3611,0.400736
3,0.2318,0.509931


Evaluation results for combined_model:
{'eval_loss': 0.388743132352829, 'eval_runtime': 138.0045, 'eval_samples_per_second': 33.622, 'eval_steps_per_second': 4.203, 'epoch': 3.0}


In [13]:
results = {
    "GossipCop Model": gossipcop_results,
    "PolitiFact Model": politifact_results,
    "Combined Model": combined_results
}

print("\nSummary of training results:")
for model_name, result in results.items():
    print(f"{model_name}: Loss = {result['eval_loss']:.4f}")

print("\nTraining complete. Models saved in the following directories:")
print("- GossipCop model: ./results/gossipcop/gossipcop_model")
print("- PolitiFact model: ./results/politifact/politifact_model")
print("- Combined model: ./results/combined/combined_model")


Summary of training results:
GossipCop Model: Loss = 0.3814
PolitiFact Model: Loss = 0.3992
Combined Model: Loss = 0.3887

Training complete. Models saved in the following directories:
- GossipCop model: ./results/gossipcop/gossipcop_model
- PolitiFact model: ./results/politifact/politifact_model
- Combined model: ./results/combined/combined_model


In [1]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [3]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer

politifact_model = BertForSequenceClassification.from_pretrained("./results/politifact/politifact_model")
politifact_tokenizer = BertTokenizer.from_pretrained("./results/politifact/politifact_model")

Gossipcop_model = BertForSequenceClassification.from_pretrained("./results/gossipcop/gossipcop_model")
Gossipcop_tokenizer = BertTokenizer.from_pretrained("./results/gossipcop/gossipcop_model")

combined_model = BertForSequenceClassification.from_pretrained("./results/combined/combined_model")
combined_tokenizer = BertTokenizer.from_pretrained("./results/combined/combined_model")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import pandas as pd 
from sklearn.model_selection import train_test_split

politifact_df = pd.read_csv("../dataset/politifact_dataset.csv")
_, val_dataset = prepare_datasets(politifact_df, politifact_tokenizer)

gossipcop_df = pd.read_csv("../dataset/gossipcop_dataset.csv")
_, val_dataset = prepare_datasets(gossipcop_df, Gossipcop_tokenizer)

combined_df = pd.read_csv("../dataset/combined.csv")
_, val_dataset = prepare_datasets(combined_df, combined_tokenizer)

In [8]:

politifact_trainer = Trainer(
    model=politifact_model,
    tokenizer=politifact_tokenizer
)

predictions = politifact_trainer.predict(val_dataset)
metrics = compute_metrics(predictions)
print(metrics)

combined_trainer = Trainer(
    model=combined_model,
    tokenizer=combined_tokenizer
)

predictions = combined_trainer.predict(val_dataset)
metrics = compute_metrics(predictions)
print(metrics)

Gossipcop_trainer = Trainer(
    model=Gossipcop_model,
    tokenizer=Gossipcop_tokenizer
)

predictions = Gossipcop_trainer.predict(val_dataset)
metrics = compute_metrics(predictions)
print(metrics)


  politifact_trainer = Trainer(


{'accuracy': 0.2959051724137931, 'f1': 0.40263302249040045, 'precision': 0.2548021291367739, 'recall': 0.9590592334494773}


  combined_trainer = Trainer(


{'accuracy': 0.8424568965517242, 'f1': 0.6663623916020082, 'precision': 0.6999041227229147, 'recall': 0.6358885017421603}


  Gossipcop_trainer = Trainer(


{'accuracy': 0.8752155172413794, 'f1': 0.7106446776611695, 'precision': 0.8335287221570926, 'recall': 0.6193379790940766}
