In [1]:
# IMPORTS

import pandas as pd
import numpy as np
import time
import model_utils 

import torch
from torchinfo import summary
import torch.optim as optim
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset, random_split
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CONSTANTS

# --- Optimisation settings (consumed by the training cell) ---
EPOCHS = 3              # number of full passes over the training loader
LEARNING_RATE = 2e-5    # `lr` handed to the Adam optimizer
EPSILON = 1e-8          # `eps` handed to the Adam optimizer

# --- Data pipeline settings ---
BATCH_SIZE = 32         # default `batch_size` of make_data_loader
MAX_LENGTH = 256        # token length the tokenizer pads/truncates every text to
TEST_PCT = 0.1          # default fraction of rows held out for testing

In [3]:
# NORMALIZE AND CONCAT ALL DATASETS

# Each normalizer from model_utils is paired with the files it should read.
# The nesting order below is the order in which the frames are concatenated.
dataset_sources = [
    # Misinfo Dataset
    (model_utils.normalize_misinfo_dataset, [
        "datasets/misinfo-dataset/DataSet_Misinfo_FAKE.csv",
        "datasets/misinfo-dataset/DataSet_Misinfo_TRUE.csv",
        "datasets/misinfo-dataset/EXTRA_RussianPropagandaSubset.csv",
    ]),
    # Fake News Net Dataset
    (model_utils.normalize_fakenewsnet_dataset, [
        "datasets/fakenewsnet-dataset/gossipcop_fake.csv",
        "datasets/fakenewsnet-dataset/gossipcop_real.csv",
        "datasets/fakenewsnet-dataset/politifact_fake.csv",
        "datasets/fakenewsnet-dataset/politifact_real.csv",
    ]),
    # Liar Dataset
    (model_utils.normalize_liar_dataset, [
        "datasets/liar-dataset/train.tsv",
    ]),
]

# One normalized frame per source file, in listing order
normalized_dfs = [
    normalize(source_path)
    for normalize, source_paths in dataset_sources
    for source_path in source_paths
]

# Stack all frames, drop rows whose "text" is missing, then renumber the index
df = (
    pd.concat(normalized_dfs, ignore_index=True)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

In [4]:
# CREATE DATALOADERS

def make_data_loader(df: pd.DataFrame, tokenizer, batch_size: int = BATCH_SIZE, test_pct: float = TEST_PCT,
                     max_length: int = MAX_LENGTH, seed = None) -> tuple:
    """
    Converts a dataframe of text and label columns into training and test dataloaders

    Parameters:
        df: The pandas DataFrame; its "text" column is tokenized and its "label" column is cast to int
        tokenizer: The Hugging-Face tokenizer to be used with our model
        batch_size: The number of input sequences per batch
        test_pct: The fraction of rows (0 <= test_pct < 1) to be reserved for testing
        max_length: The token length every sequence is padded/truncated to
        seed: Optional integer seed; when given, the train/test split and the
            training shuffle order are reproducible. When None the global
            torch RNG is used.

    Returns:
        tuple: (train_dataloader, test_dataloader). Each batch is a triple of
        (input_ids, attention_mask, labels) tensors.

    Raises:
        ValueError: If test_pct is outside [0, 1) or df has no rows
    """

    # A fraction of 1 (or more) would leave nothing to train on
    if not 0.0 <= test_pct < 1.0:
        raise ValueError(f"test_pct must be in [0, 1), got {test_pct}")

    # Grab features and label columns
    X = df["text"].tolist()
    y = df["label"].astype(int).tolist()

    if not X:
        raise ValueError("df contains no rows to build dataloaders from")

    # Tokenize
    encoder = tokenizer(X, truncation=True, return_tensors='pt', padding="max_length", max_length=max_length)

    # Wrap into TensorDataset (labels as int64, the dtype CrossEntropyLoss expects for class indices)
    dataset = TensorDataset(encoder["input_ids"], encoder["attention_mask"], torch.tensor(y, dtype=torch.long))

    # Train/Test Split
    dataset_length = len(dataset)
    test_length = int(test_pct * dataset_length)
    train_length = dataset_length - test_length

    # Only pass a generator when a seed was requested so the default RNG path is untouched otherwise
    rng_kwargs = {} if seed is None else {"generator": torch.Generator().manual_seed(seed)}

    train_data, test_data = random_split(dataset, [train_length, test_length], **rng_kwargs)

    # Build DataLoaders.
    # The training loader reshuffles every epoch; without shuffle=True each
    # epoch replays the exact same batch sequence. The test loader keeps a
    # fixed order because order does not affect the metrics.
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, **rng_kwargs)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    return (train_dataloader, test_dataloader)

In [5]:
# GRAB BERT TOKENIZER AND MODEL INFORMATION

# Tokenizer and classifier are loaded from the same pretrained checkpoint
pretrained_checkpoint = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)

# Two output classes; attention maps and hidden states are not returned
model = BertForSequenceClassification.from_pretrained(
    pretrained_checkpoint,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# INSTANTIATE DATALOADERS

# Tokenize the combined frame once and split it into train/test loaders
# (batch size and test fraction fall back to the function's defaults)
(train_loader, test_loader) = make_data_loader(df=df, tokenizer=tokenizer)

In [7]:
# TRAIN BERT MODEL

# Pick the fastest available backend: CUDA, then Apple-Silicon MPS, then CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Ensure model lives on the selected device and set it in training mode
model.to(device)
model.train()

# Grab our Adam optimizer and cross entropy function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
cross_entropy_loss = nn.CrossEntropyLoss()

# Iterate through all epochs
for epoch in range(1, EPOCHS + 1):
    total_loss = 0.0   # sum of per-sample losses over the epoch
    num_samples = 0    # number of samples seen over the epoch
    start_time = time.time()

    # Iterate through our training batches
    for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Epoch #{epoch} Training"):
        # Move data to device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass.
        # `labels` is deliberately NOT passed: doing so makes the model compute
        # its own loss internally, which was discarded here and then recomputed
        # below — the same work done twice on every step.
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out.logits

        # Calculate loss
        loss = cross_entropy_loss(logits, labels)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Add to total loss; weight the batch mean by batch size so a smaller
        # final batch does not skew the epoch average
        total_loss += loss.item() * labels.size(0)
        num_samples += labels.size(0)

    # Calculate average loss and epoch training time (end_time holds elapsed seconds)
    avg_loss = total_loss / num_samples
    end_time = time.time() - start_time
    print(f"    Average loss: {avg_loss}")
    print(f"    Elapsed time: {end_time}")

Using device: mps


Epoch #1 Training: 100%|██████████| 3358/3358 [1:54:59<00:00,  2.05s/it]  


    Average loss: 0.18475088917051163
    Elapsed time: 6899.2397129535675


Epoch #2 Training: 100%|██████████| 3358/3358 [2:02:00<00:00,  2.18s/it]


    Average loss: 0.12124561308869913
    Elapsed time: 7320.659265041351


Epoch #3 Training: 100%|██████████| 3358/3358 [1:55:26<00:00,  2.06s/it]

    Average loss: 0.08462134258727357
    Elapsed time: 6926.306722164154





In [8]:
# SAVE THE MODEL CONFIG AND TOKENIZER VOCAB

# Model and tokenizer are written to the same directory
finetuned_output_dir = "bert_fakenews_detection_finetuned"

model.save_pretrained(finetuned_output_dir)
tokenizer.save_pretrained(finetuned_output_dir)

('bert_fakenews_detection_finetuned/tokenizer_config.json',
 'bert_fakenews_detection_finetuned/special_tokens_map.json',
 'bert_fakenews_detection_finetuned/vocab.txt',
 'bert_fakenews_detection_finetuned/added_tokens.json')

In [9]:
# METRICS EVALUATION

# Ensure model is on our device and set it in evaluation mode
model.to(device)
model.eval()

# Grab Cross Entropy Loss calculator
cross_entropy_loss = nn.CrossEntropyLoss()

predictions = []
labels_list = []

# Running totals for the held-out loss (previously computed per batch but never used)
eval_loss_sum = 0.0
eval_sample_count = 0

# No gradients are needed for evaluation
with torch.no_grad():
    for input_ids, attention_mask, labels in tqdm(test_loader, desc="Metrics Evaluation"):
        # Move data to device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out.logits

        # Calculate loss and accumulate it weighted by batch size
        loss = cross_entropy_loss(logits, labels)
        eval_loss_sum += loss.item() * labels.size(0)
        eval_sample_count += labels.size(0)

        # Grab predictions (index of the largest logit per row)
        argmax_predictions = torch.argmax(logits, dim=1)

        # Append our predictions to our list; do the same with the label
        predictions.extend(argmax_predictions.tolist())
        labels_list.extend(labels.tolist())

# Compute metrics from labels and predictions
accuracy = accuracy_score(labels_list, predictions)
precision = precision_score(labels_list, predictions, average='macro')
recall = recall_score(labels_list, predictions, average='macro')
f1 = f1_score(labels_list, predictions, average='macro')

# Average per-sample loss on the test split; NaN if the test loader was empty
avg_eval_loss = eval_loss_sum / eval_sample_count if eval_sample_count else float("nan")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Average loss: {avg_eval_loss}")

Metrics Evaluation:   0%|          | 0/374 [00:00<?, ?it/s]

Metrics Evaluation: 100%|██████████| 374/374 [02:37<00:00,  2.37it/s]

Accuracy: 0.933830304045565
Precision: 0.9338177426281415
Recall: 0.9337575183635585
F1 Score: 0.933785950207604





In [10]:
# PREDICT AND MULTIPLE_PREDICT FUNCTIONS

def predict(text: str, max_length = MAX_LENGTH, true_threshold = 0.7) -> tuple[str, float]:
    """
    Classifies one piece of text as "True" or "Fake" using the notebook's `tokenizer`, `model` and `device`.

    Parameters:
        text: The input string to classify
        max_length: The token length the tokenizer pads/truncates the input to
        true_threshold: The minimum probability of class index 1 required for the "True" label;
            anything below it is labelled "Fake"

    Returns:
        tuple[str, float]: (label, probability_true), where label is "True" or "Fake" and
        probability_true is the softmax probability of class index 1
    """

    # Turn the raw string into fixed-length input tensors
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Inference only: evaluation mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        ).logits

    # Softmax over the classes of the single row, then read class index 1
    probability_true = torch.softmax(logits, dim=-1)[0][1].item()

    # The label requires clearing the threshold, not merely being the larger class
    label = "True" if probability_true >= true_threshold else "Fake"

    return label, probability_true

def multiple_predictions(texts: list[str], max_length = MAX_LENGTH, true_threshold = 0.7) -> list[tuple[str, float]]:
    """
    Applies `predict` to each text in turn, printing every result as it is produced.

    Parameters:
        texts: The strings to classify, processed in the given order
        max_length: Forwarded to `predict` as the tokenizer pad/truncate length
        true_threshold: Forwarded to `predict` as the minimum probability for a "True" label

    Returns:
        list[tuple[str, float]]: One (truth_label, probability_it_is_true) tuple per input, in input order
    """
    outcomes = []

    for text in texts:
        # Classify a single text with the shared settings
        label, probability_true = predict(text, max_length=max_length, true_threshold=true_threshold)
        outcomes.append((label, probability_true))

        # Echo the input alongside its prediction
        print(f"Input:\n{text}\n\nPrediction: {label}\nProbability it's true: {probability_true}\n\n")

    return outcomes

In [11]:
# TEST WITH EXAMPLES

# Hand-written statements keyed by the heading they were listed under.
# Dict insertion order is preserved, so flattening yields the same sequence
# as a single flat list. Strings are model inputs and are kept verbatim.
examples_by_heading = {
    "MISINFO": [
        "Actor Ed Helms is set to produce and host a new comedy special titled The Fake News with Ted Nelms for Comedy Central.",
        "A Tribe Called Quest shouted 'Resist! Resist! Resist!' at the Grammy Awards.",
        "William R. Ponsoldt had earned tens of millions of dollars building a string of successful companies.",
        "In the Senate, Cruz co-sponsored the 2013 bipartisan Violence Against Women Act.",
        "Senator Counsel Robert Mueller appeared to turn something in during his investigation into the Trump-Russia scandal.",
    ],
    "MISINFO - RUSSIAN PROPAGANDA": [
        "Ukraine is secretly controlled by a Nazi junta installed by the CIA.",
        "The West provoked Russia's 2022 'special military operation' to seize Ukraine's lithium reserves.",
        "NATO forces are massing in Poland to invade Russia next month.",
        "Russia's economy grew by 25 percent in 2023 thanks solely to sanctions.",
        "Ukraine joined the Council of Europe's anti-corruption body GRECO in 2002.",
    ],
    "FAKENEWSNET - POLITIFACT": [
        "The Eiffel Tower is located in Berlin, Germany.",
        "Mount Everest is the highest mountain above sea level on Earth.",
        "COVID-19 is caused by a bacterium that can be treated with antibiotics.",
        "The United States Supreme Court has nine sitting justices.",
        "The Pacific Ocean is the largest ocean on the planet.",
        "The Great Wall of China is visible from the Moon with the naked eye.",
    ],
    "FAKENEWSNET - GOSSIPCOP": [
        "Taylor Swift secretly married Travis Kelce during a private Hawiian ceremony last weekend.",
        "Beyonce now holds the record for the most career Grammy wins of any artist.",
        "Tom Cruise to reboot The Godfather by playing Michael Corleone himself.",
        "Zendaya served as an executive producer on the 2024 film Challengers.",
        "Chriss Pratt says he plans to live on Mars by 2030.",
        "Dolly Parton helped fund research that led to Moderna's COVID-19 vaccine.",
    ],
    "LIAR - GENERAL MISINFO": [
        "Rubbing garlic on your feet cures the common cold overnight.",
        "An adult human skeleton typically contains 206 bones.",
        "5G towers emit radiation that causes memory loss to humans.",
        "Solar panels can now generate power at night by capturing ambient heat.",
        "UNSECO is the UN agency responsible for education, science and culture.",
        "Taking vitamin D supplements cures influenze within 24 hours.",
    ],
}

# Flatten the groups back into one ordered list of statements
examples = [statement for group in examples_by_heading.values() for statement in group]

multiple_predictions(examples)

Input:
Actor Ed Helms is set to produce and host a new comedy special titled The Fake News with Ted Nelms for Comedy Central.

Prediction: Fake
Probability it's true: 0.3901340365409851


Input:
A Tribe Called Quest shouted 'Resist! Resist! Resist!' at the Grammy Awards.

Prediction: True
Probability it's true: 0.8530953526496887


Input:
William R. Ponsoldt had earned tens of millions of dollars building a string of successful companies.

Prediction: Fake
Probability it's true: 0.6291959881782532


Input:
In the Senate, Cruz co-sponsored the 2013 bipartisan Violence Against Women Act.

Prediction: True
Probability it's true: 0.7113156318664551


Input:
Senator Counsel Robert Mueller appeared to turn something in during his investigation into the Trump-Russia scandal.

Prediction: Fake
Probability it's true: 0.0004938468337059021


Input:
Ukraine is secretly controlled by a Nazi junta installed by the CIA.

Prediction: Fake
Probability it's true: 2.6552506824373268e-05


Input:
The Wes

[('Fake', 0.3901340365409851),
 ('True', 0.8530953526496887),
 ('Fake', 0.6291959881782532),
 ('True', 0.7113156318664551),
 ('Fake', 0.0004938468337059021),
 ('Fake', 2.6552506824373268e-05),
 ('Fake', 4.0099010220728815e-05),
 ('Fake', 9.388518083142117e-05),
 ('Fake', 0.00041635610978119075),
 ('Fake', 0.0003752860939130187),
 ('Fake', 0.0012467927299439907),
 ('True', 0.8555687069892883),
 ('Fake', 0.007726205978542566),
 ('True', 0.8838096857070923),
 ('True', 0.7512489557266235),
 ('Fake', 0.11650975048542023),
 ('True', 0.8306090235710144),
 ('Fake', 0.5853428244590759),
 ('True', 0.9133361577987671),
 ('True', 0.9893688559532166),
 ('Fake', 0.38800209760665894),
 ('Fake', 0.20233440399169922),
 ('True', 0.9819674491882324),
 ('True', 0.9717531800270081),
 ('Fake', 0.25684863328933716),
 ('True', 0.8556768894195557),
 ('Fake', 0.0006377301178872585),
 ('True', 0.9136764407157898)]