In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
%lsmagic

Available line magics:
%alias  %alias_magic  %autoawait  %autocall  %automagic  %autosave  %bookmark  %cat  %cd  %clear  %code_wrap  %colors  %conda  %config  %connect_info  %cp  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %lf  %lk  %ll  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %lx  %macro  %magic  %mamba  %man  %matplotlib  %micromamba  %mkdir  %more  %mv  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %pip  %popd  %pprint  %precision  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %rm  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %uv  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%code_wrap  %%debug  %%file  %%html  %%javascript  %%

In [3]:
torch.cuda.is_available()

True

## Step 2: Prepare Data

### Sample Data

In [4]:
training_data = [
    # Factual claims (label = 1)
    ("John Smith was elected mayor in 2020", 1),
    ("The company reported $2 million in revenue", 1),
    ("She graduated from Harvard University", 1),
    ("The meeting was scheduled for 3 PM", 1),
    ("COVID-19 cases increased by 15% last month", 1),
    # Opinions (label = 0)
    ("This is the best restaurant in town", 0),
    ("We should invest more in education", 0),
    ("That movie was terrible", 0),
    ("This policy is unfair to working families", 0),
    ("Climate change is the most important issue", 0),
]

## Convert to Dataframe

In [5]:
df = pd.DataFrame(training_data, columns=['text', 'label'])
print(f"Dataset size: {len(df)}")
print(df.head())

Dataset size: 10
                                         text  label
0        John Smith was elected mayor in 2020      1
1  The company reported $2 million in revenue      1
2       She graduated from Harvard University      1
3          The meeting was scheduled for 3 PM      1
4  COVID-19 cases increased by 15% last month      1


### Split Data

In [6]:
# Split into train/validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), 
    df['label'].tolist(), 
    test_size=0.2, 
    random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 8
Validation samples: 2


## Step 3: Load and Setup BERT

### Initialize Tokenizer and Model

In [7]:
# Choose your BERT variant
model_name = "bert-base-uncased"  # Good starting point
# Alternatives: "roberta-base", "distilbert-base-uncased" (faster)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2  # Binary classification: claim vs opinion
)

print(f"Model loaded: {model_name}")
print(f"Vocabulary size: {tokenizer.vocab_size}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: bert-base-uncased
Vocabulary size: 30522


### Tokenize Data

In [8]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        truncation=True, 
        padding=True, 
        max_length=128  # Adjust based on your text length
    )

# Create datasets
train_dataset = Dataset.from_dict({
    'text': train_texts,
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'text': val_texts,
    'labels': val_labels
})

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

print("Data tokenized successfully!")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Data tokenized successfully!


## Step 4: Fine-Tune Model

We are doing **transfer learning** with **fine-tuning**. 
BERT was pre-trained to understand language - Thank you!
We fine-tuning the model for a specific task - claim vs opinion here.
The technique = Supervised learning with backpropagation

Deep dive: BERT has millions of weights to understand language. We are adjusting these to suit our classification task. Only our final classification layer is learning from scratch. The rest of BERT is merely adapting instead of being completely retrained. 
BERT (I think) expects a "[MASK]" token to predict values. 
By fine-tuning, we add a layer like: `input text -> BERT Encoder -> Classification Head -> [Claim, Opinion] probabilities`.

### Define Training Arguments

[transformers.TrainingArguments](https://huggingface.co/docs/transformers/v4.52.3/en/main_classes/trainer#transformers.TrainingArguments) has a lot of parameters. 

In [9]:
training_args = TrainingArguments(
    output_dir='./bert-claim-classifier',
    num_train_epochs=3,              # Start with 3, adjust based on results
    per_device_train_batch_size=16,  # Reduce if memory issues
    per_device_eval_batch_size=16,
    warmup_steps=500, # gradually increase learning rate over 500 steps | prevents huge descrutive changes early on
    weight_decay=0.01, # Very mild 1% to prevent memorizing training daata exactly. 
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

### Define Evaluation Metrics

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### Initialize and Train

This is the fun part we all want to do :)

In [11]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start training
print("Starting training...")
trainer.train()

# Save the model
trainer.save_model('./bert-claim-classifier')
tokenizer.save_pretrained('./bert-claim-classifier')
print("Model saved!")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.680923,0.5,0.333333,0.25,0.5
2,No log,0.680706,0.5,0.333333,0.25,0.5
3,No log,0.680317,0.5,0.333333,0.25,0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Model saved!


## Step 5: Test Model

### Load Trained Model for Testing

In [13]:
# Load your fine-tuned model
classifier = pipeline(
    "text-classification",
    model="./bert-claim-classifier",
    tokenizer="./bert-claim-classifier"
)

# Test sentences
test_sentences = [
    "Barack Obama was president from 2009 to 2017",  # Should be factual
    "Pizza is the most delicious food ever",         # Should be opinion
    "The stock market closed at 4,500 points",      # Should be factual
    "This movie deserves an Oscar",                  # Should be opinion
]

print("Testing the model:")
for sentence in test_sentences:
    result = classifier(sentence)
    label = "Factual Claim" if result[0]['label'] == 'LABEL_1' else "Opinion"
    confidence = result[0]['score']
    print(f"Text: '{sentence}'")
    print(f"Prediction: {label} (confidence: {confidence:.3f})")
    print("-" * 50)

Device set to use cuda:0


Testing the model:
Text: 'Barack Obama was president from 2009 to 2017'
Prediction: Factual Claim (confidence: 0.558)
--------------------------------------------------
Text: 'Pizza is the most delicious food ever'
Prediction: Factual Claim (confidence: 0.579)
--------------------------------------------------
Text: 'The stock market closed at 4,500 points'
Prediction: Opinion (confidence: 0.531)
--------------------------------------------------
Text: 'This movie deserves an Oscar'
Prediction: Factual Claim (confidence: 0.562)
--------------------------------------------------


### Manual Evaluation Function

In [14]:
def evaluate_model(texts, true_labels):
    """Evaluate model on a list of texts with known labels"""
    predictions = []
    
    for text in texts:
        result = classifier(text)
        # Convert to binary (0 or 1)
        pred = 1 if result[0]['label'] == 'LABEL_1' else 0
        predictions.append(pred)
    
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted'
    )
    
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1-score: {f1:.3f}")
    
    return predictions

# Example usage:
test_texts = ["Company revenue increased 20%", "I think this is wrong"]
test_labels = [1, 0]  # 1 = factual, 0 = opinion
predictions = evaluate_model(test_texts, test_labels)

Accuracy: 0.000
Precision: 0.000
Recall: 0.000
F1-score: 0.000


## Step 6: Integration With Fact-Checker

In [21]:
def extract_claims_from_text(text):
    """
    Extract potential factual claims from text
    Returns list of sentences classified as factual claims
    """
    # Simple sentence splitting (you might want to use spaCy for better results)
    sentences = text.split('. ')
    print(sentences)
    
    claims = []
    for sentence in sentences:
        if len(sentence.strip()) > 10:  # Skip very short sentences
            print(sentence)
            result = classifier(sentence)
            print(result)
            if result[0]['label'] == 'LABEL_1':  # Factual claim
                claims.append({
                    'text': sentence,
                    'confidence': result[0]['score']
                })
    
    return claims

# Test with a Twitter example
twitter_text = """My opponent Denver Riggleman, running mate of Corey Stewart, was caught on camera campaigning with a white supremacist. Now he has been exposed as a devotee of Bigfoot erotica. This is not what we need on Capitol Hill."""

claims = extract_claims_from_text(twitter_text)
print(f"Extracted claims: {claims}")
for claim in claims:
    print(f"- {claim['text']} (confidence: {claim['confidence']:.3f})")

['My opponent Denver Riggleman, running mate of Corey Stewart, was caught on camera campaigning with a white supremacist', 'Now he has been exposed as a devotee of Bigfoot erotica', 'This is not what we need on Capitol Hill.']
My opponent Denver Riggleman, running mate of Corey Stewart, was caught on camera campaigning with a white supremacist
[{'label': 'LABEL_0', 'score': 0.5075468420982361}]
Now he has been exposed as a devotee of Bigfoot erotica
[{'label': 'LABEL_0', 'score': 0.571496844291687}]
This is not what we need on Capitol Hill.
[{'label': 'LABEL_0', 'score': 0.5415619015693665}]
Extracted claims: []
