#TruthScope

##A Fine-Tuned Stance Classifier

In [1]:
!pip install transformers datasets evaluate --quiet
!pip install nltk --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m44.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency r

In [2]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import evaluate
from datasets import Dataset
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Mount / Get Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


You'll need to download the fnc-1 data and upload it to Google Drive. Change the filepath accordingly. \\
https://github.com/FakeNewsChallenge/fnc-1

In [4]:
data_path = "/content/drive/MyDrive/TruthScope/fnc-1/"

# Load csvs
bodies_df = pd.read_csv(os.path.join(data_path, "train_bodies.csv"))
stances_df = pd.read_csv(os.path.join(data_path, "train_stances.csv"))

# Print heads
print("Bodies data:")
print(bodies_df.head())
print("\nStances data:")
print(stances_df.head())


Bodies data:
   Body ID                                        articleBody
0        0  A small meteorite crashed into a wooded area i...
1        4  Last week we hinted at what was to come as Ebo...
2        5  (NEWSER) – Wonder how long a Quarter Pounder w...
3        6  Posting photos of a gun-toting child online, I...
4        7  At least 25 suspected Boko Haram insurgents we...

Stances data:
                                            Headline  Body ID     Stance
0  Police find mass graves with at least '15 bodi...      712  unrelated
1  Hundreds of Palestinians flee floods in Gaza a...      158      agree
2  Christian Bale passes on role of Steve Jobs, a...      137  unrelated
3  HBO and Apple in Talks for $15/Month Apple TV ...     1034  unrelated
4  Spider burrowed through tourist's stomach and ...     1923   disagree


In [None]:
# Merge on body id
merged_df = pd.merge(stances_df, bodies_df, on="Body ID")

# Because we're using RoBERTa, we need to merge everything into one piece of text.
# Use sep token </s> to achieve this
merged_df["text"] = merged_df["Headline"] + " </s> " + merged_df["articleBody"]

# Drop empty
merged_df = merged_df[merged_df["text"].str.strip() != ""]

In [None]:
# Map stance
label_map = {"agree": 0, "disagree": 1, "discuss": 2, "unrelated": 3}
merged_df['label'] = merged_df['Stance'].str.lower().map(label_map)

# Drop NaN
merged_df = merged_df.dropna(subset=['label'])

# Convert label col to int
merged_df['label'] = merged_df['label'].astype('int64')

##Data Balancing
The data is severely unbalanced, so I evened out the samples. This gives us a smaller training set, but, counterintuitively, better performance overall.

In [7]:
# Print unique
print("Label counts before balancing:")
print(merged_df['label'].value_counts())

# Num samples chosen
target_count = 1000

# Split the merged data into train and eval
split = merged_df.sample(frac=0.8, random_state=42)
eval_df = merged_df.drop(split.index)

# Balance
def balance_df(df, target_count):
    # Group by label and resample
    balanced = df.groupby("label", group_keys=False).apply(
        lambda group: group.sample(n=target_count, replace=(len(group) < target_count))
    )
    return balanced

balanced_train_df = balance_df(split, target_count)

print("\nLabel counts after balancing training data:")
print(balanced_train_df['label'].value_counts())

# Now convert the balanced df to HF dataset
# Had to rebuild text col for some reason
balanced_train_df["text"] = balanced_train_df["Headline"] + " </s> " + balanced_train_df["articleBody"]
balanced_train_dataset = Dataset.from_pandas(balanced_train_df[['text', 'label']])

# Using original eval set bc no reason not to
eval_df["text"] = eval_df["Headline"] + " </s> " + eval_df["articleBody"]
eval_dataset = Dataset.from_pandas(eval_df[['text', 'label']])


Label counts before balancing:
label
3    36545
2     8909
0     3678
1      840
Name: count, dtype: int64

Label counts after balancing training data:
label
0    1000
1    1000
2    1000
3    1000
Name: count, dtype: int64


  balanced = df.groupby("label", group_keys=False).apply(


In [8]:
# Create an HF dataset
fnc_dataset = Dataset.from_pandas(merged_df[['text', 'label']])


##Tokenization

In [9]:
model_name = "distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

balanced_train_dataset = balanced_train_dataset.map(tokenize_function, batched=True)
balanced_train_dataset = balanced_train_dataset.remove_columns(["text"])
balanced_train_dataset.set_format("torch")

eval_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.remove_columns(["text"])
eval_dataset.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9994 [00:00<?, ? examples/s]

##Load RoBERTa model and *freeze encoder layers*
We on that free Colab credit grind \\
Additionally, performance would probably deteriorate w/out freezing

In [10]:

num_labels = 4  # agree, disagree, discusses, unrelated
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Freeze all layers in the encoder
freeze_layers = True
if freeze_layers:
    for param in model.roberta.parameters():
        param.requires_grad = False
    # For some reason, performs much better if unfreeze last layer
    for param in model.roberta.encoder.layer[-1].parameters():
        param.requires_grad = True

# Print all trainable params
trainable = [name for name, param in model.named_parameters() if param.requires_grad]
print("Trainable parameters:", trainable)


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: ['roberta.encoder.layer.5.attention.self.query.weight', 'roberta.encoder.layer.5.attention.self.query.bias', 'roberta.encoder.layer.5.attention.self.key.weight', 'roberta.encoder.layer.5.attention.self.key.bias', 'roberta.encoder.layer.5.attention.self.value.weight', 'roberta.encoder.layer.5.attention.self.value.bias', 'roberta.encoder.layer.5.attention.output.dense.weight', 'roberta.encoder.layer.5.attention.output.dense.bias', 'roberta.encoder.layer.5.attention.output.LayerNorm.weight', 'roberta.encoder.layer.5.attention.output.LayerNorm.bias', 'roberta.encoder.layer.5.intermediate.dense.weight', 'roberta.encoder.layer.5.intermediate.dense.bias', 'roberta.encoder.layer.5.output.dense.weight', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.5.output.LayerNorm.weight', 'roberta.encoder.layer.5.output.LayerNorm.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']


##Training args

In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,               # 5 epochs is generally good w these params
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Use eval library
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    result = accuracy_metric.compute(predictions=predictions, references=labels)
    return result




Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

##Init Trainer for fine tuning

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=balanced_train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhilton_petersen[0m ([33mhilton_petersen-brown-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9069,0.720541,0.73404
2,0.5706,0.466872,0.835101
3,0.4793,0.355169,0.870622
4,0.3723,0.364904,0.879027
5,0.4206,0.306871,0.895337


TrainOutput(global_step=2500, training_loss=0.641031404876709, metrics={'train_runtime': 1228.4855, 'train_samples_per_second': 16.28, 'train_steps_per_second': 2.035, 'total_flos': 2649442467840000.0, 'train_loss': 0.641031404876709, 'epoch': 5.0})

In [15]:
# Pick a folder in Drive to save the model weights
save_path = '/content/drive/MyDrive/TruthScope.pth'

# Save the model state_dict as .pth
torch.save(model.state_dict(), save_path)

print("Model weights saved to:", save_path)

Model weights saved to: /content/drive/MyDrive/TruthScope.pth


In [16]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 0.3068709075450897, 'eval_accuracy': 0.8953372023213928, 'eval_runtime': 139.3635, 'eval_samples_per_second': 71.712, 'eval_steps_per_second': 4.485, 'epoch': 5.0}


In [17]:
predictions_output = trainer.predict(eval_dataset)
preds = np.argmax(predictions_output.predictions, axis=-1)
labels = predictions_output.label_ids

In [18]:
!pip install scikit-learn --quiet

##Results

Decent results overall. Recall is generally quite good. Precision is all over the place. Model struggles with disagreement in general. May tune it later. Class weighting did not help. Would love some better data.

In [19]:
from sklearn.metrics import confusion_matrix, classification_report

# Get confusion matrix:
cm = confusion_matrix(labels, preds)
print("Confusion Matrix:")
print(cm)

# Generate report
target_names = ["agree", "disagree", "unrelated", "discuss"]
report = classification_report(labels, preds, target_names=target_names)
print("Classification Report:\n", report)

Confusion Matrix:
[[ 537  119   90   11]
 [  16  140   16    4]
 [ 215  139 1360   36]
 [ 114   90  196 6911]]
Classification Report:
               precision    recall  f1-score   support

       agree       0.61      0.71      0.66       757
    disagree       0.29      0.80      0.42       176
   unrelated       0.82      0.78      0.80      1750
     discuss       0.99      0.95      0.97      7311

    accuracy                           0.90      9994
   macro avg       0.68      0.81      0.71      9994
weighted avg       0.92      0.90      0.91      9994

