In [2]:
import sys
from os import path
import os
from dotenv import load_dotenv

# Call python-dotenv first so the environment lookup below sees any values it provides.
load_dotenv()

# Put the parent of the current working directory on the import path
# (presumably so the `project` package imported later resolves — confirm).
parent_dir = path.dirname(os.getcwd())
sys.path.append(parent_dir)

# Raises KeyError when HUGGING_FACE_API_KEY is not set in the environment.
hf_token = os.environ["HUGGING_FACE_API_KEY"]


In [2]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import torch
import json

import pandas as pd

In [3]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Seed passed to the dataset splits
RANDOM_STATE = 42

# Dataset partitioning
TRAIN_SET_SIZE = 0.75      # fraction of all videos used for training
VALIDATION_SET_SIZE = 0.3  # fraction of the remaining (1 - TRAIN_SET_SIZE) used for validation

# Tokenization
MAX_INPUT_LENGTH = 512     # tokenizer max_length; longer inputs are truncated

# Training loop
EPOCHS = 10
TRAINING_BATCH_SIZE = 8
VALIDATION_BATCH_SIZE = 4

In [6]:
# For training from a checkpoint
# NOTE(review): machine-specific absolute path to one earlier run. The "Folder paths"
# cell that follows assigns OUTPUT_DIR again with a fresh timestamped directory, so when
# the cells are executed top to bottom this value is overwritten. Run only one of the two.
OUTPUT_DIR = "/home/leoli/Uni/Polimi/Thesis/master-thesis/models/roberta/2025-01-29 00:45:39"

In [37]:
from datetime import datetime, timezone

# Folder paths
BASE_OUTPUT_DIR = "../models/roberta"

# One directory per run, named after the UTC start time of this cell.
RUN_ID = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
OUTPUT_DIR = f"{BASE_OUTPUT_DIR}/{RUN_ID}"

# makedirs also creates BASE_OUTPUT_DIR as an intermediate directory when it is missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [7]:
from project.models import YouTubeVideoInfo

# Load the video metadata and sort it by id so that `videos`, `video_ids` and `labels`
# all share one deterministic order.
with open("../data/myyounicon-01/videos_infos.json", "r") as json_file:
    videos = [YouTubeVideoInfo.from_json(v) for v in json.load(json_file)]

videos = sorted(videos, key=lambda v: v.id)
video_ids = [v.id for v in videos]

# Keep only the label rows of the loaded videos. `.copy()` makes the filtered frame
# independent, so adding the numeric column below does not write into a slice of the
# frame read from the CSV.
labels_df = pd.read_csv("../data/YouNiCon/conspiracy_label.csv")
labels_df = labels_df[labels_df["video_id"].isin(video_ids)].copy()
labels_df["majority_label"] = pd.to_numeric(labels_df["majority_label"], errors="coerce")

# Re-order the labels to match `videos`. `.loc` raises KeyError when a loaded video has
# no row in the label file.
majority_labels = labels_df.set_index("video_id").loc[video_ids, "majority_label"]

# errors="coerce" turns unparseable labels into NaN; stop here with the offending ids
# instead of passing NaN/float class ids on to the split and the training cells.
if majority_labels.isna().any():
    missing_ids = majority_labels[majority_labels.isna()].index.tolist()
    raise ValueError(f"Missing or non-numeric majority_label for video ids: {missing_ids}")

# Duplicate video_id rows in the CSV would yield more labels than videos.
if len(majority_labels) != len(videos):
    raise ValueError(
        f"Expected one label per video, got {len(majority_labels)} labels for {len(videos)} videos"
    )

# Plain Python ints, one per video, in the same order as `videos`.
labels = majority_labels.astype(int).tolist()

conspiracy_videos = sum(labels)

print(f"Total instances: {len(labels)}, conspiracy instances: {conspiracy_videos}, non conspiracy instances: {len(labels)-conspiracy_videos}")

# Video fields that are serialized into the text given to the model.
attributes = [
    "channel_title",
    "title",
    "description",
    "categories",
    "tags",
    "subtitles",
    "auto_subtitles",
    "comments",
]

# Extra keyword arguments forwarded to to_string_for_model_input.
attributes_settings = {
    "max_subtitles_length": 1000,
    "include_comments_replies": True,
}


def _video_to_text(video):
    """Render one video as a single string using the fields and settings above."""
    return video.to_string_for_model_input(attributes_to_include=attributes, **attributes_settings)


# One text per video, in the same order as `videos`.
videos_as_text = list(map(_video_to_text, videos))

# First cut: TRAIN_SET_SIZE of the data goes to training, the rest is held out.
train_videos, holdout_videos, train_labels, holdout_labels = train_test_split(
    videos_as_text,
    labels,
    train_size=TRAIN_SET_SIZE,
    random_state=RANDOM_STATE,
)

# Second cut: VALIDATION_SET_SIZE of the held-out part becomes the validation set,
# whatever is left becomes the test set.
val_videos, test_videos, val_labels, test_labels = train_test_split(
    holdout_videos,
    holdout_labels,
    train_size=VALIDATION_SET_SIZE,
    random_state=RANDOM_STATE,
)

print(f"train dataset size: {len(train_labels)}")
print(f"validation dataset size: {len(val_labels)}")
print(f"test dataset size: {len(test_labels)}")

Total instances: 2515, conspiracy instances: 897, non conspiracy instances: 1618
train dataset size: 1886
validation dataset size: 188
test dataset size: 441


In [8]:
# Tokenizer of the "roberta-base" checkpoint; preprocess_data below calls it as a global.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def preprocess_data(texts, labels):
    """Tokenize `texts` and attach `labels` to the tokenizer output.

    Uses the module-level `tokenizer` with truncation and padding enabled and
    `max_length=MAX_INPUT_LENGTH`.

    Args:
        texts: the texts to tokenize, passed to the tokenizer as-is.
        labels: stored unchanged under the 'labels' key of the result.

    Returns:
        The object returned by the tokenizer, with an added 'labels' entry.
    """
    tokenized = tokenizer(
        texts,
        max_length=MAX_INPUT_LENGTH,
        padding=True,
        truncation=True,
    )
    tokenized['labels'] = labels
    return tokenized

# Tokenize the training and validation texts together with their labels.
# The test split is not encoded here; the evaluation cell feeds the raw test texts
# to a text-classification pipeline instead.
train_encodings = preprocess_data(train_videos, train_labels)
val_encodings = preprocess_data(val_videos, val_labels)


In [9]:
class BinaryClassificationDataset(torch.utils.data.Dataset):
    """Dataset view over a mapping of field name -> per-sample values.

    `encodings` must contain an 'input_ids' entry (its length is the dataset size)
    and must support `.items()`; every field is indexed with the sample index.
    """

    def __init__(self, encodings):
        # Kept by reference; nothing is copied or converted up front.
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Tensors are created lazily, one sample at a time.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the tokenized splits so the Trainer can index them sample by sample.
train_dataset = BinaryClassificationDataset(train_encodings)
val_dataset = BinaryClassificationDataset(val_encodings)

In [10]:
# Load "roberta-base" with a 2-label sequence-classification head. The load message
# printed below lists the classifier.* weights as newly initialized, i.e. the head is
# untrained until the Trainer has run.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Training configuration. Evaluation and checkpointing both happen once per epoch so
# that the best checkpoint (lowest eval_loss) can be reloaded when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAINING_BATCH_SIZE,
    per_device_eval_batch_size=VALIDATION_BATCH_SIZE,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    # Cap on stored checkpoints. Per the TrainingArguments docs, the best checkpoint is
    # always retained alongside the most recent ones when load_best_model_at_end=True.
    save_total_limit=2,
    # Use the same seed as the dataset splits instead of relying on the library default.
    seed=RANDOM_STATE,
    report_to="none",
)

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: a pair `(logits, labels)`; the predicted class of each row is the
            argmax of its logits along dimension 1.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

    raw_scores, gold = eval_pred
    predicted = torch.tensor(raw_scores).argmax(dim=1)

    # Every scorer is called as scorer(true_labels, predicted_labels).
    scorers = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {name: scorer(gold, predicted) for name, scorer in scorers.items()}

# Early stopping with a patience of 3 (presumably evaluation rounds without improvement
# of metric_for_best_model — see the EarlyStoppingCallback docs to confirm).
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# The Trainer is built from the model, the arguments, both datasets, the metric function
# and the early-stopping callback. The tokenizer is NOT passed here: inputs are already
# tokenized and padded by preprocess_data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
trainer.train()

# Save the best model and tokenizer.
# With load_best_model_at_end=True the Trainer holds the best checkpoint once train()
# returns, so save_model writes that one. The Trainer was built without the tokenizer,
# so save_model alone would not write it; save it explicitly next to the model.
best_model_dir = f"{OUTPUT_DIR}/best_model"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

  0%|          | 0/2360 [00:00<?, ?it/s]

{'loss': 0.6968, 'grad_norm': 37.36332321166992, 'learning_rate': 4.978813559322034e-05, 'epoch': 0.04}
{'loss': 0.5959, 'grad_norm': 19.72719955444336, 'learning_rate': 4.957627118644068e-05, 'epoch': 0.08}
{'loss': 0.6505, 'grad_norm': 2.6474087238311768, 'learning_rate': 4.936440677966102e-05, 'epoch': 0.13}
{'loss': 0.6643, 'grad_norm': 5.114778995513916, 'learning_rate': 4.915254237288136e-05, 'epoch': 0.17}
{'loss': 0.6301, 'grad_norm': 2.3885669708251953, 'learning_rate': 4.89406779661017e-05, 'epoch': 0.21}
{'loss': 0.731, 'grad_norm': 5.777268409729004, 'learning_rate': 4.8728813559322034e-05, 'epoch': 0.25}
{'loss': 0.5892, 'grad_norm': 1.5416439771652222, 'learning_rate': 4.851694915254237e-05, 'epoch': 0.3}
{'loss': 0.6701, 'grad_norm': 2.840379238128662, 'learning_rate': 4.8305084745762714e-05, 'epoch': 0.34}
{'loss': 0.6042, 'grad_norm': 4.874413013458252, 'learning_rate': 4.809322033898305e-05, 'epoch': 0.38}
{'loss': 0.5709, 'grad_norm': 16.4432373046875, 'learning_rate

  0%|          | 0/47 [00:00<?, ?it/s]

{'eval_loss': 0.6555712223052979, 'eval_accuracy': 0.7340425531914894, 'eval_f1': 0.6621621621621622, 'eval_precision': 0.6621621621621622, 'eval_recall': 0.6621621621621622, 'eval_runtime': 64.808, 'eval_samples_per_second': 2.901, 'eval_steps_per_second': 0.725, 'epoch': 1.0}


## Train From Checkpoint

In [None]:
# Run directory of the training that is being resumed. It is defined once so the
# checkpoint that is read and the model that is written always belong to the same run.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
RESUME_RUN_DIR = "/home/leoli/Uni/Polimi/Thesis/master-thesis/models/roberta/2025-01-29 00:45:39"

# Continue training from the stored checkpoint.
trainer.train(resume_from_checkpoint=f"{RESUME_RUN_DIR}/checkpoint-236")

# Write the resulting model next to the checkpoints of that run.
trainer.save_model(f"{RESUME_RUN_DIR}/best_model")

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


  0%|          | 0/2360 [00:00<?, ?it/s]

  checkpoint_rng_state = torch.load(rng_file)


{'loss': 0.1413, 'grad_norm': 7.008457660675049, 'learning_rate': 4.491525423728814e-05, 'epoch': 1.02}
{'loss': 0.3905, 'grad_norm': 16.55491828918457, 'learning_rate': 4.470338983050847e-05, 'epoch': 1.06}
{'loss': 0.5022, 'grad_norm': 12.508938789367676, 'learning_rate': 4.4491525423728816e-05, 'epoch': 1.1}
{'loss': 0.4238, 'grad_norm': 15.769083023071289, 'learning_rate': 4.427966101694915e-05, 'epoch': 1.14}
{'loss': 0.2878, 'grad_norm': 2.8718972206115723, 'learning_rate': 4.4067796610169495e-05, 'epoch': 1.19}
{'loss': 0.6175, 'grad_norm': 9.003098487854004, 'learning_rate': 4.385593220338983e-05, 'epoch': 1.23}
{'loss': 0.5005, 'grad_norm': 9.979904174804688, 'learning_rate': 4.3644067796610175e-05, 'epoch': 1.27}
{'loss': 0.2397, 'grad_norm': 17.84249496459961, 'learning_rate': 4.343220338983051e-05, 'epoch': 1.31}
{'loss': 0.533, 'grad_norm': 18.96619415283203, 'learning_rate': 4.3220338983050854e-05, 'epoch': 1.36}
{'loss': 0.4788, 'grad_norm': 4.204215049743652, 'learning_

  0%|          | 0/47 [00:00<?, ?it/s]

{'eval_loss': 0.6093281507492065, 'eval_accuracy': 0.7074468085106383, 'eval_f1': 0.4329896907216495, 'eval_precision': 0.9130434782608695, 'eval_recall': 0.28378378378378377, 'eval_runtime': 64.0574, 'eval_samples_per_second': 2.935, 'eval_steps_per_second': 0.734, 'epoch': 2.0}


## Evaluation

In [None]:
from transformers import pipeline
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

# NOTE(review): machine-specific absolute path, and it names a different run
# (2025-01-27 15:41:28) than the checkpoint cells above (2025-01-29 00:45:39) —
# confirm this is the model that should be evaluated.
MODEL_DIR = "/home/leoli/Uni/Polimi/Thesis/master-thesis/models/roberta/2025-01-27 15:41:28/best_model"

best_model = RobertaForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Run inference on the device selected at the top of the notebook; without `device`
# the pipeline falls back to CPU even when CUDA is available.
evaluation_pipeline = pipeline(
    "text-classification",
    model=best_model,
    tokenizer=tokenizer,
    device=device,
)

# Classify every held-out test text; "LABEL_0" maps to class 0, anything else to class 1.
predictions = []
for video_text in tqdm(test_videos):
    score = evaluation_pipeline(video_text, truncation=True, padding=True, max_length=MAX_INPUT_LENGTH)[0]
    pred_label = 0 if score["label"] == "LABEL_0" else 1
    predictions.append(pred_label)

# labels=[0, 1] pins the matrix to 2x2, so the four-way unpack also works when the
# test labels and predictions happen to contain a single class.
tn, fp, fn, tp = confusion_matrix(test_labels, predictions, labels=[0, 1]).ravel()

print("TN\tFP\tFN\tTP")
print(f"{tn}\t{fp}\t{fn}\t{tp}")

Device set to use cpu
100%|██████████| 441/441 [02:51<00:00,  2.57it/s]

TN
FP
FN
TP
225	47	62	107



