In [27]:
import sys
from os import path
import os
from dotenv import load_dotenv

# Read the local .env file (if any) into the process environment.
load_dotenv()

# Put the parent of the current working directory on the import path
# (presumably so that the `project` package can be imported - verify).
sys.path.append(os.path.dirname(os.getcwd()))

# Hugging Face access token; a KeyError is raised here if the variable is unset.
hf_token = os.environ["HUGGING_FACE_API_KEY"]


In [28]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
import json
import pandas as pd

In [29]:
# Choose the compute device: the GPU when CUDA is usable, the CPU otherwise.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [33]:
# Random state
RANDOM_STATE = 42

# Folder paths
BASE_OUTPUT_DIR = "../models/roberta"
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
# Number the run after the highest existing numeric folder, not after the number
# of directory entries: counting entries yields an id that already exists as soon
# as an older run is deleted or a stray file (.gitkeep, .DS_Store, ...) is present,
# and the new run would then silently write into the earlier run's folder.
RUN_ID = f"{max((int(entry) for entry in os.listdir(BASE_OUTPUT_DIR) if entry.isdecimal()), default=0) + 1:03d}"
OUTPUT_DIR = f"{BASE_OUTPUT_DIR}/{RUN_ID}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Training arguments
EPOCHS = 1
TRAIN_SET_SIZE = 0.75
VALIDATION_SET_SIZE = 0.3 # Of the remaining 1 - TRAIN_SET_SIZE
TRAINING_BATCH_SIZE = 8
VALIDATION_BATCH_SIZE = 4


In [34]:
from project.models import YouTubeVideoInfo

# Load the scraped video metadata. Sorting by id fixes the order of the videos,
# so the seeded splits below are the same on every run.
with open("../data/myyounicon-01/videos_infos.json", "r") as json_file:
    videos = [YouTubeVideoInfo.from_json(v) for v in json.load(json_file)]

videos = sorted(videos, key=lambda v: v.id)
video_ids = [v.id for v in videos]

# Align the labels with the videos (a KeyError is raised if a video has no row).
# The values are coerced on the aligned Series instead of being assigned into a
# filtered DataFrame slice, which triggers pandas' chained-assignment warning.
labels_df = pd.read_csv("../data/YouNiCon/conspiracy_label.csv")
majority_labels = pd.to_numeric(
    labels_df.set_index("video_id").loc[video_ids, "majority_label"],
    errors="coerce",
)

# errors="coerce" turns unparsable values into NaN. Fail here with a clear message
# instead of letting NaN / float labels reach the loss function during training.
if majority_labels.isna().any():
    unlabeled_ids = majority_labels.index[majority_labels.isna()].tolist()
    raise ValueError(
        f"{len(unlabeled_ids)} video(s) have a missing or non-numeric majority_label, "
        f"e.g. {unlabeled_ids[:5]}"
    )
if len(majority_labels) != len(videos):
    raise ValueError(
        f"Expected one label per video, got {len(majority_labels)} labels for "
        f"{len(videos)} videos (duplicate video_id rows in the label file?)"
    )

# Integer class ids are required for single-label classification.
labels = majority_labels.astype(int).tolist()

conspiracy_videos = sum(labels)

print(f"Total instances: {len(labels)}, conspiracy instances: {conspiracy_videos}, non conspiracy instances: {len(labels)-conspiracy_videos}")

attributes = ["channel_title", "title", "description", "categories", "tags", "subtitles", "auto_subtitles", "comments"]
attributes_settings = {
    "max_subtitles_length": 1000, 
    "include_comments_replies": True,
}

videos_as_text = [v.to_string_for_model_input(attributes_to_include=attributes, **attributes_settings) for v in videos]

# First split: training set vs. hold-out. Second split: the hold-out is divided
# into validation (VALIDATION_SET_SIZE of it) and test (the rest).
train_videos, holdout_videos, train_labels, holdout_labels = train_test_split(
    videos_as_text, labels, train_size=TRAIN_SET_SIZE, random_state=RANDOM_STATE
)

val_videos, test_videos, val_labels, test_labels = train_test_split(
    holdout_videos, holdout_labels, train_size=VALIDATION_SET_SIZE, random_state=RANDOM_STATE
)

print(f"train dataset size: {len(train_labels)}")
print(f"validation dataset size: {len(val_labels)}")
print(f"test dataset size: {len(test_labels)}")

Total instances: 2515, conspiracy instances: 897, non conspiracy instances: 1618
train dataset size: 1886
validation dataset size: 188
test dataset size: 441


In [14]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def preprocess_data(texts, labels):
    """Tokenize ``texts`` and attach ``labels`` to the result.

    Args:
        texts: The input strings to tokenize.
        labels: One label per text; stored unchanged under the ``'labels'`` key.

    Returns:
        The tokenizer output (truncated to at most 512 tokens, with padding
        enabled) with the extra ``'labels'`` entry.
    """
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 512}
    batch = tokenizer(texts, **tokenizer_options)
    batch['labels'] = labels
    return batch


train_encodings = preprocess_data(train_videos, train_labels)
val_encodings = preprocess_data(val_videos, val_labels)


In [15]:
class BinaryClassificationDataset(torch.utils.data.Dataset):
    """Dataset view over a mapping of per-example sequences.

    ``encodings`` maps field names to indexable sequences; it must contain an
    ``'input_ids'`` entry, whose length defines the size of the dataset.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one example: every field's idx-th entry, converted to a tensor.
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        return example

# Wrap the tokenized training and validation splits in dataset objects.
train_dataset, val_dataset = (
    BinaryClassificationDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

In [16]:
# Pretrained roberta-base checkpoint configured for two output labels.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=OUTPUT_DIR,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    report_to="none",
    # Training length and batch sizes.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAINING_BATCH_SIZE,
    per_device_eval_batch_size=VALIDATION_BATCH_SIZE,
    # Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
    # with the lowest evaluation loss.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Keep only the 2 most recent checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Define Metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each row
            is the index of its largest logit along dimension 1.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``, each computed by the scikit-learn function of that name.
    """
    from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=1)

    scorers = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {name: scorer(labels, predicted_classes) for name, scorer in scorers.items()}

# Early Stopping Callback
# NOTE(review): evaluation runs once per epoch, so a patience of 3 can only
# trigger when training runs for more than 3 epochs - confirm against EPOCHS.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    callbacks=[early_stopping],
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [9]:
trainer.train()

# Save the best model together with its tokenizer. The Trainer was created
# without a tokenizer, so save_model() alone writes only the model files and
# loading a tokenizer from this folder later would fail.
best_model_dir = f"{OUTPUT_DIR}/best_model"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

In [None]:
from transformers import pipeline
from sklearn.metrics import confusion_matrix

# Evaluate the model saved by the training cell of this run. Point MODEL_DIR at
# another run's "best_model" folder to evaluate an earlier model instead.
MODEL_DIR = f"{OUTPUT_DIR}/best_model"
MAX_SEQUENCE_LENGTH = 512  # same truncation length as used to tokenize the training data

best_model = RobertaForSequenceClassification.from_pretrained(MODEL_DIR)
try:
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_DIR)
except OSError:
    # The folder holds no tokenizer files (none are written unless the tokenizer
    # is saved explicitly). Fine-tuning does not change the vocabulary, so the
    # base tokenizer is equivalent.
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

evaluation_pipeline = pipeline("text-classification", model=best_model, tokenizer=tokenizer)

predictions = []
for video_text in test_videos:
    # For a single string the pipeline returns [{"label": ..., "score": ...}] with
    # only the top-scoring class, so map that label name back to its class id.
    # With two classes, "class 1 is the top class" is the same decision as
    # "score of class 1 > 0.5". Truncation is required because the texts can
    # exceed the model's maximum sequence length.
    top_prediction = evaluation_pipeline(
        video_text, truncation=True, max_length=MAX_SEQUENCE_LENGTH
    )[0]
    pred_label = int(best_model.config.label2id[top_prediction["label"]])
    predictions.append(pred_label)

# labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs.
tn, fp, fn, tp = confusion_matrix(test_labels, predictions, labels=[0, 1]).ravel()

print("TN\tFP\tFN\tTP")
print(f"{tn}\t{fp}\t{fn}\t{tp}")