In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc




In [3]:
# Read the pre-split CSV files into a single object with 'train' and 'test' splits.
csv_splits = {
    'train': 'processed_data/train.csv',
    'test': 'processed_data/test.csv',
}
dataset = load_dataset('csv', data_files=csv_splits)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
def tokenize_function(example):
    """Encode the 'text' entry of ``example`` with the notebook-level ``tokenizer``.

    Padding is set to 'max_length' and truncation is enabled, so the call is
    forwarded with exactly those two options. ``tokenizer`` is looked up at
    call time, so it only has to exist before this function is first invoked.
    """
    texts = example['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# Load the BERT tokenizer, encode every split with it, expose the result as
# torch tensors, and store the tokenizer files for later reuse.
bert_checkpoint = 'bert-base-uncased'
tokenizer_dir = './final_models/bert_tokenizer'

tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type='torch')
tokenizer.save_pretrained(tokenizer_dir)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

('./final_models/bert_tokenizer\\tokenizer_config.json',
 './final_models/bert_tokenizer\\special_tokens_map.json',
 './final_models/bert_tokenizer\\vocab.txt',
 './final_models/bert_tokenizer\\added_tokens.json',
 './final_models/bert_tokenizer\\tokenizer.json')

In [6]:
def compute_metrics(pred):
    """Turn a prediction object into accuracy / precision / recall / F1.

    Args:
        pred: Object with ``label_ids`` (reference labels) and ``predictions``
            (per-class scores); the predicted class is the arg-max over the
            last axis of ``predictions``.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores.update(zip(('precision', 'recall', 'f1'), prf))
    return scores


In [None]:
model_name = 'bert-base-uncased'
print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Per-checkpoint directories for checkpoints and logs.
results_dir = f"./results/{model_name}"
logs_dir = f"./logs/{model_name}"

# NOTE(review): eval_steps is documented as applying to a "steps" evaluation
# strategy, which is not configured here — confirm whether mid-training
# evaluation was intended.
training_args = TrainingArguments(
    output_dir=results_dir,
    logging_dir=logs_dir,
    do_eval=True,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Fine-tune, persist the final weights, then show metrics on the test split.
trainer.train()
trainer.save_model(f"./final_models/{model_name}")
trainer.evaluate()


In [None]:
model_name = 'roberta-base'
print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# `tokenized_datasets` was encoded with the bert-base-uncased tokenizer. RoBERTa
# uses a different vocabulary and different special-token ids, so feeding it
# BERT ids trains on meaningless inputs. Re-encode the raw text with this
# checkpoint's own tokenizer instead. Cell-local names are used on purpose so
# the BERT-encoded `tokenized_datasets` stays intact for the cells that rely on it.
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
roberta_datasets = dataset.map(
    lambda batch: roberta_tokenizer(batch['text'], padding='max_length', truncation=True),
    batched=True,
)
roberta_datasets.set_format(type='torch')

# NOTE(review): eval_steps is documented as applying to a "steps" evaluation
# strategy, which is not configured here — confirm whether mid-training
# evaluation was intended.
training_args = TrainingArguments(
    output_dir=f"./results/{model_name}",
    do_eval=True,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir=f"./logs/{model_name}"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=roberta_datasets['train'],
    eval_dataset=roberta_datasets['test'],
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(f"./final_models/{model_name}")
# Keep the matching tokenizer next to the weights so the saved model can be
# reloaded for inference with the correct vocabulary.
roberta_tokenizer.save_pretrained(f"./final_models/{model_name}")
trainer.evaluate()


In [None]:
model_name = 'distilbert-base-uncased'
print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Optimisation settings, gathered in one mapping and unpacked below.
# NOTE(review): eval_steps is documented as applying to a "steps" evaluation
# strategy, which is not configured here — confirm the intent.
run_settings = dict(
    do_eval=True,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)
training_args = TrainingArguments(
    output_dir=f"./results/{model_name}",
    logging_dir=f"./logs/{model_name}",
    **run_settings,
)

train_split = tokenized_datasets['train']
test_split = tokenized_datasets['test']
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=test_split,
    compute_metrics=compute_metrics,
)

# Fine-tune, persist the final weights, then show metrics on the test split.
trainer.train()
trainer.save_model(f"./final_models/{model_name}")
trainer.evaluate()


In [None]:
model_name = 'albert-base-v2'
print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# `tokenized_datasets` was encoded with the bert-base-uncased tokenizer. ALBERT
# uses its own SentencePiece vocabulary, which is smaller than BERT's, so BERT
# ids are both semantically wrong and can index past ALBERT's embedding table.
# Re-encode the raw text with this checkpoint's own tokenizer instead.
# Cell-local names are used on purpose so the BERT-encoded `tokenized_datasets`
# stays intact for the cells that rely on it.
albert_tokenizer = AutoTokenizer.from_pretrained(model_name)
albert_datasets = dataset.map(
    lambda batch: albert_tokenizer(batch['text'], padding='max_length', truncation=True),
    batched=True,
)
albert_datasets.set_format(type='torch')

# NOTE(review): eval_steps is documented as applying to a "steps" evaluation
# strategy, which is not configured here — confirm whether mid-training
# evaluation was intended.
training_args = TrainingArguments(
    output_dir=f"./results/{model_name}",
    do_eval=True,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir=f"./logs/{model_name}"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=albert_datasets['train'],
    eval_dataset=albert_datasets['test'],
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model(f"./final_models/{model_name}")
# Keep the matching tokenizer next to the weights so the saved model can be
# reloaded for inference with the correct vocabulary.
albert_tokenizer.save_pretrained(f"./final_models/{model_name}")
trainer.evaluate()


In [None]:
model_name = 'google/electra-base-discriminator'
print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# The hub id contains a '/', so a flattened variant is used for directory names.
safe_name = model_name.replace('/', '_')

# NOTE(review): eval_steps is documented as applying to a "steps" evaluation
# strategy, which is not configured here — confirm whether mid-training
# evaluation was intended.
training_args = TrainingArguments(
    output_dir=f"./results/{safe_name}",
    logging_dir=f"./logs/{safe_name}",
    do_eval=True,
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Fine-tune, persist the final weights, then show metrics on the test split.
trainer.train()
trainer.save_model(f"./final_models/{safe_name}")
trainer.evaluate()


In [None]:
def predict_and_visualize(trainer, test_dataset, label_names, title_prefix="Binary"):
    """Run inference on ``test_dataset`` and plot a confusion matrix and a ROC curve.

    Args:
        trainer: Object exposing ``predict(dataset)``; the result must provide
            ``predictions`` (raw per-class scores) and ``label_ids``.
        test_dataset: Dataset handed unchanged to ``trainer.predict``.
        label_names: Display labels for the confusion-matrix axes.
        title_prefix: Text placed in front of both figure titles.

    Returns:
        Tuple ``(preds, probs)``: the arg-max class index per row and the
        softmax of the raw scores taken over axis 1.
    """
    output = trainer.predict(test_dataset)
    logits = torch.tensor(output.predictions)
    probs = torch.softmax(logits, dim=1).numpy()
    preds = probs.argmax(axis=1)
    y_true = output.label_ids

    # Confusion matrix of reference labels versus arg-max predictions.
    matrix = confusion_matrix(y_true, preds)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=label_names)
    display.plot(cmap=plt.cm.Blues)
    plt.title(f"{title_prefix} Classification - Confusion Matrix")
    plt.show()

    # ROC curve; the probability in column 1 serves as the ranking score.
    positive_scores = probs[:, 1]
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{title_prefix} Classification - ROC Curve")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

    return preds, probs

# `trainer` is whichever Trainer was assigned most recently by the cells above.
class_names = ["Real", "Fake"]
binary_preds, binary_probs = predict_and_visualize(
    trainer,
    tokenized_datasets['test'],
    label_names=class_names,
    title_prefix="Fake Detection",
)


In [None]:
# Collect the test texts, their reference labels, the predicted class and both
# class probabilities into one table, then write it to disk without the index.
test_split = dataset['test']
prediction_columns = {
    "text": test_split['text'],
    "true_label": test_split['label'],
    "predicted_label": binary_preds,
    "prob_real": binary_probs[:, 0],
    "prob_fake": binary_probs[:, 1],
}
df_binary = pd.DataFrame(prediction_columns)
df_binary.to_csv("binary_predictions.csv", index=False)