# In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import softmax

# Pick the compute device: the GPU when PyTorch reports CUDA, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Instantiate the tokenizer for the cased BERT-base vocabulary
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Read the evaluation data (replace with the path to your CSV file)
df = pd.read_csv('test_file.csv')

# Restrict the frame to the expected columns, in this fixed order
df = df[[
    'username',
    'type',
    'post_id',
    'title',
    'body',
    'subreddit',
    'label',
    'created_utc',
    'text',
]]

# Tokenization function
def tokenize_data(examples):
    """Tokenize the ``'text'`` entry of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that provides the raw strings under the key
            ``'text'``.

    Returns:
        The tokenizer's output for those strings, with every sequence
        truncated and padded to exactly 512 tokens.
    """
    raw_texts = examples['text']
    encoded = tokenizer(
        raw_texts,
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    return encoded

# Wrap the dataframe in a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Run the tokenizer over the dataset, one batch of rows at a time
dataset = dataset.map(tokenize_data, batched=True)

# Drop the raw text and metadata columns
dataset = dataset.remove_columns(
    ['text', 'username', 'type', 'post_id', 'title', 'body', 'subreddit', 'created_utc']
)
# Expose the target column under the name "labels"
dataset = dataset.rename_column("label", "labels")
# Have the dataset hand back torch tensors
dataset.set_format("torch")

# Load the trained model from its checkpoint, then move it to the chosen device
checkpoint_path = "results/checkpoint-2346"
model = BertForSequenceClassification.from_pretrained(checkpoint_path)
model = model.to(device)

# Define the compute_metrics function
def compute_metrics(pred):
    """Compute binary-classification metrics and display the ROC curve.

    Args:
        pred: Evaluation output exposing ``label_ids`` (the true labels) and
            ``predictions`` (the model outputs). ``predictions`` is
            interpreted by its shape:

            * 1-D: used directly as the positive-class score;
            * two columns (``shape[1] == 2``): treated as logits; a softmax
              over axis 1 is applied and column 1 is taken as the
              probability of class 1;
            * any other multi-dimensional shape: column 0 is used directly
              as the score, with no transformation applied.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.

    Side effects:
        Opens a matplotlib figure with the ROC curve and calls ``plt.show()``.
    """
    labels = pred.label_ids
    outputs = np.asarray(pred.predictions)

    # Check the rank BEFORE touching shape[1]: a 1-D array has no second axis,
    # so indexing shape[1] first would raise IndexError and the 1-D fallback
    # could never be reached.
    if outputs.ndim == 1:
        # Already a single score per example; use it directly
        probs = outputs
    elif outputs.shape[1] == 2:
        # Binary classification with logits: probability for class 1
        probs = softmax(outputs, axis=1)[:, 1]
    else:
        # Single-output (or other) layout: take the first column as the score
        probs = outputs[:, 0]

    # Threshold per-example scores at 0.5; if the scores are not 1-D
    # (predictions with more than two dimensions) fall back to the arg-max
    # over the last axis of the raw outputs.
    preds = probs > 0.5 if probs.ndim == 1 else outputs.argmax(-1)

    # Standard metrics
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    auc = roc_auc_score(labels, probs)  # AUC score for binary classification

    # Compute ROC curve
    fpr, tpr, _ = roc_curve(labels, probs)

    # Plot ROC curve against the diagonal chance line
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc
    }


# Arguments for the Trainer; only the evaluation batch size is customised
training_args = TrainingArguments(
    report_to=[],  # Disable reporting
    per_device_eval_batch_size=32,
    output_dir='./results',
)

# Build a Trainer around the loaded model and the tokenized evaluation set
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=dataset,
    args=training_args,
    model=model,
)

# Run the evaluation and print the resulting metrics
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)