Please run this notebook in Google Colaboratory.

Please upload your labelled testing CSV to this folder. The folder must contain exactly one CSV file, otherwise the notebook stops with an error:

https://drive.google.com/drive/folders/1LISkmkab8S7DBECTFb65DqD3a3_GSeI8?usp=sharing

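Please note that the table should contain a `cleaned_text` column (the review text) and a `classification` column (the label, given as an integer `0` or `1`, where `1` is the "Relevant" class). Any other columns are ignored.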

|...|cleaned_text|classification|...|
|:---:|:---:|:---:|:---:|
|$\vdots$|review 1|label 1|$\vdots$|
|$\vdots$|$\vdots$|$\vdots$|$\vdots$|


# Preamble

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)
import pandas as pd
import numpy as np
import gdown
import glob
import os
import shutil

# Prefer the GPU when the runtime exposes one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# This Google Drive folder contains our model and tokenizer; it is downloaded
# into ./model.
model_folder_id = '1mfbSHZ8pVC4WOvWPCDma-8n4A25dtty2'
model_files = gdown.download_folder(
    id=model_folder_id,
    output="model",
    quiet=False,
    use_cookies=False,
)

# gdown documents a None return (rather than an exception) when a folder
# download fails, so stop here with a clear message instead of failing later
# when the tokenizer/model are loaded from an incomplete directory.
if not model_files:
    raise RuntimeError(
        f"Could not download the model folder '{model_folder_id}' from Google Drive"
    )

# Keep the list of downloaded paths as the cell output.
model_files

In [None]:
# This Google Drive folder should contain your testing files; it is downloaded
# into ./test.
test_folder_id = '1LISkmkab8S7DBECTFb65DqD3a3_GSeI8'

# Start from an empty folder. The next cell renames the downloaded CSV to
# reviews.csv, so re-running this download without clearing first would leave
# both the renamed copy and the fresh download behind and trip the
# "Expected 1 CSV" check.
shutil.rmtree("test", ignore_errors=True)

test_files = gdown.download_folder(
    id=test_folder_id,
    output="test",
    quiet=False,
    use_cookies=False,
)

# gdown documents a None return (rather than an exception) when a folder
# download fails; surface that immediately.
if test_files is None:
    raise RuntimeError(
        f"Could not download the test folder '{test_folder_id}' from Google Drive"
    )

# Keep the list of downloaded paths as the cell output.
test_files

In [None]:
# Give the single uploaded CSV a fixed name so later cells can load it
# without knowing what the file was originally called.
downloaded_folder = "test"
new_name = os.path.join(downloaded_folder, "reviews.csv")

# Exactly one CSV is expected in the downloaded folder; anything else is an error.
csv_files = glob.glob(os.path.join(downloaded_folder, "*.csv"))
if not len(csv_files) == 1:
    raise ValueError(f"Expected 1 CSV in '{downloaded_folder}', found {len(csv_files)}")

(csv_file_path,) = csv_files
os.rename(csv_file_path, new_name)

print(f"CSV downloaded and renamed to: {new_name}")

# Data Preparation

In [None]:
# Load the labelled reviews. The path is relative so it points at the same
# ./test folder the download cell wrote to (on Colab this is /content/test).
df = pd.read_csv(os.path.join('test', 'reviews.csv'))

# Fail early with a readable message if the required columns are absent.
required_columns = ['cleaned_text', 'classification']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"CSV is missing required column(s): {missing_columns}")

# Rows without text or without a label cannot be scored: the tokenizer only
# accepts strings and the metrics cannot handle NaN labels.
df_eval = df.dropna(subset=required_columns)
n_dropped = len(df) - len(df_eval)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing text or label")
if df_eval.empty:
    raise ValueError("No labelled rows left to evaluate")

# Extract text and labels
texts = df_eval['cleaned_text'].astype(str).tolist()
labels = df_eval['classification'].astype(int).tolist()  # expects 0/1 integer labels

# Load tokenizer from the downloaded model folder. The absolute path resolves
# to /content/model on Colab and is reused when the model itself is loaded.
save_directory = os.path.abspath('model')
tokenizer = BertTokenizer.from_pretrained(save_directory)

# Tokenize the dataset by calling the tokenizer directly; this is the
# documented replacement for the deprecated batch_encode_plus.
encoded_dict = tokenizer(
    texts,
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    max_length=128,               # Pad and truncate all reviews
    padding='max_length',         # Pad to the max_length
    truncation=True,              # Truncate sequences to max_length
    return_attention_mask=True,   # Return attention mask
    return_tensors='pt',          # Return PyTorch tensors
)

# Convert labels to an integer tensor
labels_tensor = torch.tensor(labels, dtype=torch.long)

# Wrap everything into a TensorDataset and DataLoader. The default
# shuffle=False keeps predictions in the same order as the CSV rows.
dataset = TensorDataset(encoded_dict['input_ids'], encoded_dict['attention_mask'], labels_tensor)
dataloader = DataLoader(dataset, batch_size=32)  # adjust batch size as needed


# Model

In [None]:
# Restore the classifier saved in the same directory as the tokenizer
model = BertForSequenceClassification.from_pretrained(save_directory)

# Put the model on the selected device and switch it to evaluation mode
model.to(device)
model.eval()

# Per-example accumulators: true label, predicted label, P(class 1)
y_true, y_pred, y_prob = [], [], []

# Run inference batch by batch without tracking gradients
with torch.no_grad():
    for batch in dataloader:
        # Each batch is (input_ids, attention_mask, labels), as built by the TensorDataset
        input_ids, attention_mask, labels_batch = (t.to(device) for t in batch)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Class probabilities via softmax; prediction is the most likely class
        probs = logits.softmax(dim=1)
        preds = probs.argmax(dim=1)

        y_true.extend(labels_batch.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())
        y_prob.extend(probs[:, 1].cpu().numpy())  # prob of class "Relevant" (index 1)

# Evaluation

In [None]:
# Basic metrics. zero_division=0 reports 0 instead of warning when a metric's
# denominator is empty (e.g. no positive predictions).
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred, zero_division=0))
print("Recall:", recall_score(y_true, y_pred, zero_division=0))
print("F1:", f1_score(y_true, y_pred, zero_division=0))

# ROC AUC is undefined when the test set contains a single class. Guard it so
# that case does not abort the cell before the report and confusion matrix
# below are printed.
if len(set(y_true)) > 1:
    print("ROC AUC:", roc_auc_score(y_true, y_prob))
else:
    print("ROC AUC: undefined (only one class present in y_true)")

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, zero_division=0))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))