Please run this notebook in Google Colaboratory.

Please upload your unlabelled csv to this folder:

https://drive.google.com/drive/folders/1LISkmkab8S7DBECTFb65DqD3a3_GSeI8?usp=sharing

Please note that the table should contain a `cleaned_text` column.

|...|cleaned_text|...|
|:---:|:---:|:---:|
|$\vdots$|review 1|$\vdots$|
|$\vdots$|$\vdots$|$\vdots$|

This script will save a labelled CSV named `labeled.csv`.

# Preamble

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)
import pandas as pd
import numpy as np
import gdown
import glob
import os
import shutil

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
# This Google Drive folder contains our model and tokenizer.
model_folder_id = '1mfbSHZ8pVC4WOvWPCDma-8n4A25dtty2'

model_files = gdown.download_folder(
    id=model_folder_id,
    output="model",
    quiet=False,
    use_cookies=False,
)

# gdown signals a failed folder download by returning None rather than
# raising, so stop here with a clear message instead of failing later when
# the tokenizer/model cannot be loaded.
if model_files is None:
    raise RuntimeError(
        f"Could not download the model folder '{model_folder_id}' from Google Drive."
    )

# Keep the list of downloaded paths as the cell output.
model_files

In [None]:
# This Google Drive folder should contain a single CSV with all the reviews
# you want to label.
test_folder_id = '1LISkmkab8S7DBECTFb65DqD3a3_GSeI8'

# Start from an empty download directory. Without this, re-running the cell
# leaves the previously renamed reviews.csv next to the freshly downloaded
# file, and the "exactly one CSV" check in the following cell fails.
shutil.rmtree("test", ignore_errors=True)

test_files = gdown.download_folder(
    id=test_folder_id,
    output="test",
    quiet=False,
    use_cookies=False,
)

# gdown signals a failed folder download by returning None rather than
# raising, so surface the failure immediately.
if test_files is None:
    raise RuntimeError(
        f"Could not download the reviews folder '{test_folder_id}' from Google Drive."
    )

# Keep the list of downloaded paths as the cell output.
test_files

In [None]:
downloaded_folder = "test"

# Gather every CSV sitting directly inside the download folder.
csv_pattern = os.path.join(downloaded_folder, "*.csv")
csv_files = glob.glob(csv_pattern)

# Exactly one CSV is expected; anything else is ambiguous, so refuse to go on.
if len(csv_files) != 1:
    raise ValueError(f"Expected 1 CSV in '{downloaded_folder}', found {len(csv_files)}")

# Give the single CSV a fixed name so later cells can use a known path.
(csv_file_path,) = csv_files
new_name = os.path.join(downloaded_folder, "reviews.csv")
os.rename(csv_file_path, new_name)

print(f"CSV downloaded and renamed to: {new_name}")

# Data Preparation

In [None]:
# Load the reviews that were downloaded and renamed to a fixed path.
df_test = pd.read_csv('/content/test/reviews.csv')

# The tokenisation step reads the `cleaned_text` column; fail here with an
# actionable message instead of a bare KeyError one cell later.
if 'cleaned_text' not in df_test.columns:
    raise ValueError(
        "The uploaded CSV must contain a 'cleaned_text' column; "
        f"found columns: {list(df_test.columns)}"
    )

display(df_test.head())

In [None]:
tokenizer = BertTokenizer.from_pretrained('/content/model/')

# pandas reads empty CSV cells as NaN (a float) and numeric-looking reviews
# as numbers, but the tokenizer only accepts str inputs. Normalise every
# entry to a string; missing reviews become the empty string.
review_texts = df_test['cleaned_text'].fillna('').astype(str).tolist()

# Pad to the longest review in the set and truncate anything over 128 tokens,
# returning PyTorch tensors ready to wrap in a dataset.
tokenized_inputs = tokenizer(
    review_texts,
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=128,
    return_attention_mask=True,
    return_tensors='pt'
)

# No shuffling, so batches come out in the same row order as df_test.
test_dataset = TensorDataset(tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'])
test_dataloader = DataLoader(test_dataset, batch_size=32)

print("Tokenizer loaded successfully.")
print("Text data tokenized and DataLoader created.")

# Load model.


In [None]:
# Load the sequence-classification weights stored in /content/model/ and
# place them on the selected device.
model = BertForSequenceClassification.from_pretrained('/content/model/').to(device)

# Evaluation mode: disables dropout and other training-only behaviour.
model.eval()

print("Model loaded successfully and moved to device.")

# Perform inference


In [None]:
# Per-batch logits are moved to the CPU as soon as they are produced, so the
# GPU only ever holds the current batch's outputs.
batch_logits = []

# Gradients are never needed for inference; disable them for the whole loop.
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs = model(
            input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        batch_logits.append(outputs.logits.cpu())

# Stack the batches back into one (num_reviews, num_classes) tensor, in the
# same order the DataLoader yielded them.
all_logits = torch.cat(batch_logits, dim=0)
print("Inference completed and logits collected.")

# Process predictions


In [None]:
# Normalise the logits across the class dimension, then take the index of
# the highest-scoring class for each row.
probabilities = all_logits.softmax(dim=1)
predicted_labels = probabilities.argmax(dim=1)

# NumPy conversion requires the tensor to live on the CPU.
predicted_labels_np = predicted_labels.to("cpu").numpy()

print("Softmax applied, predicted labels determined, and converted to NumPy array.")

# Save results


In [None]:
# Attach one predicted class id per review as a new column.
df_test['predicted_label'] = predicted_labels_np

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written.
output_csv = 'labeled.csv'
df_test.to_csv(output_csv, index=False)

print(f"Predicted labels added to DataFrame and saved to '{output_csv}'.")