## Preamble

In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from tqdm.notebook import tqdm

# Path to the labelled training data (CSV). Change this to point at your own file.
DATA_PATH = 'test_final.csv'

# Columns the rest of the notebook reads: the review text and its class label.
REQUIRED_COLUMNS = ('cleaned_text', 'classification')

df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message if the file does not have the expected schema,
# instead of surfacing a KeyError (or a tokenizer error) several cells later.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing required column(s): {missing_columns}")
if df[list(REQUIRED_COLUMNS)].isna().any().any():
    raise ValueError(
        f"{DATA_PATH} has missing values in {list(REQUIRED_COLUMNS)}; clean the data first."
    )

The input CSV must already be cleaned and must contain at least the columns `cleaned_text` (the review text) and `classification` (the integer class label: `0` = irrelevant, `1` = relevant):

|...|cleaned_text|classification|...|
|:---:|:---:|:---:|:---:|
|$\vdots$|review 1|label 1|$\vdots$|
|$\vdots$|$\vdots$|$\vdots$|$\vdots$|

## Code

In [None]:
# Fetch the tokenizer (vocabulary + preprocessing) that matches the
# 'bert-base-uncased' checkpoint used for the model further down.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

def tokenize_reviews(reviews, tokenizer, max_length=128):
    """
    Tokenize a batch of reviews into fixed-length BERT inputs.

    Args:
        reviews (str | iterable of str): The review texts to encode. A single
            string is treated as a batch containing one review.
        tokenizer: A Hugging Face tokenizer (e.g. ``BertTokenizer``). It is
            called directly on the batch.
        max_length (int): Every sequence is padded or truncated to exactly
            this many tokens (special tokens included).

    Returns:
        The tokenizer's batch encoding, with PyTorch tensors under
        ``input_ids`` and ``attention_mask`` (plus any other fields the
        tokenizer emits by default, such as ``token_type_ids`` for BERT).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(reviews, str):
        reviews = [reviews]
    else:
        # Materialise any iterable (tuple, pandas Series, generator) as a list,
        # which is the batch type the tokenizer accepts.
        reviews = list(reviews)

    # Calling the tokenizer directly is the supported batch API;
    # `batch_encode_plus` is deprecated in favour of `__call__`.
    return tokenizer(
        reviews,
        add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
        max_length=max_length,        # Target length for padding/truncation
        padding='max_length',         # Pad short sequences up to max_length
        truncation=True,              # Cut long sequences down to max_length
        return_attention_mask=True,   # Mask distinguishes real tokens from padding
        return_tensors='pt',          # Return PyTorch tensors
    )

# Pull the raw review strings out of the DataFrame
reviews = df['cleaned_text'].tolist()

# Encode every review into fixed-length BERT inputs
encoded_data = tokenize_reviews(reviews, tokenizer)

# Unpack the pieces needed for training: token ids, padding mask, and the
# class labels (kept as a NumPy array for the train/validation split).
input_ids, attention_mask = encoded_data['input_ids'], encoded_data['attention_mask']
labels = df['classification'].values

# Sanity check: both tensors should be (number of reviews, max_length)
print("Input IDs shape:", input_ids.shape)
print("Attention Mask shape:", attention_mask.shape)

In [None]:
# Split ids, masks and labels in ONE call so the three stay row-aligned by
# construction, rather than relying on two separate calls sharing a random_state.
# The classes are imbalanced (class weights are applied to the loss later), so
# stratify on the labels to keep the same class ratio in both splits.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids, attention_mask, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Make sure every split is a PyTorch tensor.
# The ids/masks come from a tokenizer that already returned 'pt' tensors, so
# they may already be tensors here; `torch.tensor(existing_tensor)` would make
# a needless copy and emit a UserWarning. `torch.as_tensor` accepts both
# tensors and NumPy arrays and only converts when it has to.
train_inputs_tensors = torch.as_tensor(train_inputs)
validation_inputs_tensors = torch.as_tensor(validation_inputs)

# CrossEntropyLoss expects integer class indices, hence dtype=torch.long.
train_labels_tensors = torch.as_tensor(train_labels, dtype=torch.long)
validation_labels_tensors = torch.as_tensor(validation_labels, dtype=torch.long)

train_masks_tensors = torch.as_tensor(train_masks)
validation_masks_tensors = torch.as_tensor(validation_masks)

# Bundle (input ids, attention mask, label) per example; the training loop
# relies on this exact field order when unpacking each batch.
train_dataset = TensorDataset(train_inputs_tensors, train_masks_tensors, train_labels_tensors)
validation_dataset = TensorDataset(validation_inputs_tensors, validation_masks_tensors, validation_labels_tensors)

In [None]:
# Number of examples per optimisation step (shared by both loaders)
batch_size = 32

# Training loader: examples are drawn in random order each epoch
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation loader: examples are visited in their stored order
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(validation_dataset),
)

In [None]:


# Number of target classes for the classification head.
# Two classes here (label 0 and label 1), so num_labels=2.
num_labels = 2

# Prefer the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-trained BERT encoder with a sequence-classification head on top.
# Attention maps and hidden states are not needed, so they are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=num_labels,
)

# Move the model's parameters onto the selected device
model.to(device)

print(f"Model loaded and sent to {device}")

In [None]:
# Training hyperparameters
epochs = 3             # full passes over the training set
learning_rate = 2e-5   # AdamW step size
epsilon = 1e-8         # AdamW numerical-stability term

# Optimise every parameter of the model with AdamW
optimizer = AdamW(model.parameters(), eps=epsilon, lr=learning_rate)

# One scheduler step is taken per batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps in total.
total_steps = epochs * len(train_dataloader)

# Linear decay of the learning rate down to zero, with no warm-up phase
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [None]:
# Count the classes in the actual training labels instead of hard-coding them,
# so the weights always match the data being trained on.
# Label 0 = irrelevant, label 1 = relevant.
class_counts = torch.bincount(train_labels_tensors, minlength=2)
if len(class_counts) > 2:
    raise ValueError(
        f"Expected binary labels (0/1) but found label values up to {len(class_counts) - 1}."
    )
num_irrelevant, num_relevant = (int(count) for count in class_counts)

# A class with no training examples would give an infinite weight.
if num_irrelevant == 0 or num_relevant == 0:
    raise ValueError(
        "Both classes must appear in the training labels "
        f"(irrelevant={num_irrelevant}, relevant={num_relevant})."
    )

# Compute weights: total / (num_classes * class_count),
# i.e. the rarer class gets the proportionally larger weight.
weight_relevant = (num_relevant + num_irrelevant) / (2 * num_relevant)
weight_irrelevant = (num_relevant + num_irrelevant) / (2 * num_irrelevant)

# Index i of the weight tensor applies to class label i: [irrelevant, relevant].
class_weights = torch.tensor([weight_irrelevant, weight_relevant]).to(device)
print(class_weights)

In [None]:
# Class-weighted cross-entropy: errors on each class are scaled by the
# corresponding entry of `class_weights`.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

In [None]:
# Fine-tuning loop: one training pass and one validation pass per epoch.
#
# Batch tensors use `b_*` names on purpose. The dataset-level variables
# `input_ids`, `attention_mask` and `labels` are defined in an earlier cell;
# reusing those names here would overwrite them with the last batch and break
# any re-run of the earlier cells.
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('Training...')

    model.train()
    total_train_loss = 0

    # Iterate over batches of data
    for batch in tqdm(train_dataloader):
        # Each batch is (input ids, attention mask, labels); move it to the device
        b_input_ids, b_attention_mask, b_labels = (t.to(device) for t in batch)

        # Clear any previously calculated gradients
        model.zero_grad()

        # Forward pass. Labels are NOT passed to the model: its built-in loss
        # is unweighted, and the class-weighted `loss_fn` is used instead.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attention_mask,
        )

        # Calculate the weighted loss and backpropagate
        loss = loss_fn(outputs.logits, b_labels)
        total_train_loss += loss.item()
        loss.backward()

        # Clip the norm of the gradients to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update model parameters, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss:.4f}')

    # Validation step
    print('\nValidating...')
    model.eval()
    total_correct = 0

    for batch in validation_dataloader:
        b_input_ids, b_attention_mask, b_labels = (t.to(device) for t in batch)

        # No gradients are needed for evaluation; only the logits are used,
        # so labels are not passed and no loss is computed.
        with torch.no_grad():
            outputs = model(
                input_ids=b_input_ids,
                attention_mask=b_attention_mask,
            )

        # Predicted class = index of the largest logit
        predictions = torch.argmax(outputs.logits, dim=1)
        total_correct += (predictions == b_labels).sum().item()

    # Divide by the number of examples (not batches): the last batch may be smaller
    avg_val_accuracy = total_correct / len(validation_dataset)
    print(f'Validation Accuracy: {avg_val_accuracy:.4f}')

print('\nTraining complete!')

In [None]:
# Directory that receives both the fine-tuned model and the tokenizer files
save_directory = "model"

# Write the model weights and configuration
model.save_pretrained(save_directory)

# Write the tokenizer next to the model so the two can be reloaded together
# (this must be the same tokenizer that was used for training)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")