In [60]:
import torch
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader

# Directory containing the CSV datasets (the evaluation cell reads `test.csv` from here).
data_dir = "../data"
# Root directory under which the adapter checkpoint lives (presumably written by the training run — confirm).
output_dir = "./output"
# Path of the LoRA adapter to evaluate; it is used both to read the adapter config and to load the adapter weights.
peft_model_id = f"{output_dir}/lora_epoch_3"

def clean_text_data(input: str) -> str:
  """
  Normalize the whitespace of a string.

  Leading and trailing whitespace is dropped and every internal run of
  whitespace (spaces, tabs, newlines, ...) is collapsed into one space.
  All other characters, including punctuation, are left untouched.
  """
  words = input.split()
  return ' '.join(words)


def load_and_prepare_dataset(data_file, tokenizer, batch_size, max_length=512, shuffle=False, max_samples=None):
  """
  Load a CSV dataset, tokenize its essays and wrap it in a DataLoader.

  The CSV must provide an 'essay' column (text) and a 'label' column.
  Batches produced by the returned loader contain 'input_ids',
  'attention_mask' and 'labels' as torch tensors.

  Args:
    data_file: Path to the CSV file to load.
    tokenizer: Tokenizer called on the list of cleaned essays.
    batch_size: Number of examples per batch.
    max_length: Length every essay is truncated/padded to (default 512).
    shuffle: Whether the DataLoader shuffles examples; leave False for
      evaluation, set True when the loader is used for training.
    max_samples: If given, keep only the first `max_samples` rows
      (handy for quick smoke runs). None keeps the whole file.

  Returns:
    A torch DataLoader over the tokenized dataset.
  """
  def tokenize_fn(examples):
    # Build a new list instead of overwriting examples['essay'] so the input
    # batch is left untouched. `essay or ''` guards against a blank CSV cell
    # arriving as None (assumed loader behavior — confirm for your data).
    essays = [clean_text_data(essay or '') for essay in examples['essay']]
    return tokenizer(essays, truncation=True, padding='max_length', max_length=max_length)

  # A single file is loaded, so work on that one split directly.
  dataset = load_dataset('csv', data_files={'train': data_file})['train']
  if max_samples is not None:
    dataset = dataset.select(range(min(max_samples, len(dataset))))

  tokenized = dataset.map(tokenize_fn, batched=True)
  # Rename first, then restrict the output format, so the format only ever
  # refers to column names that exist in the final dataset.
  tokenized = tokenized.rename_column('label', 'labels')
  tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
  return DataLoader(tokenized, batch_size=batch_size, shuffle=shuffle)


# Read the adapter config to find out which base checkpoint the adapter was trained on,
# then load that base model and its tokenizer.
peft_config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = peft_config.base_model_name_or_path
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Wrap the base model with the LoRA adapter and put it in evaluation mode.
# NOTE(review): loading the base checkpoint reports the classifier and pooler.dense
# weights as newly (randomly) initialized. Confirm that the adapter checkpoint
# restores all of them; any that it does not restore will differ from training.
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Loaded model")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model




In [65]:
# Smoke test: run a single dummy string through the adapted model and display the raw logits.
text = "test"
encoded = tokenizer([text], return_tensors="pt", padding="max_length", truncation=True, max_length=512)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Gradients are not needed for inference.
with torch.no_grad():
  outputs = model(**inputs)

logits = outputs.logits
logits

tensor([[0.0414, 0.3616]], device='cuda:0')

In [61]:
# Load the dataset and tokenize
eval_dataloader = load_and_prepare_dataset(f'{data_dir}/test.csv', tokenizer, batch_size=4)

# Number of most recent batches averaged for the accuracy shown on the progress bar,
# and how often (in batches) a sample of predictions vs. labels is printed.
SMOOTHING_WINDOW = 100
SAMPLE_EVERY = 100

# Initialize metrics storage
all_predictions = []
all_labels = []
smoothed_accuracy = []  # per-batch accuracies of the last SMOOTHING_WINDOW batches
eval_error = None       # set when the loop aborts, so partial metrics are flagged below

# Evaluation loop
progress_bar = tqdm(eval_dataloader, desc="Eval")

try:
  with torch.no_grad():
    for idx, batch in enumerate(progress_bar):
      # Move the whole batch (inputs and labels) to the model's device once.
      batch = {k: v.to(device) for k, v in batch.items()}
      labels = batch['labels']

      # Forward pass; the batch includes 'labels', and outputs.loss is read below.
      outputs = model(**batch)
      logits = outputs.logits

      # Predictions and labels
      predictions = torch.argmax(logits, dim=-1)
      predictions_np = predictions.cpu().numpy()
      labels_np = labels.cpu().numpy()
      all_predictions.extend(predictions_np)
      all_labels.extend(labels_np)

      if idx % SAMPLE_EVERY == 0:
        print(predictions_np)
        print(labels_np)
        print(predictions == labels)

      # Keep only the most recent SMOOTHING_WINDOW batch accuracies.
      batch_accuracy = (predictions == labels).float().mean().item()
      smoothed_accuracy.append(batch_accuracy)
      if len(smoothed_accuracy) > SMOOTHING_WINDOW:
        smoothed_accuracy.pop(0)

      # Refresh the bar on every batch, not only once the window has filled up.
      smooth_acc = np.mean(smoothed_accuracy)
      progress_bar.set_description(f"Loss: {outputs.loss.item():.4f}, Smoothed Acc: {smooth_acc:.4f}")

except Exception as e:
  # Best effort: keep whatever was collected, but remember that the run is incomplete.
  eval_error = e
  print(f"Error occurred during evaluation: {e}")
  torch.cuda.empty_cache()  # Free up VRAM if an error occurs

if eval_error is not None:
  print(f"WARNING: evaluation stopped early; metrics below cover only {len(all_labels)} examples")

if all_labels:
  # Compute evaluation metrics
  # NOTE(review): the recorded run scored ~0.0008 accuracy with predictions that are
  # the exact inverse of the labels. With two output classes this looks like a label
  # mapping mismatch between training and test.csv rather than a weak model — confirm.
  accuracy = accuracy_score(all_labels, all_predictions)
  precision = precision_score(all_labels, all_predictions, average='weighted')
  recall = recall_score(all_labels, all_predictions, average='weighted')
  f1 = f1_score(all_labels, all_predictions, average='weighted')

  # Print results
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-Score: {f1:.4f}")
else:
  # Nothing was evaluated (empty dataset or failure on the first batch).
  print("No predictions were collected; skipping metric computation.")

Eval:   1%|▌                                                                                                   | 4/649 [00:00<00:33, 19.18it/s]

[0 1 0 1]
[1 0 1 0]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.4054, Smoothed Acc: 0.0000:  16%|██████████▉                                                         | 104/649 [00:05<00:27, 19.48it/s]

[0 1 1 1]
[1 0 0 0]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.1435, Smoothed Acc: 0.0025:  31%|█████████████████████▎                                              | 204/649 [00:10<00:22, 19.43it/s]

[0 1 0 1]
[1 0 1 0]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.2575, Smoothed Acc: 0.0000:  47%|███████████████████████████████▊                                    | 304/649 [00:15<00:17, 19.42it/s]

[0 1 0 0]
[1 0 1 1]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.2026, Smoothed Acc: 0.0000:  62%|██████████████████████████████████████████▎                         | 404/649 [00:20<00:12, 19.37it/s]

[0 1 1 0]
[1 0 0 1]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.1433, Smoothed Acc: 0.0025:  78%|████████████████████████████████████████████████████▊               | 504/649 [00:25<00:07, 19.39it/s]

[0 1 0 0]
[1 0 1 1]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.1544, Smoothed Acc: 0.0000:  93%|███████████████████████████████████████████████████████████████▎    | 604/649 [00:31<00:02, 19.37it/s]

[1 1 0 0]
[0 0 1 1]
tensor([False, False, False, False], device='cuda:0')


Loss: 1.1638, Smoothed Acc: 0.0000: 100%|████████████████████████████████████████████████████████████████████| 649/649 [00:33<00:00, 19.44it/s]


Accuracy: 0.0008
Precision: 0.0008
Recall: 0.0008
F1-Score: 0.0008
