NOTE: Update the `model_paths` list below to match your system.

These are absolute paths from the author's machine. 
If you're running this on your own setup or downloaded models from the GitHub repo, make sure you point to the correct local directories.

You can use the following:

import os

base_model_dir = "saved_models"
model_paths = [
    os.path.join(base_model_dir, "finetuned_0/checkpoint-357"),
    os.path.join(base_model_dir, "finetuned_1/checkpoint-612"),
    os.path.join(base_model_dir, "finetuned_2/checkpoint-357"),
    os.path.join(base_model_dir, "finetuned_3/checkpoint-51"),
    os.path.join(base_model_dir, "finetuned_4/checkpoint-357"),
    os.path.join(base_model_dir, "finetuned_5/checkpoint-306"),
    os.path.join(base_model_dir, "finetuned_6/checkpoint-663"),
    os.path.join(base_model_dir, "finetuned_7/checkpoint-1020"),
    os.path.join(base_model_dir, "finetuned_1/checkpoint-663"),
]

In [2]:
import torch
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)

############################
# 1) DEFINE MODEL PATHS
############################
model_paths = [
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_0/checkpoint-357",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_1/checkpoint-612/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_2/checkpoint-357/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_3/checkpoint-51/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_4/checkpoint-357/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_5/checkpoint-306/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_6/checkpoint-663/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_7/checkpoint-1020/",
    "/home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_8/checkpoint-663/"]

device = "cuda" if torch.cuda.is_available() else "cpu"

############################
# 2) LOAD DATASET ONCE
############################
df = pd.read_csv("TR_OPETs.csv")  # or TR_OPETs, etc.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  
print("Dataset loaded with", len(df), "samples.")

texts = df["text"].tolist()
labels = df["label"].tolist()

############################
# 3) PREPARE TO STORE METRICS
############################
acc_scores = []
f1_scores = []
prec_scores = []
rec_scores = []

############################
# 4) TOKENIZER (FROM BASE)
############################
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

############################
# 5) LOOP OVER MODEL PATHS
############################
for model_path in model_paths:
    print(f"\n===== Loading fine-tuned model from: {model_path} =====")

    # Load the fine-tuned XLM-R model
    model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
    model.eval()
    model.to(device)

    ########################################
    # Generate embeddings for each example
    ########################################
    feature_list = []
    for text in tqdm(texts, desc="Extracting embeddings", leave=False):
        inputs = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
            max_length=512
        ).to(device)

        with torch.no_grad():
            # We'll extract the base roberta embeddings
            # (like your approach: model.roberta(**inputs))
            outputs = model.roberta(**inputs)  
            last_hidden_state = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
            # Mean pooling
            embedding = last_hidden_state.mean(dim=1).squeeze()
        
        feature_list.append(embedding.cpu().numpy())

    X = np.array(feature_list, dtype=np.float32)
    y = np.array(labels)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Logistic Regression training
    lr_clf = LogisticRegression(random_state=42, solver='liblinear')
    lr_clf.fit(X_train, y_train)

    # Predictions
    y_pred = lr_clf.predict(X_test)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred, average='macro')
    prec = precision_score(y_test, y_pred, average='macro')
    rec  = recall_score(y_test, y_pred, average='macro')

    print(f"Model: {model_path}")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")

    # Store results
    acc_scores.append(acc)
    f1_scores.append(f1)
    prec_scores.append(prec)
    rec_scores.append(rec)

############################
# 6) PRINT AVERAGE METRICS
############################
print("\n===== Final Averages Across All Models =====")
avg_acc  = np.mean(acc_scores)
avg_f1   = np.mean(f1_scores)
avg_prec = np.mean(prec_scores)
avg_rec  = np.mean(rec_scores)

print(f"Avg Accuracy:  {avg_acc:.4f}")
print(f"Avg F1 Score:  {avg_f1:.4f}")
print(f"Avg Precision: {avg_prec:.4f}")
print(f"Avg Recall:    {avg_rec:.4f}")

2025-04-01 16:44:42.926696: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-01 16:44:43.547486: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-01 16:44:43.773020: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-01 16:44:49.577296: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

Dataset loaded with 1130 samples.

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_0/checkpoint-357 =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_0/checkpoint-357
  Accuracy:  0.8053
  F1 Score:  0.8052
  Precision: 0.8068
  Recall:    0.8056

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_1/checkpoint-612/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_1/checkpoint-612/
  Accuracy:  0.7788
  F1 Score:  0.7787
  Precision: 0.7787
  Recall:    0.7787

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_2/checkpoint-357/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_2/checkpoint-357/
  Accuracy:  0.7788
  F1 Score:  0.7787
  Precision: 0.7795
  Recall:    0.7790

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_3/checkpoint-51/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_3/checkpoint-51/
  Accuracy:  0.6947
  F1 Score:  0.6947
  Precision: 0.6947
  Recall:    0.6947

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_4/checkpoint-357/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_4/checkpoint-357/
  Accuracy:  0.7876
  F1 Score:  0.7876
  Precision: 0.7877
  Recall:    0.7877

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_5/checkpoint-306/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_5/checkpoint-306/
  Accuracy:  0.7699
  F1 Score:  0.7698
  Precision: 0.7706
  Recall:    0.7701

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_6/checkpoint-663/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_6/checkpoint-663/
  Accuracy:  0.7699
  F1 Score:  0.7695
  Precision: 0.7729
  Recall:    0.7704

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_7/checkpoint-1020/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_7/checkpoint-1020/
  Accuracy:  0.7965
  F1 Score:  0.7965
  Precision: 0.7965
  Recall:    0.7965

===== Loading fine-tuned model from: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_8/checkpoint-663/ =====


                                                                          

Model: /home/biyikh/Euphemisms/OPET_NOPET_experiment/tr_train_xlmr_NOPETs_splits/saved_models/finetuned_8/checkpoint-663/
  Accuracy:  0.8053
  F1 Score:  0.8053
  Precision: 0.8056
  Recall:    0.8055

===== Final Averages Across All Models =====
Avg Accuracy:  0.7763
Avg F1 Score:  0.7762
Avg Precision: 0.7770
Avg Recall:    0.7765
