In [1]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set the environment variable for CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the dataset from the CSV file
dataset = load_dataset(
    'csv',
    data_files='datasethumanattribution.csv',
    delimiter=',',
    column_names=[
        "folder","filename","claim","label","url","premise","category","gemini_label","gemini_explanation","gpt4_label","gpt4_rationale","split"
    ],
    skiprows=1
)

# Keep only the required columns
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset['train'].column_names
columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

# Define label mappings
label2id = {
    "contradiction": 0,
    "entailment": 1,
    "neutral": 2
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping.get(example['label'], 'neutral')
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)

# Load pre-trained tokenizer and model
model_name = "pritamdeka/PubMedBERT-MNLI-MedNLI"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the entire dataset
class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name='pritamdeka/PubMedBERT-MNLI-MedNLI'):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Extracting claim and premise texts
        claim = str(item['claim']) if item['claim'] is not None else ""
        premise = str(item['premise']) if item['premise'] is not None else ""
        
        # Tokenize the texts
        inputs = self.tokenizer(
            premise, claim,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=512,
            add_special_tokens=True
        )
        
        # Remove batch dimension and add to item
        item['input_ids'] = inputs['input_ids'].squeeze(0)
        item['attention_mask'] = inputs['attention_mask'].squeeze(0)
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item

# Convert entire dataset to MediClaimDataset
full_dataset = MediClaimDataset(dataset['train'], tokenizer_name=model_name)

# Create DataLoader for the full dataset
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=8, collate_fn=lambda x: x)

# Perform inference
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in full_loader:
        # Ensure no None values in batch
        batch = [b for b in batch if b is not None]
        input_ids = torch.stack([b['input_ids'] for b in batch]).to(device)
        attention_mask = torch.stack([b['attention_mask'] for b in batch]).to(device)
        labels = torch.stack([b['labels'] for b in batch]).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)

metrics = {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall
}

# Map predicted labels back to their original labels
predicted_labels = [id2label[label] for label in all_predictions]

# Print the predictions and metrics
print("Metrics:", metrics)


  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-6274d2e8b2aaf27b
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 358.79it/s]
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-11cf639ca5c8f9ee.arrow


Metrics: {'accuracy': 0.7080103359173127, 'f1': 0.6783103361821334, 'precision': 0.6621122317069292, 'recall': 0.7080103359173127}


In [3]:
import os
import torch
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set the environment variable for CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the dataset from the CSV file
dataset = load_dataset(
    'csv',
    data_files='datasethumanattribution.csv',
    delimiter=',',
    column_names=[
        "folder","filename","claim","label","url","premise","category","gemini_label","gemini_explanation","gpt4_label","gpt4_rationale","split"
    ],
    skiprows=1
)

# Keep only the required columns
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset['train'].column_names
columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

# Define label mappings
label2id = {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping.get(example['label'], 'neutral')
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)

# Load pre-trained tokenizer and model
model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the entire dataset
class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Extracting claim and premise texts
        claim = str(item['claim']) if item['claim'] is not None else ""
        premise = str(item['premise']) if item['premise'] is not None else ""
        
        # Tokenize the texts
        inputs = self.tokenizer(
            premise, claim,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=512,
            add_special_tokens=True
        )
        
        # Remove batch dimension and add to item
        item['input_ids'] = inputs['input_ids'].squeeze(0)
        item['attention_mask'] = inputs['attention_mask'].squeeze(0)
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item

# Convert entire dataset to MediClaimDataset
full_dataset = MediClaimDataset(dataset['train'], tokenizer_name=model_name)

# Create DataLoader for the full dataset
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=8, collate_fn=lambda x: x)

# Perform inference
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in full_loader:
        # Ensure no None values in batch
        batch = [b for b in batch if b is not None]
        input_ids = torch.stack([b['input_ids'] for b in batch]).to(device)
        attention_mask = torch.stack([b['attention_mask'] for b in batch]).to(device)
        labels = torch.stack([b['labels'] for b in batch]).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)

metrics = {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall
}

# Map predicted labels back to their original labels
predicted_labels = [id2label[label] for label in all_predictions]

# Print the predictions and metrics
print("Metrics:", metrics)


Using custom data configuration default-6274d2e8b2aaf27b
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 293.06it/s]
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-c5e1f9bf51ce625f.arrow


Metrics: {'accuracy': 0.5917312661498708, 'f1': 0.6445585498615652, 'precision': 0.736051296774811, 'recall': 0.5917312661498708}


In [6]:
import os
import torch
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set the environment variable for CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the dataset from the CSV file
dataset = load_dataset(
    'csv',
    data_files='datasethumanattribution.csv',
    delimiter=',',
    column_names=[
        "folder","filename","claim","label","url","premise","category","gemini_label","gemini_explanation","gpt4_label","gpt4_rationale","split"
    ],
    skiprows=1
)

# Keep only the required columns
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset['train'].column_names
columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

# Define label mappings
label2id = {
    "contradiction": 1,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping.get(example['label'], 'neutral')
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)

# Load pre-trained tokenizer and model
model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the entire dataset
class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Extracting claim and premise texts
        claim = str(item['claim']) if item['claim'] is not None else ""
        premise = str(item['premise']) if item['premise'] is not None else ""
        
        # Tokenize the texts
        inputs = self.tokenizer(
            premise, claim,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=512,
            add_special_tokens=True
        )
        
        # Remove batch dimension and add to item
        item['input_ids'] = inputs['input_ids'].squeeze(0)
        item['attention_mask'] = inputs['attention_mask'].squeeze(0)
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item

# Convert entire dataset to MediClaimDataset
full_dataset = MediClaimDataset(dataset['train'], tokenizer_name=model_name)

# Create DataLoader for the full dataset
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=8, collate_fn=lambda x: x)

# Perform inference
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in full_loader:
        # Ensure no None values in batch
        batch = [b for b in batch if b is not None]
        input_ids = torch.stack([b['input_ids'] for b in batch]).to(device)
        attention_mask = torch.stack([b['attention_mask'] for b in batch]).to(device)
        labels = torch.stack([b['labels'] for b in batch]).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)

metrics = {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall}

# Map predicted labels back to their original labels
predicted_labels = [id2label[label] for label in all_predictions]

# Print the predictions and metrics
print("Metrics:", metrics)


Using custom data configuration default-6274d2e8b2aaf27b
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 303.63it/s]
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-605dfeae61287f0b.arrow


Metrics: {'accuracy': 0.6834625322997416, 'f1': 0.6904926312050433, 'precision': 0.7498243660156471, 'recall': 0.6834625322997416}


In [7]:
import os
import torch
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set the environment variable for CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the dataset from the CSV file
dataset = load_dataset(
    'csv',
    data_files='datasethumanattribution.csv',
    delimiter=',',
    column_names=[
        "folder","filename","claim","label","url","premise","category","gemini_label","gemini_explanation","gpt4_label","gpt4_rationale","split"
    ],
    skiprows=1
)

# Keep only the required columns
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset['train'].column_names
columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

# Define label mappings
label2id = {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping.get(example['label'], 'neutral')
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)

# Load pre-trained tokenizer and model
model_name = 'bioformers/bioformer-8L-mnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the entire dataset
class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Extracting claim and premise texts
        claim = str(item['claim']) if item['claim'] is not None else ""
        premise = str(item['premise']) if item['premise'] is not None else ""
        
        # Tokenize the texts
        inputs = self.tokenizer(
            premise, claim,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=512,
            add_special_tokens=True
        )
        
        # Remove batch dimension and add to item
        item['input_ids'] = inputs['input_ids'].squeeze(0)
        item['attention_mask'] = inputs['attention_mask'].squeeze(0)
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item

# Convert entire dataset to MediClaimDataset
full_dataset = MediClaimDataset(dataset['train'], tokenizer_name=model_name)

# Create DataLoader for the full dataset
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=8, collate_fn=lambda x: x)

# Perform inference
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in full_loader:
        # Ensure no None values in batch
        batch = [b for b in batch if b is not None]
        input_ids = torch.stack([b['input_ids'] for b in batch]).to(device)
        attention_mask = torch.stack([b['attention_mask'] for b in batch]).to(device)
        labels = torch.stack([b['labels'] for b in batch]).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)

metrics = {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall}

# Map predicted labels back to their original labels
predicted_labels = [id2label[label] for label in all_predictions]

# Print the predictions and metrics
print("Metrics:", metrics)


Using custom data configuration default-6274d2e8b2aaf27b
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 336.19it/s]
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-c5e1f9bf51ce625f.arrow


Metrics: {'accuracy': 0.5813953488372093, 'f1': 0.6081182747392223, 'precision': 0.6847642957212506, 'recall': 0.5813953488372093}


In [11]:
import os
import torch
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set the environment variable for CUDA
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the dataset from the CSV file
dataset = load_dataset(
    'csv',
    data_files='datasethumanattribution.csv',
    delimiter=',',
    column_names=[
        "folder","filename","claim","label","url","premise","category","gemini_label","gemini_explanation","gpt4_label","gpt4_rationale","split"
    ],
    skiprows=1
)

# Keep only the required columns
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset['train'].column_names
columns_to_drop = [col for col in all_columns if col not in columns_to_keep]

for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

# Define label mappings
label2id = {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping.get(example['label'], 'neutral')
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)

# Load pre-trained tokenizer and model
model_name = 'howey/electra-large-mnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the entire dataset
class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        item = self.dataset[idx]
        
        # Extracting claim and premise texts
        claim = str(item['claim']) if item['claim'] is not None else ""
        premise = str(item['premise']) if item['premise'] is not None else ""
        
        # Tokenize the texts
        inputs = self.tokenizer(
            premise, claim,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=512,
            add_special_tokens=True
        )
        
        # Remove batch dimension and add to item
        item['input_ids'] = inputs['input_ids'].squeeze(0)
        item['attention_mask'] = inputs['attention_mask'].squeeze(0)
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item

# Convert entire dataset to MediClaimDataset
full_dataset = MediClaimDataset(dataset['train'], tokenizer_name=model_name)

# Create DataLoader for the full dataset
full_loader = torch.utils.data.DataLoader(full_dataset, batch_size=8, collate_fn=lambda x: x)

# Perform inference
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in full_loader:
        # Ensure no None values in batch
        batch = [b for b in batch if b is not None]
        input_ids = torch.stack([b['input_ids'] for b in batch]).to(device)
        attention_mask = torch.stack([b['attention_mask'] for b in batch]).to(device)
        labels = torch.stack([b['labels'] for b in batch]).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)

metrics = {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall
}

# Map predicted labels back to their original labels
predicted_labels = [id2label[label] for label in all_predictions]


print("Metrics:", metrics)


Using custom data configuration default-6274d2e8b2aaf27b
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 312.84it/s]
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-6274d2e8b2aaf27b/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-c5e1f9bf51ce625f.arrow
Downloading: 100%|██████████| 302/302 [00:00<00:00, 665kB/s]
Downloading: 100%|██████████| 968/968 [00:00<00:00, 2.24MB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 1.12MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 1.52MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 253kB/s]
Downloading: 100%|██████████| 1.25G/1.25G [02:32<00:00, 8.77MB/s] 


Metrics: {'accuracy': 0.5387596899224806, 'f1': 0.6097898063384216, 'precision': 0.7969209362741386, 'recall': 0.5387596899224806}
