In [1]:
from datasets import load_dataset
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Subset
from transformers import EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold 
import numpy as np
from datasets import Dataset, DatasetDict, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('csv',data_files='dataset_sentenceattribution_nerfeatures_split.csv',delimiter=',',column_names=["claim","premise","label","category","count_bf","count_ca","count_dis","count_food","count_lipid","count_treat","pres_bf","pres_ca","pres_dis","pres_food","pres_lipid","pres_treat","counte_bf","counte_ca","counte_dis","counte_food","counte_lipid","counte_treat","prese_bf","prese_ca","prese_dis","prese_food","prese_lipid","prese_treat","url", "entities","entity_map","gem_exp","gem_label","gpt_label","gpt_exp","gold_exp","entity_map_ev","entity_ev","split"],skiprows=1)

Using custom data configuration default-cac7855410b939be
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 314.96it/s]


In [3]:
train_dataset = dataset['train'].filter(lambda example: example['split'] == 'train')
validation_dataset = dataset['train'].filter(lambda example: example['split'] == 'validation')
test_dataset = dataset['train'].filter(lambda example: example['split'] == 'test')
dataset = DatasetDict({
    'train': train_dataset,
    'val': validation_dataset,
    'test': test_dataset
})

Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-a932df0b74e1547a.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-926cef8097c290d1.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-2ec17003ffb0a917.arrow


In [4]:
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset["train"].column_names

columns_to_drop = [col for col in all_columns if col not in columns_to_keep]
for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 1623
    })
    val: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 465
    })
    test: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 234
    })
})

In [6]:
from datasets import load_dataset, DatasetDict

label2id = {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping[example['label']]
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)

# Show the label encoding mapping
print("Label Encoding Mapping:", label2id)

Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-25128b691fbb2a7a.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-3cf6de4ed2bf1a77.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-cac7855410b939be/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-69ecd84e19cd49e7.arrow


Label Encoding Mapping: {'contradiction': 2, 'entailment': 0, 'neutral': 1}


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 1623
    })
    val: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 465
    })
    test: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 234
    })
})

In [8]:
labels = np.array(dataset['train']['label'])

In [9]:
from transformers import AutoTokenizer
import torch.utils.data

class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name='sileod/deberta-v3-base-tasksource-nli'):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        idx = int(idx)  # Ensure idx is an integer
        item = self.dataset[idx]  # Access the dataset item at idx
        
        # Extracting claim and evidence texts

        claim = item['claim'].lower()
        evidences = item['premise'].lower()
        item['premise']=evidences
        item['claim']=claim
        # Tokenize the texts
        inputs = self.tokenizer(
            claim, evidences,
            return_tensors="pt",  # Ensure PyTorch tensors are returned
            padding='max_length',  # Apply padding to the maximum length
            truncation='longest_first',  # Truncate to the maximum length if necessary
            max_length=512,  # Specify the maximum length
            add_special_tokens=True  # Add special tokens like [CLS], [SEP]
        )
        
        item['input_ids'] = inputs['input_ids'].squeeze()  # Remove batch dimension
        item['attention_mask']= inputs['attention_mask'].squeeze() # Remove batch dimension
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item



In [10]:
import torch
print(torch.cuda.device_count())
print("Available GPUs:")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

1
Available GPUs:
GPU 0: Tesla V100-SXM2-32GB


In [11]:
model_name = "sileod/deberta-v3-base-tasksource-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                 num_labels=3, ignore_mismatched_sizes=True)
device = "cuda:0"
model.to(device)

Downloading: 100%|██████████| 1.25k/1.25k [00:00<00:00, 2.09MB/s]
Downloading: 100%|██████████| 2.35M/2.35M [00:00<00:00, 27.3MB/s]
Downloading: 100%|██████████| 23.0/23.0 [00:00<00:00, 66.2kB/s]
Downloading: 100%|██████████| 286/286 [00:00<00:00, 665kB/s]
Downloading: 100%|██████████| 18.1k/18.1k [00:00<00:00, 11.1MB/s]
Downloading: 100%|██████████| 704M/704M [00:22<00:00, 32.2MB/s] 


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 

In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average="weighted")  # Specify average method
    recall = recall_score(labels, preds, average="weighted")  # Specify average method

    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1}

In [13]:
dataset['train']

Dataset({
    features: ['claim', 'premise', 'label'],
    num_rows: 1623
})

In [14]:
import gc

torch.cuda.set_device(0)

# Clearing the cache
torch.cuda.empty_cache()
gc.collect()
# Checking GPU memory, making sure to reset peak memory stats
torch.cuda.reset_peak_memory_stats()

In [15]:
current_device = torch.cuda.current_device()
print(f"Current CUDA device: GPU {current_device}")

Current CUDA device: GPU 0


In [16]:
train_data = dataset['train']
eval_data = dataset['val']
model = model.to('cuda:0')

In [17]:
tdata = MediClaimDataset(train_data)
vdata = MediClaimDataset(eval_data)
test_data = MediClaimDataset(dataset['test'])

In [18]:
tdata.__getitem__(0)

{'claim': 'myrrh essential oil is sometimes used in skincare products to help improve the appearance of the skin.',
 'premise': 'additionally, laser doppler blood flow measurement showed that the frankincense and myrrh essential oil compound could promote the elimination of capillaries from skin epidermis to dermis by increasing skin blood flow.',
 'label': 0,
 'input_ids': tensor([    1, 98237,  1830,  1080,   269,  1359,   427,   267, 17847,   633,
           264,   408,  1300,   262,  2658,   265,   262,  1158,   260,     2,
          7229,   261,  5042,   333, 73325,  1452,  2155,  6116,  1938,   272,
           262, 88609,   263, 98237,  1830,  1080,  7355,   387,  2655,   262,
         12682,   265, 65008,   292,  1158, 53245,   264, 75840,   293,  2376,
          1158,  1452,  2155,   260,     2,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,

In [19]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments,DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


training_args = TrainingArguments(
    output_dir=f'/home/elson/3.1.1_deberta/',
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    fp16=True,
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tdata,
    eval_dataset=vdata,
    #tokenizer=tokenizer,
    #data_collator = data_collator,
    compute_metrics=compute_metrics
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)]
)

# Training and Evaluation
trainer.train()
eval_result = trainer.evaluate(vdata)

# Save the best model and tokenizer
model.save_pretrained(f'/home/elson/3.1.1_deberta/best_model')
tokenizer.save_pretrained(f'/home/elson/3.1.1_deberta/best_model')


Using amp half precision backend
***** Running training *****
  Num examples = 1623
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2040


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6462,0.737564,0.694624,0.667608,0.694624,0.679718
2,0.2757,1.232224,0.658065,0.72618,0.658065,0.685273
3,0.1754,1.929995,0.68172,0.702123,0.68172,0.690928
4,0.043,2.323722,0.658065,0.701921,0.658065,0.676505
5,0.1194,2.671245,0.649462,0.695906,0.649462,0.669599
6,0.0007,2.874915,0.675269,0.708821,0.675269,0.68968
7,0.0058,3.299494,0.636559,0.719647,0.636559,0.669212
8,0.0001,2.851088,0.686022,0.699941,0.686022,0.69264
9,0.0,3.019865,0.67957,0.693807,0.67957,0.686099
10,0.0,3.143235,0.68172,0.699316,0.68172,0.689967


***** Running Evaluation *****
  Num examples = 465
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/3.1.1_deberta/checkpoint-102
Configuration saved in /home/elson/3.1.1_deberta/checkpoint-102/config.json
Model weights saved in /home/elson/3.1.1_deberta/checkpoint-102/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 465
  Batch size = 16
Saving model checkpoint to /home/elson/3.1.1_deberta/checkpoint-204
Configuration saved in /home/elson/3.1.1_deberta/checkpoint-204/config.json
Model weights saved in /home/elson/3.1.1_deberta/checkpoint-204/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 465
  Batch size = 16
Saving model checkpoint to /home/elson/3.1.1_deberta/checkpoint-306
Configuration saved in /home/elson/3.1.1_deberta/checkpoint-306/config.json
Model weights saved in /home/elson/3.1.1_deberta/checkpoint-306/pytorch_model.bin
Deleting older checkpoint [/home/elson/3.1.1_deberta/chec

***** Running Evaluation *****
  Num examples = 465
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))
Configuration saved in /home/elson/3.1.1_deberta/best_model/config.json
Model weights saved in /home/elson/3.1.1_deberta/best_model/pytorch_model.bin
tokenizer config file saved in /home/elson/3.1.1_deberta/best_model/tokenizer_config.json
Special tokens file saved in /home/elson/3.1.1_deberta/best_model/special_tokens_map.json
added tokens file saved in /home/elson/3.1.1_deberta/best_model/added_tokens.json


('/home/elson/3.1.1_deberta/best_model/tokenizer_config.json',
 '/home/elson/3.1.1_deberta/best_model/special_tokens_map.json',
 '/home/elson/3.1.1_deberta/best_model/spm.model',
 '/home/elson/3.1.1_deberta/best_model/added_tokens.json')

In [20]:
model_path = "/home/elson/3.1.1_deberta/best_model/"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to('cuda:0')

# Evaluate on the test set
test_results = trainer.predict(test_data)

loading configuration file /home/elson/3.1.1_deberta/best_model/config.json
Model config DebertaV2Config {
  "_name_or_path": "/home/elson/3.1.1_deberta/best_model/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifiers_size": [
    3,
    2,
    2,
    2,
    2,
    2,
    1,
    2,
    3,
    2,
    2,
    2,
    3,
    3,
    3,
    3,
    1,
    3,
    3,
    2,
    2,
    3,
    2,
    6,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    2,
    2,
    2,
    2,
    5,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    2,
    2,
    2,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    2,
    2,
    2,
    2,
    47,
    23,
    9,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
 

loading weights file /home/elson/3.1.1_deberta/best_model/pytorch_model.bin
All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /home/elson/3.1.1_deberta/best_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.
***** Running Prediction *****
  Num examples = 234
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
print(test_results)

PredictionOutput(predictions=array([[ 0.9165  , -0.674   , -0.4978  ],
       [ 1.725   , -0.613   , -1.671   ],
       [ 2.273   , -1.712   , -2.727   ],
       [ 0.3335  , -1.218   ,  1.063   ],
       [ 1.139   , -0.753   , -0.7437  ],
       [ 2.795   , -1.657   , -3.553   ],
       [ 1.635   , -1.579   , -1.06    ],
       [ 1.871   , -1.755   , -1.291   ],
       [ 1.676   , -1.233   , -1.953   ],
       [ 1.0625  , -1.098   , -0.697   ],
       [ 2.117   , -1.922   , -1.586   ],
       [ 1.763   , -1.367   , -1.068   ],
       [ 0.8296  , -0.959   , -0.0659  ],
       [ 1.932   , -1.916   , -1.352   ],
       [ 1.125   , -1.045   , -0.3694  ],
       [-0.03354 , -1.078   ,  1.443   ],
       [ 1.513   , -0.552   , -1.976   ],
       [ 1.781   , -0.979   , -1.643   ],
       [ 2.385   , -1.611   , -2.086   ],
       [ 0.7446  , -1.156   , -0.02411 ],
       [ 1.093   , -0.6875  , -1.053   ],
       [ 0.5176  , -0.3215  , -0.64    ],
       [ 1.228   , -0.657   , -1.289   ],
     