In [1]:
from datasets import load_dataset
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Subset
from transformers import EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold 
import numpy as np
from datasets import Dataset, DatasetDict, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('csv',data_files='dataset_propositionattribution_nerfeatures.csv',delimiter=',',column_names=["claim","premise","label","category","count_bf","count_ca","count_dis","count_food","count_lipid","count_treat","pres_bf","pres_ca","pres_dis","pres_food","pres_lipid","pres_treat","counte_bf","counte_ca","counte_dis","counte_food","counte_lipid","counte_treat","prese_bf","prese_ca","prese_dis","prese_food","prese_lipid","prese_treat","url", "entities","entity_map","gem_exp","gem_label","gpt_label","gpt_exp","gold_exp","entity_map_ev","entity_ev","split"],skiprows=1)

Using custom data configuration default-927ab0163adb9fdb
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 373.52it/s]


In [3]:
train_dataset = dataset['train'].filter(lambda example: example['split'] == 'train')
validation_dataset = dataset['train'].filter(lambda example: example['split'] == 'validation')
test_dataset = dataset['train'].filter(lambda example: example['split'] == 'test')
dataset = DatasetDict({
    'train': train_dataset,
    'val': validation_dataset,
    'test': test_dataset
})

Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-d013d5114fa105ab.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-16f9acbdd82fea07.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-47c7de469e4087a2.arrow


In [4]:
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset["train"].column_names

columns_to_drop = [col for col in all_columns if col not in columns_to_keep]
for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 1623
    })
    val: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 465
    })
    test: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 234
    })
})

In [6]:
from datasets import load_dataset, DatasetDict

label2id = {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping[example['label']]
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)


# Show the label encoding mapping
print("Label Encoding Mapping:", label2id)

Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-c04f7e43468f969f.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-b4575673d3585192.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-bfd24c086ca7428f.arrow


Label Encoding Mapping: {'contradiction': 2, 'entailment': 0, 'neutral': 1}


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 1623
    })
    val: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 465
    })
    test: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 234
    })
})

In [8]:
labels = np.array(dataset['train']['label'])

In [9]:
from transformers import AutoTokenizer
import torch.utils.data

class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name='sileod/deberta-v3-small-tasksource-nli'):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        idx = int(idx)  # Ensure idx is an integer
        item = self.dataset[idx]  # Access the dataset item at idx
        
        # Extracting claim and evidence texts

        claim = item['claim']
        evidences = item['premise']
        item['premise']=evidences
        item['claim']=claim
        # Tokenize the texts
        inputs = self.tokenizer(
            evidences,claim,
            return_tensors="pt",  # Ensure PyTorch tensors are returned
            padding='max_length',  # Apply padding to the maximum length
            truncation='longest_first',  # Truncate to the maximum length if necessary
            max_length=512,  # Specify the maximum length
            add_special_tokens=True  # Add special tokens like [CLS], [SEP]
        )
        
        item['input_ids'] = inputs['input_ids'].squeeze()  # Remove batch dimension
        item['attention_mask']= inputs['attention_mask'].squeeze() # Remove batch dimension
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item



In [10]:
import torch
print(torch.cuda.device_count())
print("Available GPUs:")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

1
Available GPUs:
GPU 0: Tesla V100-SXM2-32GB


In [11]:
model_name = "sileod/deberta-v3-small-tasksource-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                 num_labels=3, ignore_mismatched_sizes=True)
device = "cuda:0"
model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 

In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average="weighted")  # Specify average method
    recall = recall_score(labels, preds, average="weighted")  # Specify average method

    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1}

In [13]:
dataset['train']

Dataset({
    features: ['claim', 'premise', 'label'],
    num_rows: 1623
})

In [14]:
import gc

torch.cuda.set_device(0)

# Clearing the cache
torch.cuda.empty_cache()
gc.collect()
# Checking GPU memory, making sure to reset peak memory stats
torch.cuda.reset_peak_memory_stats()

In [15]:
current_device = torch.cuda.current_device()
print(f"Current CUDA device: GPU {current_device}")

Current CUDA device: GPU 0


In [16]:
train_data = dataset['train']
eval_data = dataset['val']
model = model.to('cuda:0')

In [17]:
tdata = MediClaimDataset(train_data)
vdata = MediClaimDataset(eval_data)
test_data = MediClaimDataset(dataset['test'])

In [18]:
tdata.__getitem__(0)

{'claim': 'Myrrh essential oil is sometimes used in skincare products to help improve the appearance of the skin.',
 'premise': 'The essential oils of frankincense and myrrh increase the fluidity of the lipid bilayer in the cuticle and change the orderly and dense structure to increase the permeability of the skin and decrease the barrier effect.',
 'label': 0,
 'input_ids': tensor([    1,   279,  1830,  6725,   265, 88609,   263, 98237,   993,   262,
         49462,   265,   262, 22003, 96579,   267,   262, 61462,   263,   575,
           262, 26217,   263,  9854,  1730,   264,   993,   262, 39632,   265,
           262,  1158,   263,  4843,   262,  7275,  1290,   260,     2,   573,
         52341,  1830,  1080,   269,  1359,   427,   267, 17847,   633,   264,
           408,  1300,   262,  2658,   265,   262,  1158,   260,     2,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0

In [19]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments,DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


training_args = TrainingArguments(
    output_dir=f'/home/elson/6.4.1_deberta/',
    num_train_epochs=15,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    fp16=True,
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tdata,
    eval_dataset=vdata,
    #tokenizer=tokenizer,
    #data_collator = data_collator,
    compute_metrics=compute_metrics
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)]
)

# Training and Evaluation
trainer.train()
eval_result = trainer.evaluate(vdata)

# Save the best model and tokenizer
model.save_pretrained(f'/home/elson/6.4.1_deberta/best_model')
tokenizer.save_pretrained(f'/home/elson/6.4.1_deberta/best_model')


Using amp half precision backend
***** Running training *****
  Num examples = 1623
  Num Epochs = 15
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 765


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6786,0.722321,0.716129,0.678612,0.716129,0.69672
2,0.2979,1.077634,0.683871,0.72252,0.683871,0.698553
3,0.1038,1.510799,0.68172,0.718953,0.68172,0.694595
4,0.0234,1.861397,0.664516,0.68909,0.664516,0.674163
5,0.015,2.107094,0.670968,0.720954,0.670968,0.687739
6,0.0466,2.287065,0.683871,0.719692,0.683871,0.696561
7,0.0089,2.468571,0.647312,0.715179,0.647312,0.666502
8,0.001,2.302196,0.703226,0.731457,0.703226,0.711222
9,0.0067,2.418575,0.698925,0.729703,0.698925,0.709031
10,0.0001,2.599059,0.68172,0.731184,0.68172,0.696529


***** Running Evaluation *****
  Num examples = 465
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/6.4.1_deberta/checkpoint-51
Configuration saved in /home/elson/6.4.1_deberta/checkpoint-51/config.json
Model weights saved in /home/elson/6.4.1_deberta/checkpoint-51/pytorch_model.bin
Deleting older checkpoint [/home/elson/6.4.1_deberta/checkpoint-203] due to args.save_total_limit
Deleting older checkpoint [/home/elson/6.4.1_deberta/checkpoint-406] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 465
  Batch size = 32
Saving model checkpoint to /home/elson/6.4.1_deberta/checkpoint-102
Configuration saved in /home/elson/6.4.1_deberta/checkpoint-102/config.json
Model weights saved in /home/elson/6.4.1_deberta/checkpoint-102/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 465
  Batch size = 32
Saving model checkpoint to /home/elson/6.4.1_deberta/checkpoint-153
Configuration saved in

  _warn_prf(average, modifier, msg_start, len(result))
Configuration saved in /home/elson/6.4.1_deberta/best_model/config.json
Model weights saved in /home/elson/6.4.1_deberta/best_model/pytorch_model.bin
tokenizer config file saved in /home/elson/6.4.1_deberta/best_model/tokenizer_config.json
Special tokens file saved in /home/elson/6.4.1_deberta/best_model/special_tokens_map.json
added tokens file saved in /home/elson/6.4.1_deberta/best_model/added_tokens.json


('/home/elson/6.4.1_deberta/best_model/tokenizer_config.json',
 '/home/elson/6.4.1_deberta/best_model/special_tokens_map.json',
 '/home/elson/6.4.1_deberta/best_model/spm.model',
 '/home/elson/6.4.1_deberta/best_model/added_tokens.json')

In [20]:
model_path = "/home/elson/6.4.1_deberta/best_model/"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to('cuda:0')

# Evaluate on the test set
test_results = trainer.predict(test_data)

loading configuration file /home/elson/6.4.1_deberta/best_model/config.json
Model config DebertaV2Config {
  "_name_or_path": "/home/elson/6.4.1_deberta/best_model/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifiers_size": [
    3,
    2,
    2,
    2,
    2,
    2,
    1,
    2,
    3,
    2,
    2,
    2,
    3,
    3,
    3,
    3,
    1,
    3,
    3,
    2,
    2,
    3,
    6,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    2,
    2,
    2,
    2,
    5,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    2,
    2,
    2,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    3,
    2,
    2,
    2,
    2,
    47,
    23,
    9,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
 

loading weights file /home/elson/6.4.1_deberta/best_model/pytorch_model.bin
All model checkpoint weights were used when initializing DebertaV2ForSequenceClassification.

All the weights of DebertaV2ForSequenceClassification were initialized from the model checkpoint at /home/elson/6.4.1_deberta/best_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaV2ForSequenceClassification for predictions without further training.
***** Running Prediction *****
  Num examples = 234
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
print(test_results)

PredictionOutput(predictions=array([[ 6.2744e-01, -7.3584e-01, -2.1985e-01],
       [ 1.2070e+00, -6.5918e-01, -7.6270e-01],
       [ 2.3340e+00, -1.4180e+00, -1.3223e+00],
       [-9.4287e-01, -1.2998e+00,  1.8828e+00],
       [ 1.4688e+00, -9.7998e-01, -1.0938e+00],
       [ 2.6211e+00, -9.0869e-01, -2.2363e+00],
       [ 1.0830e+00, -1.1240e+00, -3.3130e-01],
       [ 1.5762e+00, -6.4014e-01, -1.4180e+00],
       [ 1.5059e+00, -1.2441e+00, -1.0137e+00],
       [ 1.4033e+00, -1.0693e+00, -5.9521e-01],
       [ 1.8330e+00, -1.3877e+00, -8.9893e-01],
       [ 2.2188e+00, -9.2188e-01, -1.8867e+00],
       [ 8.0957e-01, -7.4219e-01, -5.4785e-01],
       [ 2.1504e+00, -7.6074e-01, -1.8037e+00],
       [ 1.1816e+00, -2.9346e-01, -9.7803e-01],
       [-1.2219e-01, -1.0283e+00,  8.2129e-01],
       [ 1.4375e+00, -5.6055e-01, -1.1973e+00],
       [ 1.8135e+00, -5.6885e-01, -1.6172e+00],
       [ 2.5195e+00, -1.1123e+00, -1.9326e+00],
       [ 2.4355e+00, -1.1504e+00, -1.7188e+00],
       [ 5.