In [1]:
from datasets import load_dataset
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Subset
from transformers import EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold 
import numpy as np
from datasets import Dataset, DatasetDict, ClassLabel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('csv',data_files='dataset_propositionattribution_nerfeatures.csv',delimiter=',',column_names=["claim","premise","label","category","count_bf","count_ca","count_dis","count_food","count_lipid","count_treat","pres_bf","pres_ca","pres_dis","pres_food","pres_lipid","pres_treat","counte_bf","counte_ca","counte_dis","counte_food","counte_lipid","counte_treat","prese_bf","prese_ca","prese_dis","prese_food","prese_lipid","prese_treat","url", "entities","entity_map","gem_exp","gem_label","gpt_label","gpt_exp","gold_exp","entity_map_ev","entity_ev","split"],skiprows=1)

Using custom data configuration default-927ab0163adb9fdb
Reusing dataset csv (/home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 1/1 [00:00<00:00, 244.39it/s]


In [3]:
train_dataset = dataset['train'].filter(lambda example: example['split'] == 'train')
validation_dataset = dataset['train'].filter(lambda example: example['split'] == 'validation')
test_dataset = dataset['train'].filter(lambda example: example['split'] == 'test')
dataset = DatasetDict({
    'train': train_dataset,
    'val': validation_dataset,
    'test': test_dataset
})

Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-d013d5114fa105ab.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-16f9acbdd82fea07.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-47c7de469e4087a2.arrow


In [4]:
columns_to_keep = ["claim", "premise", "label"]
all_columns = dataset["train"].column_names

columns_to_drop = [col for col in all_columns if col not in columns_to_keep]
for split in dataset.keys():
    dataset[split] = dataset[split].remove_columns(columns_to_drop)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 1623
    })
    val: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 465
    })
    test: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 234
    })
})

In [6]:
from datasets import load_dataset, DatasetDict

label2id = {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
}

id2label = {v: k for k, v in label2id.items()}

label_mapping = {
    'SUPPORTED': 'entailment',
    'REFUTED': 'contradiction',
    'NOT ENOUGH INFORMATION': 'neutral'
}

def map_and_encode_labels(example):
    # Map original dataset labels to new labels ('entailment', 'contradiction', 'neutral')
    mapped_label = label_mapping[example['label']]
    # Encode mapped labels using label2id
    example['label'] = label2id[mapped_label]
    return example

for split in dataset.keys():
    dataset[split] = dataset[split].map(map_and_encode_labels)


# Show the label encoding mapping
print("Label Encoding Mapping:", label2id)

Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-c04f7e43468f969f.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-b4575673d3585192.arrow
Loading cached processed dataset at /home/elson/.cache/huggingface/datasets/csv/default-927ab0163adb9fdb/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-bfd24c086ca7428f.arrow


Label Encoding Mapping: {'contradiction': 2, 'entailment': 0, 'neutral': 1}


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 1623
    })
    val: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 465
    })
    test: Dataset({
        features: ['claim', 'premise', 'label'],
        num_rows: 234
    })
})

In [8]:
labels = np.array(dataset['train']['label'])

In [9]:
from transformers import AutoTokenizer
import torch.utils.data

class MediClaimDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer_name='ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli'):
        self.dataset = dataset
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        idx = int(idx)  # Ensure idx is an integer
        item = self.dataset[idx]  # Access the dataset item at idx
        
        # Extracting claim and evidence texts

        claim = item['claim']
        evidences = item['premise']
        item['premise']=evidences
        item['claim']=claim
        # Tokenize the texts
        inputs = self.tokenizer(
            evidences,claim,
            return_tensors="pt",  # Ensure PyTorch tensors are returned
            padding='max_length',  # Apply padding to the maximum length
            truncation='longest_first',  # Truncate to the maximum length if necessary
            max_length=512,  # Specify the maximum length
            add_special_tokens=True  # Add special tokens like [CLS], [SEP]
        )
        
        item['input_ids'] = inputs['input_ids'].squeeze()  # Remove batch dimension
        item['attention_mask']= inputs['attention_mask'].squeeze() # Remove batch dimension
        
        if 'label' in item:
            item['labels'] = torch.tensor(item['label'], dtype=torch.long)
        
        return item



In [10]:
import torch
print(torch.cuda.device_count())
print("Available GPUs:")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

1
Available GPUs:
GPU 0: Tesla V100-SXM2-32GB


In [11]:
model_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=256)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                 num_labels=3, ignore_mismatched_sizes=True)
device = "cuda:0"
model.to(device)

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average="weighted")  # Specify average method
    recall = recall_score(labels, preds, average="weighted")  # Specify average method

    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1}

In [13]:
dataset['train']

Dataset({
    features: ['claim', 'premise', 'label'],
    num_rows: 1623
})

In [14]:
import gc

torch.cuda.set_device(0)

# Clearing the cache
torch.cuda.empty_cache()
gc.collect()
# Checking GPU memory, making sure to reset peak memory stats
torch.cuda.reset_peak_memory_stats()

In [15]:
current_device = torch.cuda.current_device()
print(f"Current CUDA device: GPU {current_device}")

Current CUDA device: GPU 0


In [16]:
train_data = dataset['train']
eval_data = dataset['val']
model = model.to('cuda:0')

In [17]:
tdata = MediClaimDataset(train_data)
vdata = MediClaimDataset(eval_data)
test_data = MediClaimDataset(dataset['test'])

In [18]:
tdata.__getitem__(0)

{'claim': 'Myrrh essential oil is sometimes used in skincare products to help improve the appearance of the skin.',
 'premise': 'The essential oils of frankincense and myrrh increase the fluidity of the lipid bilayer in the cuticle and change the orderly and dense structure to increase the permeability of the skin and decrease the barrier effect.',
 'label': 0,
 'input_ids': tensor([    0,   133,  4499, 15623,     9, 26486,  3976,  9401,     8,   127,
           338, 20378,   712,     5, 12293,  1571,     9,     5, 44278, 31617,
         19777,    11,     5,   847, 11317,     8,   464,     5, 27193,     8,
         19790,  3184,     7,   712,     5, 31582,  4484,     9,     5,  3024,
             8,  7280,     5,  9639,  1683,     4,     2,     2,   448,  4503,
         20378,  4499,   681,    16,  2128,   341,    11,  2972,  3976,  1322,
           785,     7,   244,  1477,     5,  2772,     9,     5,  3024,     4,
             2,     1,     1,     1,     1,     1,     1,     1,     1

In [19]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments,DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)


training_args = TrainingArguments(
    output_dir=f'/home/elson/6.4.1_deberta/',
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    fp16=True,
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tdata,
    eval_dataset=vdata,
    #tokenizer=tokenizer,
    #data_collator = data_collator,
    compute_metrics=compute_metrics
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)]
)

# Training and Evaluation
trainer.train()
eval_result = trainer.evaluate(vdata)

# Save the best model and tokenizer
model.save_pretrained(f'/home/elson/6.4.1_deberta/best_model')
tokenizer.save_pretrained(f'/home/elson/6.4.1_deberta/best_model')


Using amp half precision backend
***** Running training *****
  Num examples = 1623
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3045


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9599,0.793858,0.688172,0.476085,0.688172,0.562811
2,0.7555,1.143354,0.623656,0.686317,0.623656,0.645462
3,0.5746,0.933754,0.619355,0.64567,0.619355,0.630988
4,0.5734,1.783974,0.651613,0.682559,0.651613,0.665289
5,0.4254,1.982377,0.615054,0.698114,0.615054,0.645602
6,0.4109,1.732087,0.694624,0.7123,0.694624,0.698677
7,0.167,2.074262,0.692473,0.715305,0.692473,0.698814
8,0.0732,2.207316,0.692473,0.703145,0.692473,0.697507
9,0.0784,2.647518,0.666667,0.716216,0.666667,0.68567
10,0.0936,2.534704,0.692473,0.714796,0.692473,0.702774


***** Running Evaluation *****
  Num examples = 465
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /home/elson/6.4.1_deberta/checkpoint-203
Configuration saved in /home/elson/6.4.1_deberta/checkpoint-203/config.json
Model weights saved in /home/elson/6.4.1_deberta/checkpoint-203/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 465
  Batch size = 8
Saving model checkpoint to /home/elson/6.4.1_deberta/checkpoint-406
Configuration saved in /home/elson/6.4.1_deberta/checkpoint-406/config.json
Model weights saved in /home/elson/6.4.1_deberta/checkpoint-406/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 465
  Batch size = 8
Saving model checkpoint to /home/elson/6.4.1_deberta/checkpoint-609
Configuration saved in /home/elson/6.4.1_deberta/checkpoint-609/config.json
Model weights saved in /home/elson/6.4.1_deberta/checkpoint-609/pytorch_model.bin
Deleting older checkpoint [/home/elson/6.4.1_deberta/checkpo

Configuration saved in /home/elson/6.4.1_deberta/best_model/config.json
Model weights saved in /home/elson/6.4.1_deberta/best_model/pytorch_model.bin
tokenizer config file saved in /home/elson/6.4.1_deberta/best_model/tokenizer_config.json
Special tokens file saved in /home/elson/6.4.1_deberta/best_model/special_tokens_map.json


('/home/elson/6.4.1_deberta/best_model/tokenizer_config.json',
 '/home/elson/6.4.1_deberta/best_model/special_tokens_map.json',
 '/home/elson/6.4.1_deberta/best_model/vocab.json',
 '/home/elson/6.4.1_deberta/best_model/merges.txt',
 '/home/elson/6.4.1_deberta/best_model/added_tokens.json',
 '/home/elson/6.4.1_deberta/best_model/tokenizer.json')

In [20]:
model_path = "/home/elson/6.4.1_deberta/best_model/"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to('cuda:0')

# Evaluate on the test set
test_results = trainer.predict(test_data)

loading configuration file /home/elson/6.4.1_deberta/best_model/config.json
Model config RobertaConfig {
  "_name_or_path": "/home/elson/6.4.1_deberta/best_model/",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0"

In [21]:
print(test_results)

PredictionOutput(predictions=array([[ 7.062  , -2.996  , -3.918  ],
       [ 7.324  , -3.145  , -3.898  ],
       [ 6.3    , -3.295  , -3.191  ],
       [-3.309  , -1.843  ,  5.914  ],
       [ 6.41   , -3.428  , -3.11   ],
       [ 7.336  , -3.26   , -3.809  ],
       [ 7.34   , -3.256  , -3.79   ],
       [ 7.336  , -3.236  , -3.832  ],
       [ 7.33   , -3.203  , -3.861  ],
       [ 7.156  , -3.576  , -3.393  ],
       [ 7.24   , -3.488  , -3.525  ],
       [ 7.344  , -3.215  , -3.842  ],
       [ 7.293  , -3.367  , -3.68   ],
       [ 7.293  , -3.172  , -3.89   ],
       [ 6.492  , -2.398  , -4.29   ],
       [ 7.164  , -3.36   , -3.648  ],
       [ 7.273  , -2.996  , -4.004  ],
       [ 6.516  , -3.498  , -3.166  ],
       [ 7.33   , -3.178  , -3.889  ],
       [ 6.98   , -3.584  , -3.338  ],
       [ 7.31   , -3.268  , -3.768  ],
       [ 7.29   , -3.273  , -3.729  ],
       [ 6.355  , -1.873  , -4.703  ],
       [-2.48   , -1.823  ,  5.254  ],
       [ 4.38   , -3.354  , -1.575 