TODO:
- change output to {0: 'contradiction', 1: 'neutral', 2: 'entailment'} format instead of tokenized encoding
- how send model/data to gpu

In [1]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
    
print_gpu_utilization()

GPU memory occupied: 390 MB.


In [2]:
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
print_gpu_utilization()

2023-03-08 09:50:34.963379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-08 09:50:35.089406: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-08 09:50:35.691799: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64:/usr/local/cuda/efa/lib:/usr/local/cuda

GPU memory occupied: 390 MB.


In [3]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={
    "train": "/root/data/chex_train.csv",
    "val": "/root/data/chex_val.csv",
    "test": "/root/data/chex_test.csv",
})
dataset
print_gpu_utilization()

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

GPU memory occupied: 390 MB.


In [4]:
dataset['train'][3] # contains Atelectasis, Pleural Effusion, and Fracture

{'Unnamed: 0': 8231,
 'Report Impression': "1.  Extensive cecal wall thickening and inflammatory changes with suspected pneumatosis and evidence of extraluminal mesenteric gas, and trace portal venous gas, in keeping with bowel ischemia. No frank disruption in the bowel contour is seen on noncontrast images. No abscess or drainable fluid collection. 2.  Normal short appendix. 3.  Moderate-sized bilateral pleural effusions with a partially visualized nodular opacity in the right middle lobe, likely representing focal atelectasis. Other less likely etiologies include consolidation or pulmonary nodule, and when the patient's status improves, further assessment with CT chest could be considered. 4.  Compression fracture of L1 with bony retropulsion. This is new from the radiographs of 2/3/2019, but still appears chronic. Correlation with point tenderness recommended. Dr. Li discussed these findings with Dr. Cohen via telephone on 9/19/2020 at 4:10 AM..",
 'Enlarged Cardiomediastinum': None

In [5]:
dataset['train'][3]['Fracture'] == 1

True

In [6]:
labels = ["Fracture", "Edema", "Cardiomegaly", "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion"]

# function(batch: Dict[str, List]) -> Dict[str, List]
def create_target_sentences(batch):
    text_key = 'Report Impression'
    out = {'target': [], text_key: []}
    for i in range(len(batch[text_key])):
        for label in labels:
            if batch[label][i] == 1:
                out['target'].append(f'This example is {label}.')
                out[text_key].append(batch[text_key][i])
    return out
    
dataset_with_labels = dataset.map(
    create_target_sentences,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
dataset_with_labels['train']

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-738683518c3e4325.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4cbccb3c80fc6238.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-fde2fe87704ff66e.arrow


Dataset({
    features: ['Report Impression', 'target'],
    num_rows: 33540
})

In [7]:
dataset_with_labels['train'][0:5]

{'Report Impression': ['1.  Large bilateral layering pleural effusions. 2.  Left mid lung zone nodular opacity could reflect a granuloma or overlying structure. Recommend comparison to prior imaging or follow up.',
  "1.  Extensive cecal wall thickening and inflammatory changes with suspected pneumatosis and evidence of extraluminal mesenteric gas, and trace portal venous gas, in keeping with bowel ischemia. No frank disruption in the bowel contour is seen on noncontrast images. No abscess or drainable fluid collection. 2.  Normal short appendix. 3.  Moderate-sized bilateral pleural effusions with a partially visualized nodular opacity in the right middle lobe, likely representing focal atelectasis. Other less likely etiologies include consolidation or pulmonary nodule, and when the patient's status improves, further assessment with CT chest could be considered. 4.  Compression fracture of L1 with bony retropulsion. This is new from the radiographs of 2/3/2019, but still appears chroni

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

# TODO: max_length may be slow?
def tokenize_function(examples):
    return tokenizer(text=examples["Report Impression"], text_target=examples["target"], padding="max_length", truncation=True)

tokenized_datasets = dataset_with_labels.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_with_labels['train'].column_names,
)
print_gpu_utilization()

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-92bb9292aa32d176.arrow


Map:   0%|          | 0/9547 [00:00<?, ? examples/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b926d7310731db40.arrow


GPU memory occupied: 390 MB.


In [9]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 33540
})

In [10]:
tokenizer

BartTokenizerFast(name_or_path='facebook/bart-large-mnli', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [11]:
small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(1000))
small_val_dataset = tokenized_datasets['val'].shuffle(seed=42).select(range(1000))
print_gpu_utilization()

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-14c4d50f5aefac19.arrow


GPU memory occupied: 390 MB.


In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
print_gpu_utilization()

GPU memory occupied: 390 MB.


In [13]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
print_gpu_utilization()

GPU memory occupied: 393 MB.


In [14]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
print_gpu_utilization()

GPU memory occupied: 393 MB.


In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
print_gpu_utilization()

GPU memory occupied: 393 MB.


In [16]:
from transformers import TrainingArguments, Trainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# training_args = TrainingArguments(
training_args = Seq2SeqTrainingArguments(
    output_dir="test_trainer_bart",
    evaluation_strategy="epoch",
    logging_steps=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)
print_gpu_utilization()

GPU memory occupied: 2654 MB.


In [17]:
print_gpu_utilization()
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 407344131


GPU memory occupied: 2654 MB.


ValueError: Expected input batch_size (1) to match target batch_size (1024).

In [18]:
model

BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((10

In [19]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]
logits, predicted_class_ids

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [20]:
model.config.id2label

{0: 'contradiction', 1: 'neutral', 2: 'entailment'}

In [21]:
model.config._num_labels

3

In [22]:
# does model need to have problem_type="multi_label_classification"?
model.config

BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "contradiction": 0,
    "entailment": 2,
    "neutral": 1
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  

In [23]:
num_labels = len(model.config.id2label)

labels = torch.sum(
    torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1
).to(torch.float)
loss = model(**inputs, labels=labels).loss
loss

NameError: name 'predicted_class_ids' is not defined

In [24]:
labels

['Fracture',
 'Edema',
 'Cardiomegaly',
 'Pneumonia',
 'Atelectasis',
 'Pneumothorax',
 'Pleural Effusion']

In [25]:
label = 'cat'
premise = 'I love cats and dogs'
hypothesis = f'This example is {label}.'

# run through model pre-trained on MNLI
device = 0
x = tokenizer.encode(premise, hypothesis, return_tensors='pt', truncation_strategy='only_first')
print_gpu_utilization()
logits = model(x.to(device))[0]
print(logits)
print_gpu_utilization()

GPU memory occupied: 7110 MB.
tensor([[-1.4165,  1.1855,  0.8082]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
GPU memory occupied: 7136 MB.




In [27]:
# loss=None????
out = model(x.to(device))
out

Seq2SeqSequenceClassifierOutput(loss=None, logits=tensor([[-0.5175,  0.8326,  0.3654]], device='cuda:0',
       grad_fn=<AddmmBackward0>), past_key_values=((tensor([[[[ 2.0821e-01,  5.4049e-01,  2.2791e+00,  ...,  1.1593e+00,
            3.3409e+00, -1.5928e+00],
          [ 3.8084e-01,  7.2665e-01,  2.2785e+00,  ...,  1.8345e+00,
            4.2374e+00, -2.4128e+00],
          [-2.4002e+00, -1.6914e+00, -3.1211e+00,  ..., -3.7118e+00,
           -6.7217e+00, -2.6637e+00],
          ...,
          [-4.9207e+00, -2.9609e+00, -1.4663e+00,  ...,  8.8705e-01,
           -9.7217e+00, -5.1363e-01],
          [-6.6214e-01,  2.2178e+00,  1.0365e+00,  ..., -3.8119e+00,
           -7.0802e+00,  9.9498e-01],
          [-7.9141e-01,  1.5570e+00, -3.8589e-01,  ..., -6.2171e+00,
           -7.4372e+00,  1.8077e+00]],

         [[-2.1450e+00, -2.4058e+00, -1.6419e-01,  ..., -5.7289e-01,
            3.6106e-02,  6.3552e-01],
          [-6.4835e-01, -2.3865e+00,  8.2745e-02,  ..., -1.5178e+00,
        

In [28]:
len(out)

3

In [29]:
out[0]

tensor([[-0.5175,  0.8326,  0.3654]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [30]:
out[1]

((tensor([[[[ 2.0821e-01,  5.4049e-01,  2.2791e+00,  ...,  1.1593e+00,
              3.3409e+00, -1.5928e+00],
            [ 3.8084e-01,  7.2665e-01,  2.2785e+00,  ...,  1.8345e+00,
              4.2374e+00, -2.4128e+00],
            [-2.4002e+00, -1.6914e+00, -3.1211e+00,  ..., -3.7118e+00,
             -6.7217e+00, -2.6637e+00],
            ...,
            [-4.9207e+00, -2.9609e+00, -1.4663e+00,  ...,  8.8705e-01,
             -9.7217e+00, -5.1363e-01],
            [-6.6214e-01,  2.2178e+00,  1.0365e+00,  ..., -3.8119e+00,
             -7.0802e+00,  9.9498e-01],
            [-7.9141e-01,  1.5570e+00, -3.8589e-01,  ..., -6.2171e+00,
             -7.4372e+00,  1.8077e+00]],
  
           [[-2.1450e+00, -2.4058e+00, -1.6419e-01,  ..., -5.7289e-01,
              3.6106e-02,  6.3552e-01],
            [-6.4835e-01, -2.3865e+00,  8.2745e-02,  ..., -1.5178e+00,
              4.3460e-01, -1.4588e+00],
            [ 1.1429e+00, -2.4398e+00,  1.0799e+00,  ...,  2.4867e-01,
              1.3355

In [31]:
out[2]

tensor([[[ 5.8969e-03,  2.5570e-02,  1.7614e-02,  ...,  1.5660e-02,
           4.0029e-03, -3.4841e-03],
         [-9.2110e-02, -1.5876e-01, -1.8571e-02,  ...,  1.0956e-01,
           1.9189e-01, -1.6707e-01],
         [-1.0694e-01, -2.1240e-01,  1.6437e-01,  ..., -6.8677e-02,
           1.9139e-01,  1.8703e-02],
         ...,
         [ 8.4653e-02, -4.5941e-01, -5.9310e-01,  ...,  1.1374e-01,
           1.0820e-01, -4.5412e-02],
         [-7.2947e-03,  1.1098e-02,  1.5437e-02,  ...,  1.1579e-02,
           1.8733e-05,  2.2031e-03],
         [ 1.0206e-01,  2.6547e-02,  3.0544e-02,  ..., -2.2488e-02,
          -6.8619e-02,  6.8373e-02]]], device='cuda:0',
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
x

In [None]:
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
prob_label_is_true = probs[:,1]
prob_label_is_true

In [None]:
tokenizer(premise, premise, premise)

In [None]:
tokenizer