In [10]:
1 + 2

3

In [11]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one NVIDIA GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this notebook has always reported on.
    """
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(device_index)
    info = nvmlDeviceGetMemoryInfo(handle)
    # Integer-divide by 1024**2 so the figure is printed as whole MB
    # (assumes NVML reports `used` in bytes — TODO confirm).
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Report training time, throughput, and current GPU memory usage.

    Args:
        result: object exposing a ``metrics`` mapping that contains the
            keys ``train_runtime`` and ``train_samples_per_second``.
    """
    stats = result.metrics
    runtime = stats['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = stats['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()
    
# Baseline reading taken before any tokenizer, model or dataset is loaded.
print_gpu_utilization()

import os
# Set before `transformers` is imported in the next cell.
# NOTE(review): presumably read by transformers to skip its MLflow callback — confirm.
os.environ['DISABLE_MLFLOW_INTEGRATION'] = 'TRUE'

GPU memory occupied: 436 MB.


In [12]:
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
print_gpu_utilization()

GPU memory occupied: 436 MB.


In [13]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Tokenizer and classifier are loaded from the same checkpoint so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
# In the recorded run this reading is unchanged from the baseline after loading.
print_gpu_utilization()
# Display the NLI class-name -> id mapping; create_target_sentences looks labels up in it.
model.config.label2id

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

GPU memory occupied: 436 MB.


{'contradiction': 0, 'entailment': 2, 'neutral': 1}

In [14]:
from datasets import load_dataset

# Directory holding chex_train.csv / chex_val.csv / chex_test.csv.
# Change this one value to point the notebook at a different data location.
DATA_DIR = "/root/data"

# One CSV per split; the resulting paths are identical to the previously hard-coded ones.
dataset = load_dataset(
    "csv",
    data_files={split: f"{DATA_DIR}/chex_{split}.csv" for split in ("train", "val", "test")},
)
print_gpu_utilization()
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-dcb44e19e0ca611f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

GPU memory occupied: 436 MB.


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Report Impression', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices', 'No Finding'],
        num_rows: 102304
    })
    val: Dataset({
        features: ['Unnamed: 0', 'Report Impression', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices', 'No Finding'],
        num_rows: 29230
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Report Impression', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture', 'Support Devices', 'No Finding'],
        num

In [15]:
dataset['train'][3] # contains Atelectasis, Pleural Effusion, and Fracture

{'Unnamed: 0': 8231,
 'Report Impression': "1.  Extensive cecal wall thickening and inflammatory changes with suspected pneumatosis and evidence of extraluminal mesenteric gas, and trace portal venous gas, in keeping with bowel ischemia. No frank disruption in the bowel contour is seen on noncontrast images. No abscess or drainable fluid collection. 2.  Normal short appendix. 3.  Moderate-sized bilateral pleural effusions with a partially visualized nodular opacity in the right middle lobe, likely representing focal atelectasis. Other less likely etiologies include consolidation or pulmonary nodule, and when the patient's status improves, further assessment with CT chest could be considered. 4.  Compression fracture of L1 with bony retropulsion. This is new from the radiographs of 2/3/2019, but still appears chronic. Correlation with point tenderness recommended. Dr. Li discussed these findings with Dr. Cohen via telephone on 9/19/2020 at 4:10 AM..",
 'Enlarged Cardiomediastinum': None

In [16]:
dataset['train'][3]['Fracture'] == 1

True

In [17]:
labels = ["Fracture", "Edema", "Cardiomegaly", "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion"]

# function(batch: Dict[str, List]) -> Dict[str, List]
def create_target_sentences(batch):
    """Expand each report into one NLI (premise, hypothesis, label) row per finding.

    For every report in ``batch`` and every name in the module-level
    ``labels`` list, one output row is produced containing the report text,
    the hypothesis ``"This example is {label}."`` and an NLI class id taken
    from ``model.config.label2id``:

    * ``-1``           -> ``contradiction``
    * ``None`` or ``0`` -> ``neutral``
    * ``1``            -> ``entailment``

    NOTE(review): if these columns use -1 for "uncertain" and 0 for
    "negative", the contradiction/neutral assignment may be swapped —
    confirm against the labeling scheme of the CSVs.

    Args:
        batch: mapping of column name to list of values; must contain
            ``'Report Impression'`` and one column per entry in ``labels``.

    Returns:
        Dict with the keys ``'target'``, ``'Report Impression'`` and
        ``'labels'``; each list has ``len(labels)`` entries per input row.

    Raises:
        ValueError: if a finding column holds anything other than
            -1, 0, 1 or None.
    """
    text_key = 'Report Impression'
    label2id = model.config.label2id
    out = {'target': [], text_key: [], 'labels': []}
    for i, report in enumerate(batch[text_key]):
        for label in labels:
            value = batch[label][i]
            out['target'].append(f'This example is {label}.')
            out[text_key].append(report)
            if value == -1:
                out['labels'].append(label2id['contradiction'])
            elif value is None or value == 0:
                # Missing (None) and explicit 0 are treated the same way.
                out['labels'].append(label2id['neutral'])
            elif value == 1:
                out['labels'].append(label2id['entailment'])
            else:
                raise ValueError(f"invalid value in labels {value}")
    return out
    
# batched=True because create_target_sentences returns more rows than it receives
# (one per entry in `labels` for every report).
# All original columns are dropped; the function re-emits the report text itself.
dataset_with_labels = dataset.map(
    create_target_sentences,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
dataset_with_labels['train']

Map:   0%|          | 0/102304 [00:00<?, ? examples/s]

Map:   0%|          | 0/29230 [00:00<?, ? examples/s]

Map:   0%|          | 0/14615 [00:00<?, ? examples/s]

Dataset({
    features: ['Report Impression', 'target', 'labels'],
    num_rows: 716128
})

In [18]:
# Sanity check: which class ids appear among the first 50 rows.
# Slice rows first, then pick the column, so only 50 rows are read
# instead of materializing the whole `labels` column.
set(dataset_with_labels['train'][:50]['labels'])

{0, 1, 2}

In [19]:
# Columns to drop after tokenization: everything except `labels`.
# The same list is reused by every later .map(tokenize_function, ...) call.
remove_columns = dataset_with_labels['train'].column_names
remove_columns.remove('labels') # keep the labels column!
remove_columns

['Report Impression', 'target']

In [20]:
# TODO: max_length may be slow?
def tokenize_function(examples):
    """Tokenize a batch of (report, hypothesis) pairs with the module-level tokenizer.

    The report text is the first sequence and the hypothesis in ``target``
    is the second. ``truncation='only_first'`` means only the report is
    shortened when a pair is too long. ``padding="max_length"`` is used
    with no explicit ``max_length`` — presumably this pads every pair to
    the tokenizer's model maximum; verify if speed matters.

    Args:
        examples: batch mapping with ``"Report Impression"`` and
            ``"target"`` columns.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    premises = examples["Report Impression"]
    hypotheses = examples["target"]
    encoded = tokenizer(
        text=premises,
        text_pair=hypotheses,
        padding="max_length",
        truncation='only_first',
    )
    return encoded

# Tokenize every split; the raw text columns listed in `remove_columns` are dropped,
# leaving `labels` plus the tokenizer outputs.
tokenized_datasets = dataset_with_labels.map(
    tokenize_function,
    batched=True,
    remove_columns=remove_columns,
)
tokenized_datasets

Map:   0%|          | 0/716128 [00:00<?, ? examples/s]

Map:   0%|          | 0/204610 [00:00<?, ? examples/s]

Map:   0%|          | 0/102305 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 716128
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 204610
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 102305
    })
})

In [22]:
tokenized_datasets['train'][100]

{'labels': 1,
 'input_ids': [0,
  134,
  4,
  1437,
  440,
  13827,
  1886,
  28119,
  922,
  43462,
  2199,
  4,
  2,
  2,
  713,
  1246,
  16,
  5866,
  118,
  4399,
  571,
  7776,
  4,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [21]:
# Write all tokenized splits to disk (presumably so later sessions can reload them
# instead of re-tokenizing — confirm).
tokenized_datasets.save_to_disk("/root/data/bart_fine_tune")

Saving the dataset (0/8 shards):   0%|          | 0/716128 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/204610 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/102305 [00:00<?, ? examples/s]

END

In [11]:


def _tokenized_subset(split, num_examples, seed=42):
    """Return a shuffled, tokenized sample of one split of `dataset_with_labels`.

    Args:
        split: split name, e.g. ``'train'`` or ``'val'``.
        num_examples: number of rows to keep after shuffling.
        seed: shuffle seed; the default keeps the sample reproducible.

    Returns:
        The selected rows after `tokenize_function` has been mapped over
        them and the columns in `remove_columns` have been dropped.
    """
    return (
        dataset_with_labels[split]
        .shuffle(seed=seed)
        .select(range(num_examples))
        .map(tokenize_function, batched=True, remove_columns=remove_columns)
    )


# Small subsets for a quick fine-tuning run.
# For a faster smoke test, lower the sizes (e.g. 5 train / 4 val).
small_train_dataset = _tokenized_subset('train', 1000)
small_val_dataset = _tokenized_subset('val', 200)

print_gpu_utilization()
small_train_dataset

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9e75d602e447abcd.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b5380fdb8f3ab1e2.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-18795d90473e17a5.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-50c8ec55ede3b71e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-8ed8aaec53942f0a.arrow


GPU memory occupied: 390 MB.


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [13]:
tokenizer

BartTokenizerFast(name_or_path='facebook/bart-large-mnli', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [14]:
# small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(10))
# small_val_dataset = tokenized_datasets['val'].shuffle(seed=42).select(range(10))
# print_gpu_utilization()

In [15]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# NOTE(review): `training_args` is reassigned in the next cell before the Trainer is built,
# so this instance is never used by the training run.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
# Accuracy metric used by compute_metrics below.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute the module-level `metric` from an evaluation result.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` may be
            the logits array itself, or a tuple/list whose first element is
            the logits; any further elements are ignored.

    Returns:
        Whatever ``metric.compute`` returns for the argmax predictions.
    """
    model_outputs, references = eval_pred
    # Take element 0 rather than unpacking exactly two values, so a bare
    # logits array or a tuple of any length is handled.
    if isinstance(model_outputs, (tuple, list)):
        logits = model_outputs[0]
    else:
        logits = model_outputs
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=references)

print_gpu_utilization()

GPU memory occupied: 393 MB.


In [16]:
from transformers import TrainingArguments, Trainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# Training configuration: evaluate once per epoch, log every 100 steps,
# and use a batch size of 1 per device for both training and evaluation.
training_args = TrainingArguments(
# training_args = Seq2SeqTrainingArguments(
    output_dir="test_trainer_bart",
    evaluation_strategy="epoch",
    logging_steps=100,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)

# Fine-tune on the small subsets; accuracy is reported via compute_metrics.
trainer = Trainer(
# trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)
# In the recorded run the reading rises from 393 MB to 2654 MB once the Trainer exists.
print_gpu_utilization()

GPU memory occupied: 2654 MB.


In [None]:
print_gpu_utilization()
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 407344131


GPU memory occupied: 2654 MB.


Epoch,Training Loss,Validation Loss


Saving model checkpoint to test_trainer_bart/checkpoint-500
Configuration saved in test_trainer_bart/checkpoint-500/config.json
Model weights saved in test_trainer_bart/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer_bart/checkpoint-1000
Configuration saved in test_trainer_bart/checkpoint-1000/config.json
Model weights saved in test_trainer_bart/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 200
  Batch size = 1


In [None]:
model

In [None]:
# Put the encoded inputs on the model's current device. In the recorded run the model
# is on the GPU once the Trainer has been built, so CPU inputs would not match it.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits
# Build the index range on the logits' device so the boolean mask and the indexed
# tensor live in the same place.
predicted_class_ids = torch.arange(0, logits.shape[-1], device=logits.device)[torch.sigmoid(logits).squeeze(dim=0) > 0.5]
logits, predicted_class_ids

In [None]:
model.config.id2label

In [None]:
model.config._num_labels

In [None]:
# does model need to have problem_type="multi_label_classification"?
model.config

In [None]:
num_labels = len(model.config.id2label)

# Turn the predicted class ids into a float multi-hot row and feed it back as `labels`.
# NOTE(review): this reassigns the module-level `labels`, which create_target_sentences
# reads as the list of finding names — re-run the cell that defines that list before
# calling create_target_sentences again.
labels = torch.sum(
    torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1
).to(torch.float)
loss = model(**inputs, labels=labels).loss
loss

In [None]:
labels

In [None]:
# from https://huggingface.co/joeddav/bart-large-mnli-yahoo-answers
label = 'cat'
premise = 'I love cats and dogs'
hypothesis = f'This example is {label}.'

# run through model pre-trained on MNLI
# Follow the model's current device instead of hard-coding GPU 0, so the cell also
# works when the model has not been moved to the GPU.
device = model.device
# `truncation=` replaces the deprecated `truncation_strategy=` keyword and matches
# the call in tokenize_function.
x = tokenizer.encode(premise, hypothesis, return_tensors='pt', truncation='only_first')
print_gpu_utilization()
# Element 0 of the model output is the logits tensor (no labels are passed here).
logits = model(x.to(device))[0]
print(logits)
print_gpu_utilization()

In [None]:
x

In [None]:
# Encode the premise alone, to compare with the pair encoding `x`.
# `truncation=` replaces the deprecated `truncation_strategy=` keyword.
y = tokenizer.encode(premise, return_tensors='pt', truncation='only_first')
y

In [None]:
# loss=None????
# NOTE(review): no `labels` argument is passed in this call, which is presumably why
# the output carries no loss; the cell that reads `.loss` passes `labels=...` — confirm.
out = model(x.to(device))
out

In [None]:
len(out)

In [None]:
out[0]

In [None]:
out[1]

In [None]:
out[2]

In [None]:
x

In [None]:
# Keep only the contradiction (id 0) and entailment (id 2) logits, dropping neutral (id 1);
# the ids follow the `model.config.label2id` mapping displayed earlier in the notebook.
entail_contradiction_logits = logits[:,[0,2]]
# Softmax over the two remaining classes; column 1 is the entailment column.
probs = entail_contradiction_logits.softmax(dim=1)
prob_label_is_true = probs[:,1]
prob_label_is_true

In [None]:
tokenizer(premise, premise, premise)

In [None]:
tokenizer