In [1]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in whole MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which
            is the device every existing call in this notebook reports on.

    The value printed is the ``used`` field of ``nvmlDeviceGetMemoryInfo``
    integer-divided by 1024**2.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so repeated probes do not
        # keep stacking NVML references for the lifetime of the kernel.
        nvmlShutdown()


def print_summary(result):
    """Report runtime and throughput of a training run, then GPU memory.

    Args:
        result: object exposing a ``metrics`` mapping that contains the
            ``train_runtime`` and ``train_samples_per_second`` entries.
    """
    stats = result.metrics
    runtime = stats['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = stats['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    # Finish with the memory probe so it appears directly under the timings.
    print_gpu_utilization()
    
# Baseline reading, taken in the first cell before any model or dataset is loaded.
print_gpu_utilization()

import os
# NOTE(review): presumably read by transformers to skip its MLflow callback.
# It is set here, before the Trainer is constructed — confirm the variable
# name is still honoured by the installed version.
os.environ['DISABLE_MLFLOW_INTEGRATION'] = 'TRUE'

GPU memory occupied: 436 MB.


In [2]:
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
import datasets
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
# Memory check after the imports above.
print_gpu_utilization()

GPU memory occupied: 436 MB.


In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Single source of truth for the checkpoint id, so the tokenizer and the
# model cannot drift apart when one of them is edited.
MODEL_CHECKPOINT = "facebook/bart-large-mnli"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
print_gpu_utilization()
# Last expression of the cell: displays the checkpoint's label -> id mapping.
model.config.label2id

GPU memory occupied: 436 MB.


{'contradiction': 0, 'entailment': 2, 'neutral': 1}

In [4]:
# Load the dataset previously saved to disk; the last line displays its structure.
# NOTE(review): the absolute path is machine-specific — consider a configurable data directory.
dataset = datasets.load_from_disk("/root/data/bart_fine_tune")
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 716128
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 204610
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 102305
    })
})

In [5]:
# Fixed-seed shuffle, then keep the first 10000 training rows.
small_train_dataset = (
    dataset['train']
    .shuffle(seed=42)
    .select(range(10000))
)
# Same recipe for validation, keeping the first 1000 rows.
small_val_dataset = (
    dataset['val']
    .shuffle(seed=42)
    .select(range(1000))
)

print_gpu_utilization()
# Last expression of the cell: displays the training subset.
small_train_dataset

Loading cached shuffled indices for dataset at /root/data/bart_fine_tune/train/cache-0024ad8d45ef04ef.arrow
Loading cached shuffled indices for dataset at /root/data/bart_fine_tune/val/cache-b8c1ca28e970e932.arrow


GPU memory occupied: 436 MB.


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

In [6]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics below calls its .compute().
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is either
            the logits array itself or a tuple/list of model outputs whose
            first element is the logits.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each
        row of the logits, compared against ``labels``.
    """
    model_outputs, labels = eval_pred
    # Take the logits by position instead of unpacking a fixed-length tuple:
    # this works for any number of extra model outputs and for bare logits,
    # and cannot mistake the rows of a 2-row logits array for two outputs.
    if isinstance(model_outputs, (tuple, list)):
        logits = model_outputs[0]
    else:
        logits = model_outputs
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Memory check after loading the metric.
print_gpu_utilization()

GPU memory occupied: 436 MB.


In [7]:
from transformers import TrainingArguments, Trainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# Configuration for a single-epoch run that evaluates once per epoch.
training_args = TrainingArguments(
    output_dir="test_trainer_bart",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    logging_steps=10,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    gradient_accumulation_steps=10, # effective batch size per device is per_device_train_batch_size * gradient_accumulation_steps (3 * 10 = 30)
)

# Bind the model, the 10000/1000-row subsets and the accuracy function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)
# Memory check once the Trainer has been constructed.
print_gpu_utilization()

GPU memory occupied: 2773 MB.


In [8]:
# Memory reading immediately before training starts.
print_gpu_utilization()
# Keep the TrainOutput so its metrics can be summarised instead of discarded.
train_result = trainer.train()
# Prints runtime, samples/second and the post-training GPU memory.
print_summary(train_result)
# Last expression of the cell: still displays the TrainOutput as before.
train_result

***** Running training *****
  Num examples = 10000
  Num Epochs = 1
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 30
  Gradient Accumulation steps = 10
  Total optimization steps = 333
  Number of trainable parameters = 407344131


GPU memory occupied: 2773 MB.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.0403,0.041433,0.99


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 3


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=333, training_loss=0.13549290288676014, metrics={'train_runtime': 2510.7965, 'train_samples_per_second': 3.983, 'train_steps_per_second': 0.133, 'total_flos': 2.171398665074688e+16, 'train_loss': 0.13549290288676014, 'epoch': 1.0})

In [11]:
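# Commented out: the output shown under this cell (21439 MB -> 438 MB) comes
# from an earlier execution in which these lines were enabled.
# NOTE(review): resetting the CUDA device through numba presumably invalidates
# the model/trainer objects still held by the kernel — confirm before re-enabling.
# print_gpu_utilization()
# from numba import cuda
# device = cuda.get_current_device()
# device.reset()
# print_gpu_utilization()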

GPU memory occupied: 21439 MB.
GPU memory occupied: 438 MB.


In [11]:
# Directory that receives the model's config and weights.
# NOTE(review): absolute, machine-specific path; only the model is saved here,
# not the tokenizer — confirm that is intended.
model_dir = "/root/models/bart_10k_3_11_23"
model.save_pretrained(model_dir)

Configuration saved in /root/models/bart_10k_3_11_23/config.json
Model weights saved in /root/models/bart_10k_3_11_23/pytorch_model.bin


In [7]:
from transformers import TrainingArguments, Trainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# Repeat of the earlier training configuration; the only difference is logging_steps=3.
training_args = TrainingArguments(
    output_dir="test_trainer_bart",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    logging_steps=3,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    gradient_accumulation_steps=10, # effective batch size per device is per_device_train_batch_size * gradient_accumulation_steps (3 * 10 = 30)
)

# NOTE(review): `model` is whatever object is live in the kernel when this cell
# runs — confirm it is a freshly loaded checkpoint rather than one already trained.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
    compute_metrics=compute_metrics,
)
# Memory readings bracket the training call; train()'s return value is discarded here.
print_gpu_utilization()
trainer.train()
print_gpu_utilization()

***** Running training *****
  Num examples = 10000
  Num Epochs = 1
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 30
  Gradient Accumulation steps = 10
  Total optimization steps = 333
  Number of trainable parameters = 407344131


GPU memory occupied: 2773 MB.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [8]:
# Display the records the trainer logged so far (loss, learning rate, epoch, step).
trainer.state.log_history

[{'loss': 0.5972,
  'learning_rate': 4.954954954954955e-05,
  'epoch': 0.01,
  'step': 3},
 {'loss': 0.2903,
  'learning_rate': 4.90990990990991e-05,
  'epoch': 0.02,
  'step': 6},
 {'loss': 0.2196,
  'learning_rate': 4.8648648648648654e-05,
  'epoch': 0.03,
  'step': 9},
 {'loss': 0.5475,
  'learning_rate': 4.8198198198198205e-05,
  'epoch': 0.04,
  'step': 12}]

In [10]:
import json

# Serialise first: if the history cannot be encoded, the error is raised
# before the target file is opened and truncated.
# NOTE(review): this is the history of the trainer currently in the kernel —
# verify it belongs to the run whose weights are stored in model_dir.
json_object = json.dumps(trainer.state.log_history, indent=4)

with open(f"{model_dir}/log_history.json", "w") as outfile:
    outfile.write(json_object)