In [3]:
!pip install pynvml

[0m

In [4]:
import gc
import os
from pynvml import *

import torch
import pandas as pd
import numpy as np

from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

from tqdm import tqdm
from feedback_custom_funtions import loss_fn, optimizer_setup, FeedBackDataset, RMSELoss, compute_metrics
from model_building import MeanPooling, MaxPooling, MinPooling, AttentionPooling, FeedBackModel


# Silence library warnings so that cell output stays readable
import warnings
warnings.filterwarnings(action="ignore")

# CUDA_LAUNCH_BLOCKING=1 is set to get more descriptive CUDA error messages;
# WANDB_DISABLED=true is the switch for turning off Weights & Biases reporting.
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "WANDB_DISABLED": "true",
})

def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one GPU, in MB.

    The figure comes from NVML, so it is the memory occupied on the device
    as a whole, not only what this process has allocated.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which is
            the device this helper always inspected before the parameter
            was added.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # `info.used` is in bytes; integer-divide down to whole megabytes.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so repeated calls from
        # different cells do not leave NVML initialised.
        nvmlShutdown()
    
def print_summary(result):
    """Report the duration and throughput of a training run, then GPU memory.

    Args:
        result: Object exposing a ``metrics`` mapping that contains the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    stats = result.metrics
    print(f"Time: {stats['train_runtime']:.2f}")
    print(f"Samples/second: {stats['train_samples_per_second']:.2f}")
    # Finish with the current GPU memory footprint.
    print_gpu_utilization()

In [None]:
# Backbone checkpoint; the pooling head below is sized from its config.
_backbone_name = "microsoft/deberta-v3-large"
# Read the hidden size from the checkpoint's config instead of hard-coding
# 1024, so switching to another backbone keeps the pooling layer in sync.
_backbone_hidden_size = AutoConfig.from_pretrained(_backbone_name).hidden_size

# Training configuration: every tunable for the fine-tuning run lives here.
config = {"seed": 42,
          "epochs": 1,
          "debug" : True,
          "model_name": _backbone_name,
          "PoolingLayer": AttentionPooling(_backbone_hidden_size),
          "group" : "deberta-v3-Large-AP-LLRD" ,
          "loss_type": "smooth_l1", # ['mse', 'rmse', 'smooth_l1']
          "train_batch_size": 4,
          "valid_batch_size": 8,
          "fp16_enable" : True,
          "max_length": 512,
          "layerwise" : True,
          "learning_rate": 1e-5,
          "decoder_lr": 1e-4,
          "weight_decay": 1e-6,
          "n_fold": 4,
          "n_accumulate": 4,
          "max_grad_norm": 1000,
          "num_classes": 6,
          "target_cols": ["cohesion", "syntax", "vocabulary", 
                          "phraseology", "grammar", "conventions"],
          # Use the first GPU when one is available, otherwise the CPU.
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
          "competition": "FeedBack3",
          "_wandb_kernel": "hazrul",
          # When True, the next cell loads a model only to report its GPU footprint.
          "check_model_gpu_usage": False
          }

In [None]:
if config["check_model_gpu_usage"]:
    # Load a model once purely to see how much GPU memory it occupies
    # after being moved to the device.
    probe_checkpoint = "microsoft/deberta-xlarge"
    config_model = AutoConfig.from_pretrained(probe_checkpoint)
    model = FeedBackModel(probe_checkpoint,
                          config["num_classes"],
                          AttentionPooling(config_model.hidden_size)).to(config["device"])
    print_gpu_utilization()

    del model
    # Collect garbage first so tensors kept alive only by reference cycles
    # are freed, then hand the now-unused cached blocks back to the driver.
    gc.collect()
    torch.cuda.empty_cache()

FeedBackModel Architecture GPU usage

1. Large : GPU memory occupied: 2321 MB
2. Base : GPU memory occupied: 1403 MB
3. Small : GPU memory occupied: 1223 MB

In [None]:
# CSV with fold assignments; its `kfold` column is used further down to split
# the rows into training and validation sets.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/feedbackprizemultilabelstratifiedkfold/kfold_train_FB_comptetion.csv"
)

In [None]:
# Load the tokenizer for the configured checkpoint and keep a handle to it in
# `config` so downstream code can reach it from there.
config["tokenizer"] = tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
# Batch collator built on that same tokenizer.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
class CustomTrainer(Trainer):
    """Trainer that scores batches with the notebook's configurable `loss_fn`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and compute the loss selected by `config['loss_type']`.

        Args:
            model: Model called as ``model(input_ids, attention_mask)``; its
                output must expose a ``logits`` attribute.
            inputs: Batch mapping with ``input_ids``, ``attention_mask`` and
                ``target`` entries.
            return_outputs: When True, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted only so the override stays callable
                by Trainer versions that pass this keyword; it is not used.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when `return_outputs` is True.
        """
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        loss = loss_fn(outputs.logits, inputs['target'], loss_type=config['loss_type'])
        return (loss, outputs) if return_outputs else loss

In [None]:
# Hold out fold 1 for validation and train on every other fold.
df_train = df[df.kfold != 1].reset_index(drop=True)
df_valid = df[df.kfold == 1].reset_index(drop=True)

train_dataset = FeedBackDataset(df_train, tokenizer=config['tokenizer'], max_length=config['max_length'], target_label = config["target_cols"])
valid_dataset = FeedBackDataset(df_valid, tokenizer=config['tokenizer'], max_length=config['max_length'], target_label = config["target_cols"])

model = FeedBackModel(config['model_name'], config["num_classes"], PoolingLayer = config["PoolingLayer"]).to(config['device'])

# Define Optimizer and Scheduler
optimizer, scheduler = optimizer_setup(model=model, 
                                       config=config, 
                                       train_dataset_size =len(train_dataset),
                                       layerwise = config["layerwise"]
                                      )

# NOTE(review): an optimizer/scheduler pair is handed to the trainer below, so
# `learning_rate` and `weight_decay` here presumably do not drive the updates;
# confirm against `optimizer_setup`.
training_args = TrainingArguments(
    output_dir="outputs",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=config['train_batch_size'],
    per_device_eval_batch_size=config['valid_batch_size'],
    num_train_epochs= config['epochs'],
    learning_rate= config['learning_rate'],
    weight_decay= config['weight_decay'],
    gradient_accumulation_steps=config['n_accumulate'],
    max_grad_norm=config['max_grad_norm'],
    seed=config['seed'],
    fp16  = config["fp16_enable"],
    fp16_full_eval  =config["fp16_enable"],
    group_by_length = True,
    # Keep the checkpoint with the lowest validation MCRMSE.
    metric_for_best_model= 'eval_mcrmse',
    load_best_model_at_end=True,
    greater_is_better=False,
    save_strategy="epoch",
    save_total_limit=1,
    # The string "none" disables every reporting integration; passing None
    # falls back to the library default instead of turning reporting off.
    report_to="none",
    label_names = ["target"]
)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collate_fn,
    optimizers=(optimizer, scheduler),
    compute_metrics=compute_metrics
)

result = trainer.train()
print_summary(result)

del model, train_dataset, valid_dataset
# NOTE(review): `trainer` was built with this model and is still alive here,
# so the weights are only released once `trainer` is rebound or deleted too.
# Collect garbage before emptying the CUDA cache so memory freed by the
# collector is actually returned to the driver.
gc.collect()
torch.cuda.empty_cache()

1. Training & eval batch size : 2, gradient accumulation 8, fp16 True, total batch size 16
    - Time: 1008.12
    - Samples/second: 2.91
    - GPU memory occupied: 11591 MB.
    
2. Training batch size: 4,  eval batch size : 3, gradient accumulation 8, fp16 True, total batch size 32
    - Time: 829.84
    - Samples/second: 3.54
    - GPU memory occupied: 14853 MB.
    - Score >0.5
    
3. Training batch size 8, eval batch size 8, gradient accumulation 2, fp16 True, total batch size 16
    - CUDA out of memory
    
4. Training batch size: 4,  eval batch size : 4, gradient accumulation 4, fp16 True, total batch size 16
    - Time: 826.07
    - Samples/second: 3.55
    - GPU memory occupied: 14853 MB.
    - Score: 0.48364236942074873
5. Training batch size: 4, eval batch size: 8, gradient accumulation 4, fp16 True, total batch size 16

    - Time: 819.41
    - Samples/second: 3.58
    - GPU memory occupied: 14851 MB.
    - Score: 0.4805801246195733
    
## Base
1. Training batch size 8, eval batch size 8, gradient accumulation 4, total batch size 32
    - Time: 242.02
    - Samples/second: 12.12
    - GPU memory occupied: 10173 MB.
    
2. Training batch size 8, eval batch size 16, gradient accumulation 4, total batch size 32
    - Time: 237.08
    - Samples/second: 12.37
    - GPU memory occupied: 10941 MB.
    - Score: 0.5631028740872916

In [11]:
# Inference-time settings. This rebinds `config`, replacing the training
# configuration defined earlier in the notebook.
config = {
    "seed": 42,
    "num_models": 3,
    # Local checkpoint directories for the individual models
    "model_name_0": '../input/debertav3base',
    "model_name_1": "../input/deberta-v3-large/deberta-v3-large",
    "model_name_2": "../input/roberta-base",
    # Checkpoint used by the tokenizer and model cells below
    "model_name": "microsoft/deberta-v3-large",
    "test_batch_size": 64,
    "max_length": 512,
    "num_classes": 6,
    # First GPU when available, CPU otherwise
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "pooling_1": AttentionPooling(1024),
}

In [13]:
# Tokenizer for the checkpoint named in the inference config; stored back in
# `config` so later cells can look it up there.
config["tokenizer"] = tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
# Collator that uses the same tokenizer when assembling batches.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

loading configuration file https://huggingface.co/microsoft/deberta-v3-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f5d66efa509542e643c08a1579633e747d1697b1bec7de32c51c6969a16e81b9.3554ddad32be74b53d95a4b5760f07a2cd799268a921ae9437b1ee7a47adebc9
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_

In [7]:
# Reload the fold-annotated CSV.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/feedbackprizemultilabelstratifiedkfold/kfold_train_FB_comptetion.csv"
)
# Every row outside fold 1 serves as the input set for the prediction run.
test_df = df.loc[df["kfold"] != 1].reset_index(drop=True)

In [14]:
# Build the model described by the inference config and move it to the device.
model = FeedBackModel(config["model_name"], config["num_classes"], config["pooling_1"])
model.to(config['device'])

collate_fn = DataCollatorWithPadding(tokenizer=config['tokenizer'])
test_dataset = FeedBackDataset(df=test_df,
                               tokenizer=config['tokenizer'],
                               max_length=config["max_length"],
                               train_mode=False)


training_args = TrainingArguments(
        output_dir=".",
        per_device_eval_batch_size=config['test_batch_size'],
        # Explicitly disable reporting integrations for this prediction-only run.
        report_to="none",
        label_names=["target"]
    )

trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=collate_fn)

# Run prediction, then report GPU memory while the model is still resident.
predictions = trainer.predict(test_dataset)
print_gpu_utilization()

del model, test_dataset
# Collect garbage before emptying the CUDA cache so memory freed by the
# collector is actually returned to the driver.
gc.collect()
torch.cuda.empty_cache()

loading configuration file https://huggingface.co/microsoft/deberta-v3-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f5d66efa509542e643c08a1579633e747d1697b1bec7de32c51c6969a16e81b9.3554ddad32be74b53d95a4b5760f07a2cd799268a921ae9437b1ee7a47adebc9
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_

GPU memory occupied: 13607 MB.


825

In [17]:
# Display the metrics recorded by `trainer.predict` for the run above.
predictions.metrics

{'test_runtime': 228.6626,
 'test_samples_per_second': 12.822,
 'test_steps_per_second': 0.201}

Test batch size 32 - GPU memory occupied: 7471 MB.
Test batch size 64 - GPU memory occupied: 13607 MB.
{'test_runtime': 228.6626,
 'test_samples_per_second': 12.822,
 'test_steps_per_second': 0.201}