## settings


In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import re
import time
import glob
import json
import pickle
import random
import shutil
import tarfile
import requests
from tqdm import tqdm
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig


In [None]:
from google.colab import drive
drive.mount('/content/drive/')

rd = '/content/drive/MyDrive/Example/'
if not os.path.exists(rd):
  os.makedirs(rd)

%cd $rd

In [3]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [4]:
# make 'data' folder (if not exists)
data_dir = './data/'
if not os.path.exists(data_dir):
  print(f'Making new folder: {data_dir}')
  os.mkdir(data_dir)


# load IMDb dataset
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
file_name = 'aclImdb_v1.tar.gz'
src_path = '/content/extracted_files'
path_dict = {
    'train_pos': src_path + '/aclImdb/train/pos',
    'train_neg': src_path + '/aclImdb/train/neg',
    'test_pos': src_path + '/aclImdb/test/pos',
    'test_neg': src_path + '/aclImdb/test/neg'
}
dfs = []

if not (os.path.isfile(data_dir + 'IMDb_train.tsv') and os.path.isfile(data_dir + 'IMDb_test.tsv')):
  print('Downloading...')

  try:
    response = requests.get(url)
    response.raise_for_status()
    with open(file_name, 'wb') as f:
      f.write(response.content)
  except requests.exceptions.RequestException as e:
    print(f'Failed to download {file_name}: {e}')

  print('Extracting...')
  with tarfile.open(file_name, 'r:gz') as tar:
    tar.extractall(path=src_path)

  for k, v in path_dict.items():
    print(f'Concatenating: {k}')
    _dfs = []

    for fname in tqdm(glob.glob(v + '/*.txt')):
      with open(fname, 'r', encoding='utf-8') as f:
        content = f.read()
      label = 1 if 'pos' in fname else 0
      df = pd.DataFrame({'label': [label], 'text': [content]})
      _dfs.append(df)

    combined_df = pd.concat(_dfs, axis=0, ignore_index=True)
    dfs.append(combined_df)

  print('Converting to tsv: train')
  df_train = pd.concat(dfs[:2], axis=0, ignore_index=True)
  df_train.to_csv(data_dir + 'IMDb_train.tsv', sep='\t', index=False)

  print('Converting to tsv: test')
  df_test = pd.concat(dfs[2:], axis=0, ignore_index=True)
  df_test.to_csv(data_dir + 'IMDb_test.tsv', sep='\t', index=False)

  print('Cleaning up...')
  shutil.rmtree(src_path)
  os.remove(file_name)

  print('Done!')

else:
  print('File already exists')


File already exists


In [5]:
class CFG:
  SEED = 42
  NUM_LABELS = 2
  MAX_LENGTH = 512
  EPOCHS = 3
  MAX_LR = 2e-5
  NUM_WARMUP_STEPS = 128
  DROPOUT = 0.05
  MODEL_NAME = 'microsoft/deberta-v3-base'
  DO_VAL = True
  DO_TRAIN = True
  DO_PREDICT = True
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  DEVICE_NAME = ''
  BATCH_SIZE = 16
  if torch.cuda.is_available():
    DEVICE_NAME = torch.cuda.get_device_name(0)
  else:
    DEVICE_NAME = 'CPU'



In [6]:
# seed
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = True


set_seed(CFG.SEED)

In [7]:
# checkpoint
def save_checkpoint(model, optimizer, epoch, save_path):
  checkpoint = {
      'model': model.state_dict(),
      'optimizer': optimizer.state_dict(),
      'epoch': epoch
  }
  torch.save(checkpoint, save_path)


def load_checkpoint(model, optimizer, load_path):
  checkpoint = torch.load(load_path)
  model.load_state_dict(checkpoint['model'])
  optimizer.load_state_dict(checkpoint['optimizer'])
  epoch = checkpoint['epoch']
  return model, optimizer, epoch

## preprocessing

In [8]:
pd.set_option('display.max_colwidth', None)

In [9]:
train_path = rd + '/data/IMDb_train.tsv'
test_path =rd + '/data/IMDb_test.tsv'

df_tr = pd.read_csv(train_path, sep='\t')
df_ts = pd.read_csv(test_path, sep='\t')


In [10]:
def textPreprocessor(text):
  text = re.sub(r'[^.,a-zA-Z0-9\s]', ' ', text)
  text = re.sub(r'<\s*br\s*/?\s*>', '', text)
  text = text.replace('.', ' . ').replace(',', ' , ')
  return text


df_tr['text'] = df_tr['text'].apply(textPreprocessor)
df_ts['text'] = df_tr['text'].apply(textPreprocessor)

# extract some data for predicting
num_shuffle = 20
shuffle_cnt = 0

while shuffle_cnt < num_shuffle:
  df_ts = df_ts.sample(frac=1, random_state=CFG.SEED).reset_index(drop=True)
  shuffle_cnt += 1

df_ts = df_ts.head(512)
df_ts.label.value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,257
1,255


In [None]:
df_ts['text'].iloc[0]

## tokenize

In [12]:
out_dir = './outputs/'
check_dir = out_dir + 'checkpoint/'

if not os.path.exists(out_dir):
  os.makedirs(out_dir)

if not os.path.exists(check_dir):
  os.makedirs(check_dir)

In [13]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)

# tokenizer settings
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer.add_special_tokens({'eos_token': tokenizer.eos_token})

# save tokenizer
tokenizer.save_pretrained(out_dir + 'tokenizer')

('./outputs/tokenizer/tokenizer_config.json',
 './outputs/tokenizer/special_tokens_map.json',
 './outputs/tokenizer/spm.model',
 './outputs/tokenizer/added_tokens.json',
 './outputs/tokenizer/tokenizer.json')

## model settings

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(CFG.MODEL_NAME,
                                                                num_labels=CFG.NUM_LABELS)

model.config.pretraining_pt = 1
model.pad_token_id = tokenizer.pad_token_id

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# we train all the parameters of model by default. Set layers to train depending on your training environment.
# it takes about 1 hour to fine-tune our base_model with L4-GPU.

def set_trainable_layers(device_name):
  if 'Tesla T4' in device_name:
    CFG.BATCH_SIZE = 8
    for param in model.base_model.parameters():
      param.requires_grad = False

    for param in model.classifier.parameters():
      param.requires_grad = True

    for param in model.pooler.parameters():
      param.requires_grad = True

    for param in model.deberta.encoder.layer[-1].parameters():
      param.requires_grad = True

    for param in model.deberta.encoder.layer[-2].parameters():
      param.requires_grad = True

    for param in model.deberta.encoder.layer[-3].parameters():
      param.requires_grad = True
    print(f'connecting to: {device_name}\nbatch_size: {CFG.BATCH_SIZE}\nsome layers were set to trainable.')
    print('If you encounter memory usage errors or other errors during training or prediction, please manually change the batch size or number of layers to train as needed.')

  elif 'NVIDIA L4' in device_name:
    for param in model.base_model.parameters():
      param.requires_grad = True
    print(f'connecting to: {device_name}\nbatch_size: {CFG.BATCH_SIZE}\nall layers were set to trainable.')

  elif 'NVIDIA A100' in device_name:
    CFG.BATCH_SIZE = 32
    for param in model.base_model.parameters():
      param.requires_grad = True

  elif 'CPU' in device_name:
    _device = xm.xla_device()
    if 'xla' in str(_device):
      print('Warning: connecting to TPU runtime but our code is not optimized for xla device.')
      CFG.DEVICE = xm.xla_device()
      CFG.BATCH_SIZE = 16
      for param in model.base_model.parameters():
        param.requires_grad = False

      for param in model.classifier.parameters():
        param.requires_grad = True

      for param in model.pooler.parameters():
        param.requires_grad = True

      for param in model.deberta.encoder.layer[-1].parameters():
        param.requires_grad = True

      for param in model.deberta.encoder.layer[-2].parameters():
        param.requires_grad = True

      for param in model.deberta.encoder.layer[-3].parameters():
        param.requires_grad = True
      print(f'connecting to: TPU\nbatch_size: {CFG.BATCH_SIZE}\nsome layers were set to trainable.')
      print('If you encounter memory usage errors or other errors during training or prediction, please manually change the batch size or number of layers to train as needed.')

    else:
     raise RuntimeError(f'Unsupported device type: {device_name}. Please connect to a supported GPU Runtime or configure for your specific GPU.')

  else:
    print('Warning: Using a CUDA-enabled GPU, but not one of the pre-configured types. Assuming compatibility and proceeding.')
    for param in model.base_model.parameters():
        param.requires_grad = True
    print(f'connecting to: {device_name}\nbatch_size: {CFG.BATCH_SIZE}\nall layers were set to trainable.')
    print('If you encounter memory usage errors or other errors during training or prediction, please manually change the batch size or number of layers to train as needed.')

  total_params = sum(p.numel() for p in model.parameters())
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

  print('-' * 50)
  print(f"Total parameters: {total_params}")
  print(f"Trainable parameters: {trainable_params}")
  print(f"Ratio: {(trainable_params / total_params) * 100}%")



set_trainable_layers(CFG.DEVICE_NAME)

## dataset, dataloader

In [17]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    if idx >= len(self.labels):
      raise IndexError(f"Index {idx} is out of bounds for labels with length {len(self.labels)}")
    text = self.texts[idx]
    label = self.labels[idx]
    token = self.tokenizer(text,
                            padding='max_length',
                            max_length=self.max_length,
                            truncation=True,
                            return_tensors='pt')

    return token, text, label

In [None]:
# DATA SPLIT
if CFG.DO_VAL:
  tr_txt,val_txt, tr_lbl, val_lbl = train_test_split(df_tr['text'].tolist(), df_tr['label'].tolist(), test_size=0.05, random_state=CFG.SEED)
  print(len(tr_txt))
  print(len(val_txt))
  print(len(tr_lbl))
  print(len(val_lbl))

  # dataset
  tr_ds = CustomDataset(tr_txt, tr_lbl, tokenizer, CFG.MAX_LENGTH)
  val_ds = CustomDataset(val_txt, val_lbl, tokenizer, CFG.MAX_LENGTH)
  print(f"Training dataset size: {len(tr_ds)}")
  print(f"Validation dataset size: {len(val_ds)}")

  # dataloader
  tr_dl = DataLoader(tr_ds, batch_size=CFG.BATCH_SIZE, shuffle=True)
  val_dl = DataLoader(val_ds, batch_size=CFG.BATCH_SIZE, shuffle=False)
  print(f"Training dataloader size: {len(tr_dl)}")
  print(f"Validation dataloader size: {len(val_dl)}")

  dl_dict = {
      'train': tr_dl,
      'val': val_dl
  }

  print(f'Training dataset size: {len(tr_ds)}')
  print(f'Validation dataset size: {len(val_ds)}')

else:
  # dataset
  tr_ds = CustomDataset(df_tr['text'], df_tr['label'], tokenizer, CFG.MAX_LENGTH)

  # dataloader
  tr_dl = DataLoader(tr_ds, batch_size=CFG.BATCH_SIZE, shuffle=True)

  dl_dict = {
      'train': tr_dl
  }

  print(f'Training dataset size: {len(tr_ds)}')

## settings for training

In [20]:
# optimzier and lr scheduler
N_SAMPLES = len(tr_ds)
STEPS_PER_EPOCH = N_SAMPLES // CFG.BATCH_SIZE
TOTAL_STEPS = CFG.EPOCHS * STEPS_PER_EPOCH

optimizer = optim.AdamW(model.parameters(), lr=CFG.MAX_LR)

lr_scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=CFG.NUM_WARMUP_STEPS,
    num_training_steps=TOTAL_STEPS
)

print(f'batch_size: {CFG.BATCH_SIZE}\nSTEPS_PER_EPOCH: {STEPS_PER_EPOCH}\nTOTAL_STEPS: {TOTAL_STEPS}')

batch_size: 16
STEPS_PER_EPOCH: 1484
TOTAL_STEPS: 4452


In [21]:
# criterion
def criterion(logits, labels):
  return nn.BCEWithLogitsLoss()(logits, labels)

In [22]:
def trainer(model, dl_dict, criterion, optimizer, lr_scheduler, num_epochs):
  device = CFG.DEVICE
  compute_device = 'GPU' if device.type == 'cuda' else 'CPU'
  print(f'This is {compute_device} trainer!!')
  gc.collect()
  torch.cuda.empty_cache()

  scaler = GradScaler()
  history = defaultdict(list)

  start = time.time()
  model.to(device)
  for epoch in range(1, num_epochs+1):
    tr_dataset_size = 0
    tr_running_loss = 0.0
    tr_running_acc = 0.0
    val_dataset_size = 0
    val_running_loss = 0.0
    val_running_acc = 0.0

    iteration = 0

    # traine phase
    model.train()
    for batch in tqdm(dl_dict['train'], desc=f'Epoch:{epoch}/{num_epochs} | Phase:Train'):
      tokens, _, labels = batch
      input_ids = tokens['input_ids'].squeeze(1).to(CFG.DEVICE)
      attn_masks = tokens['attention_mask'].squeeze(1).to(CFG.DEVICE)
      labels = labels.to(CFG.DEVICE).unsqueeze(1)
      optimizer.zero_grad(set_to_none=True)

      with autocast():
          outputs = model(input_ids=input_ids, attention_mask=attn_masks).logits
          outputs = outputs[:, 1].unsqueeze(1)
          loss = criterion(outputs, labels.float())
          _, preds = torch.max(outputs, 1)

      scaler.scale(loss).backward()
      scaler.step(optimizer)
      scaler.update()
      lr_scheduler.step()

      tr_acc = (torch.sum(preds == labels.squeeze().detach())).double() / input_ids.size(0)
      tr_running_loss += loss.item() * input_ids.size(0)
      tr_running_acc += tr_acc * input_ids.size(0)
      tr_dataset_size += input_ids.size(0)
      iteration += 1

      if (iteration % 100) == 0:
          print(f'Phase:Train | Iteration:{iteration} | Loss:{loss.item()} | Accuracy:{tr_acc.item()}')
          print(f'Correct Predictions: {torch.sum(preds == labels.squeeze().detach()).double()}')
          print(f'Batch Size: {input_ids.size(0)}')

      if (iteration % 500) == 0:
        print(f'\nIteration:{iteration}')
        save_path = f'{check_dir}epoch_{epoch}_iter_{iteration}.pth'
        save_checkpoint(model, optimizer, epoch, save_path)
        print(f'\nModel saved to: {save_path}')

    if CFG.DO_VAL:
      model.eval()
      for batch in tqdm(dl_dict['val'], desc=f'Epoch:{epoch} | Phase:Validate'):
        tokens, _, labels = batch
        input_ids = tokens['input_ids'].squeeze(1).to(CFG.DEVICE)
        attn_masks = tokens['attention_mask'].squeeze(1).to(CFG.DEVICE)
        labels = labels.to(CFG.DEVICE).unsqueeze(1)

        with torch.inference_mode():
          with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attn_masks).logits
            outputs = outputs[:, 1].unsqueeze(1)
            loss = criterion(outputs, labels.float())
            _, preds = torch.max(outputs, 1)

        val_acc = (torch.sum(preds == labels.squeeze().detach())).double() / input_ids.size(0)
        val_running_loss += loss.item() * input_ids.size(0)
        val_running_acc += val_acc * input_ids.size(0)
        val_dataset_size += input_ids.size(0)

    # histoy
    tr_epoch_loss = tr_running_loss / tr_dataset_size
    tr_epoch_acc = tr_running_acc / tr_dataset_size
    history['train_loss'].append(tr_epoch_loss)
    history['train_auroc'].append(tr_epoch_acc)
    print(f'Epoch:{epoch}/{num_epochs} | Train Loss:{tr_epoch_loss} | Train ACC:{tr_epoch_acc}')

    if CFG.DO_VAL:
      val_epoch_loss = val_running_loss / val_dataset_size
      val_epoch_acc = val_running_acc / val_dataset_size
      history['val_loss'].append(val_epoch_loss)
      history['val_auroc'].append(val_epoch_acc)
      print(f'Epoch:{epoch}/{num_epochs} | Val Loss:{val_epoch_loss} | Val ACC:{val_epoch_acc}')

    gc.collect()
    torch.cuda.empty_cache()

  end = time.time()
  time_elapsed = end - start
  print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
      time_elapsed // 3600,
      (time_elapsed % 3600) // 60,
      (time_elapsed % 3600) % 60
  ))

  return history, model


## train

In [None]:
# train
if CFG.DO_TRAIN:
  history, trained_model = trainer(model, dl_dict, criterion, optimizer, lr_scheduler, CFG.EPOCHS)

In [24]:
if CFG.DO_TRAIN:
  torch.save(trained_model.state_dict(), out_dir + 'trained_model.pth')

In [25]:
torch.cuda.empty_cache()
gc.collect()

0

## predict

In [26]:
if CFG.DO_PREDICT:
  batch_size = 4
  tokenizer = AutoTokenizer.from_pretrained(out_dir + 'tokenizer')
  model = AutoModelForSequenceClassification.from_pretrained(CFG.MODEL_NAME, num_labels=CFG.NUM_LABELS)

  model.config.pretraining_pt = 1
  model.pad_token_id = tokenizer.pad_token_id

  model.load_state_dict(torch.load(out_dir + 'trained_model.pth'))
  model.to(CFG.DEVICE)
  model.eval()

  ts_ds = CustomDataset(df_ts['text'], df_ts['label'], tokenizer, CFG.MAX_LENGTH)
  ts_dl = DataLoader(ts_ds, batch_size=batch_size, shuffle=False, pin_memory=True)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for batch in ts_dl:
  tokens, texts, labels = batch
  print(tokens)
  print(texts)
  break

In [31]:
if CFG.DO_PREDICT:
  results = []
  model.to(CFG.DEVICE)

  acc = 0.0
  dataset_size = 0
  for batch in tqdm(ts_dl, desc='Phase:Predict'):
    tokens, texts, labels = batch
    input_ids = tokens['input_ids'].squeeze(1).to(CFG.DEVICE)
    attn_masks = tokens['attention_mask'].squeeze(1).to(CFG.DEVICE)
    labels = labels.to(CFG.DEVICE)

    with torch.inference_mode():
      with autocast():
        outputs = model(input_ids=input_ids, attention_mask=attn_masks, output_attentions=True)
        logits = outputs.logits
        attention = outputs.attentions[-1]

      logits, preds = torch.max(logits, 1)
      acc += (torch.sum(preds == labels.detach())).double()
      dataset_size += input_ids.size(0)

      batch_size, num_heads, seq_len, _ = attention.shape
      all_attn = torch.zeros((batch_size, CFG.MAX_LENGTH), device=attention.device)

      for i in range(num_heads):
        all_attn += attention[:, i, 0, :]
      all_attn /= 12
      all_attn /= all_attn.max(dim=1, keepdim=True)[0]

      probs = F.sigmoid(logits).cpu().numpy()
      preds = preds.cpu().numpy()

      decoded_texts = [tokenizer.decode(ids, skip_special_tokens=False) for ids in input_ids]

      for i in range(len(labels)):
        results.append({
            'pred': preds[i],
            'prob': probs[i],
            'label': labels[i].item(),
            'text': decoded_texts[i],
            'attention': all_attn[i].cpu().numpy()
        })

    del input_ids, attn_masks, labels, outputs, logits, attention, probs, preds
    torch.cuda.empty_cache()
    gc.collect()

  print(f'Test data size: {dataset_size}')
  print(f'Test accuracy: {(acc / dataset_size) * 100}')


Phase:Predict: 100%|██████████| 128/128 [00:27<00:00,  4.70it/s]

Test data size: 512
Test accuracy: 99.21875





In [32]:
for i in range(len(results)):
  text = results[i]['text']
  text = re.sub(r'\[SEP\]', '', text)
  text = re.findall(r'\[CLS\]|\w+|[.,]', text)
  text.append('[SEP]')
  results[i]['text'] = text

for i in range(len(results)):
  attns = results[i]['attention']
  attns = attns[attns != 0.0]
  results[i]['attention'] = attns

In [37]:
def make_serializable(results):
    serializable_results = []
    for item in tqdm(results):
        serializable_item = {
            'pred': int(item['pred']),
            'prob': float(item['prob']),
            'label': int(item['label']),
            'text': item['text'],
            'attention': item['attention'].tolist()
        }
        serializable_results.append(serializable_item)
    return serializable_results

if CFG.DO_PREDICT:
  results = make_serializable(results)

  with open(out_dir + 'results.json', 'w') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

100%|██████████| 512/512 [00:00<00:00, 77269.85it/s]
