In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# pip, import

In [2]:
!pip install transformers
!pip install transformers datasets
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 6.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 73.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import copy
from pprint import pprint
from tqdm import tqdm, tqdm_notebook
from collections import defaultdict, Counter, deque
import re
from itertools import chain
from importlib import import_module
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import (Dataset,
                              DataLoader, 
                              RandomSampler, 
                              SequentialSampler, 
                              TensorDataset)
from transformers import TrainingArguments, Trainer
from transformers import (AutoConfig, 
                          AutoTokenizer, 
                          RobertaForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback)
from transformers import AdamW
from transformers import (get_scheduler, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_linear_schedule_with_warmup)
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from tqdm.auto import tqdm
from datasets import load_metric, load_dataset, Dataset, concatenate_datasets
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import (accuracy_score, 
                             precision_recall_curve,
                             f1_score,
                             auc)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
import math
import easydict
import wandb

# 시드고정

In [4]:
def seed_everything(seed: int = 42, contain_cuda: bool = False):
  os.environ['PYTHONHASHSEED'] = str(seed)
  random.seed(seed)
  np.random.seed(seed)

  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  print(f"Seed set as {seed}")

seed = 42
seed_everything(seed)

Seed set as 42


In [5]:
root_dir = '/content/drive/MyDrive'
project_folder = "DACON"
os.chdir(os.path.join(root_dir,project_folder))

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# wandb로 잘못 분류하는 문장 기록

In [6]:
def wrong_batch_for_wandb(tokenizer,
                          wrong_sample_index,
                          input_ids,
                          valid_labels,
                          valid_predict,
                          valid_output,
                          ):
  num_to_label_dict = {0:'diff', 1:'same',}

  wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
  wrong_sample_text = [tokenizer.decode(element, skip_special_tokens=False) for element in input_ids[wrong_sample_index]]
  wrong_sample_label = [num_to_label_dict[lab] for lab in list(valid_labels[wrong_sample_index])]
  wrong_sample_pred = [num_to_label_dict[pred] for pred in list(valid_predict[wrong_sample_index])]
  wrong_sample_output = valid_output[wrong_sample_index].tolist()

  diff_prob, same_prob = [], []
  for element in wrong_sample_output:
      diff_prob.append(element[0])
      same_prob.append(element[1])

  return wrong_sample_text, wrong_sample_label, wrong_sample_pred, diff_prob, same_prob


# Optimizer

In [7]:
class AdamP(Optimizer):
  def __init__(
      self,
      params,
      lr=1e-3,
      betas=(0.9, 0.999),
      eps=1e-8,
      weight_decay=0,
      delta=0.1,
      wd_ratio=0.1,
      nesterov=False,
      ):
    defaults = dict(
        lr=lr,
        betas=betas,
        eps=eps,
        weight_decay=weight_decay,
        delta=delta,
        wd_ratio=wd_ratio,
        nesterov=nesterov,
        )
    super(AdamP, self).__init__(params, defaults)

  def _channel_view(self, x):
    return x.view(x.size(0), -1)

  def _layer_view(self, x):
    return x.view(1, -1)

  def _cosine_similarity(self, x, y, eps, view_func):
    x = view_func(x)
    y = view_func(y)

    return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()

  def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
    wd = 1
    expand_size = [-1] + [1] * (len(p.shape) - 1)
    for view_func in [self._channel_view, self._layer_view]:
      
      cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)
      
      if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
        p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps)
        perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size)
        wd = wd_ratio
        return perturb, wd

    return perturb, wd

  def step(self, closure=None):
    loss = None
    if closure is not None:
      loss = closure()

    for group in self.param_groups:
      for p in group["params"]:
        if p.grad is None:
          continue

        grad = p.grad.data
        beta1, beta2 = group["betas"]
        nesterov = group["nesterov"]

        state = self.state[p]

        # State initialization
        if len(state) == 0:
          state["step"] = 0
          state["exp_avg"] = torch.zeros_like(p.data)
          state["exp_avg_sq"] = torch.zeros_like(p.data)

        # Adam
        exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

        state["step"] += 1
        bias_correction1 = 1 - beta1 ** state["step"]
        bias_correction2 = 1 - beta2 ** state["step"]

        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(
            group["eps"]
            )
        step_size = group["lr"] / bias_correction1

        if nesterov:
          perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
        else:
          perturb = exp_avg / denom

        # Projection
        wd_ratio = 1
        if len(p.shape) > 1:
          perturb, wd_ratio = self._projection(
              p,
              grad,
              perturb,
              group["delta"],
              group["wd_ratio"],
              group["eps"],
              )

          # Weight decay
        if group["weight_decay"] > 0:
          p.data.mul_(1 - group["lr"] * group["weight_decay"] * wd_ratio)

          # Step
        p.data.add_(perturb, alpha=-step_size)

    return loss

def get_optimizer(model, args):
  if args.optimizer == "Adam":
    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
  elif args.optimizer == "AdamW":
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
  elif args.optimizer == "AdamP":
    optimizer = AdamP(
        model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        weight_decay=0.01,
        delta=0.1,
        wd_ratio=0.1,
        nesterov=False,
        )
  else:
    raise NotImplementedError('Optimizer not available')

  # 모든 parameter들의 grad값을 0으로 초기화
  optimizer.zero_grad()

  return optimizer

# Scheduler

In [8]:
class CosineAnnealingWarmupRestarts(_LRScheduler):
  """
    optimizer (Optimizer): Wrapped optimizer.
    first_cycle_steps (int): First cycle step size.
    cycle_mult(float): Cycle steps magnification. Default: -1.
    max_lr(float): First cycle's max learning rate. Default: 0.1.
    min_lr(float): Min learning rate. Default: 0.001.
    warmup_steps(int): Linear warmup step size. Default: 0.
    gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
    last_epoch (int): The index of last epoch. Default: -1.
  """
  def __init__(self,
               optimizer : torch.optim.Optimizer,
               first_cycle_steps : int,
               cycle_mult : float = 1.,
               max_lr : float = 0.1,
               min_lr : float = 0.001,
               warmup_steps : int = 0,
               gamma : float = 1.,
               last_epoch : int = -1
               ):
    assert warmup_steps < first_cycle_steps
        
    self.first_cycle_steps = first_cycle_steps # first cycle step size
    self.cycle_mult = cycle_mult # cycle steps magnification
    self.base_max_lr = max_lr # first max learning rate
    self.max_lr = max_lr # max learning rate in the current cycle
    self.min_lr = min_lr # min learning rate
    self.warmup_steps = warmup_steps # warmup step size
    self.gamma = gamma # decrease rate of max learning rate by cycle
    
    self.cur_cycle_steps = first_cycle_steps # first cycle step size
    self.cycle = 0 # cycle count
    self.step_in_cycle = last_epoch # step size of the current cycle
    
    super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)
        
    # set learning rate min_lr
    self.init_lr()
    
  def init_lr(self):
    self.base_lrs = []
    for param_group in self.optimizer.param_groups:
      param_group['lr'] = self.min_lr
      self.base_lrs.append(self.min_lr)
    
  def get_lr(self):
    if self.step_in_cycle == -1:
      return self.base_lrs
    elif self.step_in_cycle < self.warmup_steps:
      return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
    else:
      return [base_lr + (self.max_lr - base_lr) \
              * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \
                              / (self.cur_cycle_steps - self.warmup_steps))) / 2
              for base_lr in self.base_lrs]

  def step(self, epoch=None):
    if epoch is None:
      epoch = self.last_epoch + 1
      self.step_in_cycle = self.step_in_cycle + 1
      if self.step_in_cycle >= self.cur_cycle_steps:
        self.cycle += 1
        self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
        self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
      else:
        if epoch >= self.first_cycle_steps:
          if self.cycle_mult == 1.:
            self.step_in_cycle = epoch % self.first_cycle_steps
            self.cycle = epoch // self.first_cycle_steps
          else:
            n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
            self.cycle = n
            self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
            self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
        else:
          self.cur_cycle_steps = self.first_cycle_steps
          self.step_in_cycle = epoch
                
        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


def get_scheduler(optimizer, args, total_batch_):
  if args.scheduler == "plateau":
      scheduler = ReduceLROnPlateau(
          optimizer, patience=2, factor=0.85, mode="max", verbose=True
      )
  elif args.scheduler == "linear":
      scheduler = get_linear_schedule_with_warmup(
          optimizer,
          # num_warmup_steps=int(total_batch_*args.epochs*0.1),
          num_warmup_steps=args.warmup_steps,
          num_training_steps=int(total_batch_*args.epochs),
      )
  elif args.scheduler == "cosine":
      scheduler = CosineAnnealingWarmupRestarts( # ver1: first_cycle=20, warmup_steps=5, cycle_mult=1.0, max_lr=args.lr, min_lr=args.lr/100, gamma=0.8, patience=7, 
          optimizer,                             # ver2: first_cycle=30, warmup_steps=5, cycle_mult=0.8, max_lr=args.lr, min_lr=args.lr/100, gamma=0.8, patience=5
          first_cycle_steps=200,                  # ver3: first_cycle=50, warmup_steps=10, cycle_mult=1.0, max_lr=args.lr, min_lr=args.lr/100, gamma=0.8, patience=7
          warmup_steps=args.warmup_steps,
          cycle_mult=args.cycle_mult,
          max_lr=args.lr,
          min_lr=args.lr * 0.01,
          gamma=0.9,
      )
  else:
    raise NotImplementedError('LR Scheduler not available')

  return scheduler


# Loss

In [9]:
class FocalLoss(nn.Module):
  def __init__(self, weight=None,
               gamma=2., reduction='mean'):
    nn.Module.__init__(self)
    self.weight = weight
    self.gamma = gamma
    self.reduction = reduction

  def forward(self, input_tensor, target_tensor):
    log_prob = F.log_softmax(input_tensor, dim=-1)
    prob = torch.exp(log_prob)
    return F.nll_loss(
        ((1 - prob) ** self.gamma) * log_prob,
        target_tensor,
        weight=self.weight,
        reduction=self.reduction
        )

class LabelSmoothingLoss(nn.Module):
  def __init__(self, classes=3, smoothing=0.0, dim=-1):
    super(LabelSmoothingLoss, self).__init__()
    self.confidence = 1.0 - smoothing
    self.smoothing = smoothing
    self.cls = classes
    self.dim = dim

  def forward(self, pred, target):
    pred = pred.log_softmax(dim=self.dim)
    with torch.no_grad():
      true_dist = torch.zeros_like(pred)
      true_dist.fill_(self.smoothing / (self.cls - 1))
      true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
    return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))


def get_criterion(args):
  if args.smoothing!=0 and args.criterion == 'smoothing':
    criterion = LabelSmoothingLoss(smoothing=args.smoothing)
  elif args.criterion == 'cross':
    criterion = nn.CrossEntropyLoss()
  elif args.criterion == 'focal':
    criterion = FocalLoss(gamma=2.0)
  else:
    raise NotImplementedError('Criterion not available')
  return criterion

# Tokenize

In [10]:
tdataset = load_dataset("csv", data_files='/content/drive/MyDrive/train_data_lv1.csv')['train']
vdataset = load_dataset("csv", data_files='/content/drive/MyDrive/valid_data_lv1.csv')['train']
rawdataset = concatenate_datasets([tdataset, vdataset])
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding='max_length', max_length=512, truncation=True)
    outputs['labels'] = examples['similar']
    return outputs

dataset = rawdataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])

Using custom data configuration default-e7859c9c4620c0f4


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e7859c9c4620c0f4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e7859c9c4620c0f4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-cf13ab19521ea58f


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-cf13ab19521ea58f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-cf13ab19521ea58f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

  0%|          | 0/660000 [00:00<?, ?ex/s]

# Arguments 설정

In [11]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'current device : {device}')

args = easydict.EasyDict({
    "seed":42,
    "optimizer":"AdamW",    # help = (AdamW, Adam, AdamP)
    "scheduler":"linear",     # help= (linear, cosine, plateau ...)
    "warmup_steps":500,
    "cycle_mult":1.2,
    "batch_size": 16,
    "patience":5,
    "n_splits":5,
    "epochs":3,
    "lr": 2e-05,
    "criterion":'cross', # 'smoothing','focal','cross'
    "smoothing": 0.0,
    "model": "microsoft/graphcodebert-base",
    "logging_wrong_samples":True,
    })

project_name = "graphcodebert_Bs16_OptAdamW_ScduCosine_Sm0.0"
args.update(
            {
                "project_name":project_name,
                "model_name":project_name,
             }
            )

seed_everything(args.seed)

current device : cuda:0
Seed set as 42


# Train

In [12]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [13]:
criterion = get_criterion(args)
config =  AutoConfig.from_pretrained("microsoft/graphcodebert-base")
config.num_labels = 2
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", config=config)
model.to(device)

best_val_acc_list = []

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_

In [14]:
gap = int(len(dataset) / args.n_splits)

In [15]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# 2 fold

In [None]:
f = 2

print(f"---------------------------------- {f} fold----------------------------------")

run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
del_ids = list(range((f-1)*gap, f*gap))
training_ids = set(total_ids) - set(del_ids)

training_dset = dataset.select(list(training_ids))
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainloader = DataLoader(training_dset,
                          batch_size=16,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=16,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

[34m[1mwandb[0m: Currently logged in as: [33mnahyeonkang[0m. Use [1m`wandb login --relogin`[0m to force relogin


---------------------------------- 2 fold----------------------------------


## 1 epoch

In [None]:
f = 2
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()	
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)
  
  optimizer.zero_grad()
  
  outputs = model(input_ids, attention_mask) ## label을 안 넣어서 logits값만 출력
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  scheduler.step()
  for learning_rate in scheduler.get_lr():
    wandb.log({"learning_rate": learning_rate})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

------------------------------ 2 fold 1 epoch------------------------------



  0%|          | 0/33000 [00:00<?, ?it/s]

Epoch 1 #50 -- loss: 0.6944755041599273, acc: 0.48375
Epoch 1 #100 -- loss: 0.6875697326660156, acc: 0.53875
Epoch 1 #150 -- loss: 0.6842502653598785, acc: 0.54625
Epoch 1 #200 -- loss: 0.6417299628257751, acc: 0.64
Epoch 1 #250 -- loss: 0.49696703881025317, acc: 0.765
Epoch 1 #300 -- loss: 0.37552170783281325, acc: 0.8375
Epoch 1 #350 -- loss: 0.2988986775279045, acc: 0.88375
Epoch 1 #400 -- loss: 0.2359264947474003, acc: 0.90625
Epoch 1 #450 -- loss: 0.2389705354720354, acc: 0.91125
Epoch 1 #500 -- loss: 0.2591427024081349, acc: 0.895
Epoch 1 #550 -- loss: 0.24544144466519355, acc: 0.885
Epoch 1 #600 -- loss: 0.16852444712072612, acc: 0.93125
Epoch 1 #650 -- loss: 0.19004087373614312, acc: 0.93125
Epoch 1 #700 -- loss: 0.19206940002739428, acc: 0.92625
Epoch 1 #750 -- loss: 0.1746066513285041, acc: 0.93125
Epoch 1 #800 -- loss: 0.15200876073911787, acc: 0.94625
Epoch 1 #850 -- loss: 0.16907683610916138, acc: 0.93625
Epoch 1 #900 -- loss: 0.19010195594280957, acc: 0.9275
Epoch 1 #950 

In [None]:
f = 2
e = 1
best_val_loss, best_val_acc, = np.inf, 0
###### Validation
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# 틀린 데이터들을 wandb 기록하기 위함.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
    for v in validloader:
      input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)
      
      valid_outputs = model(input_ids, attention_mask)
      valid_output = valid_outputs.logits
      valid_loss = criterion(valid_output, valid_labels)
      
      valid_predict = valid_output.argmax(dim=-1)
      valid_predict = valid_predict.detach().cpu().numpy()
      valid_labels = valid_labels.detach().cpu().numpy()

      ###########################
      # valid eval 결과, 틀린 데이터들은 wandb에 Logging
      if args.logging_wrong_samples:
        wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
        if len(wrong_sample_index)>0:
          wrong_sample_text, wrong_sample_label, wrong_sample_pred, entailment_prob, contradiction_prob = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

          wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
          wrong_sample_dict['실제값'] += wrong_sample_label
          wrong_sample_dict['예측값'] += wrong_sample_pred
          wrong_sample_dict['diff_logit'] += entailment_prob
          wrong_sample_dict['same_logit'] += contradiction_prob
      ###########################

      valid_acc = accuracy_score(valid_labels, valid_predict)
      valid_perform += np.array([valid_loss.item(), valid_acc])

      all_valid_predict_lst += list(valid_predict)
      all_valid_labels_lst += list(valid_labels)
  
###### Model save
val_total_loss = valid_perform[0] / valid_batch_
val_total_acc = valid_perform[1] / valid_batch_
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    # 참고 : Model 추가 재학습을 위한 모델을 저장하는 코드
    # https://tutorials.pytorch.kr/beginner/saving_loading_models.html#checkpoint

    best_val_acc = val_total_acc

    ### Confusion Matrix
    class_names = ['diff','same'] # (0,1,2)
    # https://wandb.ai/wandb/plots/reports/Confusion-Matrix--VmlldzozMDg1NTM
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})
      
    if args.logging_wrong_samples and val_total_acc > 0.91:
      ########### Logging Wrong Samples ##########
      # Save Wrong DataFrame
      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      # Loggin Wandb
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})
      ###########################
    
print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

New best model for val accuracy : 0.9936969696969697! saving the best model..

>>>> Validation loss: 0.019624535606069856, Acc: 0.9936969696969697

2fold best_val_acc_list : [0.9936969696969697]


# 3 fold

In [None]:
f = 3

print(f"---------------------------------- {f} fold----------------------------------")

run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
del_ids = list(range((f-1)*gap, f*gap))
training_ids = set(total_ids) - set(del_ids)

training_dset = dataset.select(list(training_ids))
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainloader = DataLoader(training_dset,
                          batch_size=16,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=16,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

[34m[1mwandb[0m: Currently logged in as: [33mnahyeonkang[0m. Use [1m`wandb login --relogin`[0m to force relogin


---------------------------------- 3 fold----------------------------------


## 1 epoch

In [None]:
f = 3
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()	
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)
  
  optimizer.zero_grad()
  
  outputs = model(input_ids, attention_mask) ## label을 안 넣어서 logits값만 출력
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  scheduler.step()
  for learning_rate in scheduler.get_lr():
    wandb.log({"learning_rate": learning_rate})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

------------------------------ 3 fold 1 epoch------------------------------



  0%|          | 0/33000 [00:00<?, ?it/s]

Epoch 1 #50 -- loss: 0.6944829165935517, acc: 0.485
Epoch 1 #100 -- loss: 0.6867254137992859, acc: 0.54875
Epoch 1 #150 -- loss: 0.682221246957779, acc: 0.55625
Epoch 1 #200 -- loss: 0.6456893861293793, acc: 0.63625
Epoch 1 #250 -- loss: 0.4734108594059944, acc: 0.78
Epoch 1 #300 -- loss: 0.3881120966374874, acc: 0.8375
Epoch 1 #350 -- loss: 0.3164524355530739, acc: 0.86375
Epoch 1 #400 -- loss: 0.23843293450772762, acc: 0.9
Epoch 1 #450 -- loss: 0.254080480709672, acc: 0.89625
Epoch 1 #500 -- loss: 0.22996790029108524, acc: 0.905
Epoch 1 #550 -- loss: 0.21558539021760226, acc: 0.9175
Epoch 1 #600 -- loss: 0.1737480340152979, acc: 0.93375
Epoch 1 #650 -- loss: 0.19372726561501621, acc: 0.92125
Epoch 1 #700 -- loss: 0.2150927236676216, acc: 0.91625
Epoch 1 #750 -- loss: 0.17217799790203572, acc: 0.9325
Epoch 1 #800 -- loss: 0.14376555467024446, acc: 0.945
Epoch 1 #850 -- loss: 0.19577322551980614, acc: 0.91875
Epoch 1 #900 -- loss: 0.14963770128786563, acc: 0.9425
Epoch 1 #950 -- loss: 

In [None]:
f = 3
e = 1
best_val_loss, best_val_acc, = np.inf, 0
###### Validation
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# 틀린 데이터들을 wandb 기록하기 위함.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
    for v in validloader:
      input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)
      
      valid_outputs = model(input_ids, attention_mask)
      valid_output = valid_outputs.logits
      valid_loss = criterion(valid_output, valid_labels)
      
      valid_predict = valid_output.argmax(dim=-1)
      valid_predict = valid_predict.detach().cpu().numpy()
      valid_labels = valid_labels.detach().cpu().numpy()

      ###########################
      # valid eval 결과, 틀린 데이터들은 wandb에 Logging
      if args.logging_wrong_samples:
        wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
        if len(wrong_sample_index)>0:
          wrong_sample_text, wrong_sample_label, wrong_sample_pred, entailment_prob, contradiction_prob = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

          wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
          wrong_sample_dict['실제값'] += wrong_sample_label
          wrong_sample_dict['예측값'] += wrong_sample_pred
          wrong_sample_dict['diff_logit'] += entailment_prob
          wrong_sample_dict['same_logit'] += contradiction_prob
      ###########################

      valid_acc = accuracy_score(valid_labels, valid_predict)
      valid_perform += np.array([valid_loss.item(), valid_acc])

      all_valid_predict_lst += list(valid_predict)
      all_valid_labels_lst += list(valid_labels)
  
###### Model save
val_total_loss = valid_perform[0] / valid_batch_
val_total_acc = valid_perform[1] / valid_batch_
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    # 참고 : Model 추가 재학습을 위한 모델을 저장하는 코드
    # https://tutorials.pytorch.kr/beginner/saving_loading_models.html#checkpoint

    best_val_acc = val_total_acc

    ### Confusion Matrix
    class_names = ['diff','same'] # (0,1,2)
    # https://wandb.ai/wandb/plots/reports/Confusion-Matrix--VmlldzozMDg1NTM
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})
      
    if args.logging_wrong_samples and val_total_acc > 0.91:
      ########### Logging Wrong Samples ##########
      # Save Wrong DataFrame
      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      # Loggin Wandb
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})
      ###########################
    
print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

New best model for val accuracy : 0.9934772727272727! saving the best model..

>>>> Validation loss: 0.019584552417655658, Acc: 0.9934772727272727

3fold best_val_acc_list : [0.9934772727272727]


# 4 fold

In [None]:
f = 4

print(f"---------------------------------- {f} fold----------------------------------")

run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
del_ids = list(range((f-1)*gap, f*gap))
training_ids = set(total_ids) - set(del_ids)

training_dset = dataset.select(list(training_ids))
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainloader = DataLoader(training_dset,
                          batch_size=16,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=16,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

---------------------------------- 4 fold----------------------------------


VBox(children=(Label(value='1.349 MB of 2.696 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.500360…

0,1
Last_Valid Acc,▁
Last_Valid Loss,▁
Total Mean ACC (3fold),▁
Train epoch Acc,▁
Train epoch Loss,▁
epoch,▁▁
learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
Last_Valid Acc,0.99348
Last_Valid Loss,0.01958
Total Mean ACC (3fold),0.99348
Train epoch Acc,0.97883
Train epoch Loss,0.05525
epoch,1.0
learning_rate,1e-05


## 1 epoch

In [None]:
f = 4
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()	
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)
  
  optimizer.zero_grad()
  
  outputs = model(input_ids, attention_mask) ## label을 안 넣어서 logits값만 출력
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  scheduler.step()
  for learning_rate in scheduler.get_lr():
    wandb.log({"learning_rate": learning_rate})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

------------------------------ 4 fold 1 epoch------------------------------



  0%|          | 0/33000 [00:00<?, ?it/s]

Epoch 1 #50 -- loss: 0.02072806987504009, acc: 0.99375
Epoch 1 #100 -- loss: 0.00713290082901949, acc: 0.995
Epoch 1 #150 -- loss: 0.012821453771030064, acc: 0.99875
Epoch 1 #200 -- loss: 0.013446899642294738, acc: 0.99625
Epoch 1 #250 -- loss: 0.028035393965401455, acc: 0.99125
Epoch 1 #300 -- loss: 0.02047721901733894, acc: 0.99
Epoch 1 #350 -- loss: 0.021598954788350964, acc: 0.9975
Epoch 1 #400 -- loss: 0.02425752628099872, acc: 0.9925
Epoch 1 #450 -- loss: 0.007799174248211784, acc: 0.99625
Epoch 1 #500 -- loss: 0.011446078079170547, acc: 0.9975
Epoch 1 #550 -- loss: 0.03521750810556114, acc: 0.98875
Epoch 1 #600 -- loss: 0.011926271446282045, acc: 0.99625
Epoch 1 #650 -- loss: 0.02152895774808712, acc: 0.99625
Epoch 1 #700 -- loss: 0.036120347534306346, acc: 0.99
Epoch 1 #750 -- loss: 0.008400429477915168, acc: 0.99875
Epoch 1 #800 -- loss: 0.026048845781479032, acc: 0.99375
Epoch 1 #850 -- loss: 0.02314829966693651, acc: 0.99
Epoch 1 #900 -- loss: 0.02232085084397113, acc: 0.992

In [None]:
f = 4
e = 1
best_val_loss, best_val_acc, = np.inf, 0
###### Validation
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# 틀린 데이터들을 wandb 기록하기 위함.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
    for v in validloader:
      input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)
      
      valid_outputs = model(input_ids, attention_mask)
      valid_output = valid_outputs.logits
      valid_loss = criterion(valid_output, valid_labels)
      
      valid_predict = valid_output.argmax(dim=-1)
      valid_predict = valid_predict.detach().cpu().numpy()
      valid_labels = valid_labels.detach().cpu().numpy()

      ###########################
      # valid eval 결과, 틀린 데이터들은 wandb에 Logging
      if args.logging_wrong_samples:
        wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
        if len(wrong_sample_index)>0:
          wrong_sample_text, wrong_sample_label, wrong_sample_pred, entailment_prob, contradiction_prob = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

          wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
          wrong_sample_dict['실제값'] += wrong_sample_label
          wrong_sample_dict['예측값'] += wrong_sample_pred
          wrong_sample_dict['diff_logit'] += entailment_prob
          wrong_sample_dict['same_logit'] += contradiction_prob
      ###########################

      valid_acc = accuracy_score(valid_labels, valid_predict)
      valid_perform += np.array([valid_loss.item(), valid_acc])

      all_valid_predict_lst += list(valid_predict)
      all_valid_labels_lst += list(valid_labels)
  
###### Model save
val_total_loss = valid_perform[0] / valid_batch_
val_total_acc = valid_perform[1] / valid_batch_
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    # 참고 : Model 추가 재학습을 위한 모델을 저장하는 코드
    # https://tutorials.pytorch.kr/beginner/saving_loading_models.html#checkpoint

    best_val_acc = val_total_acc

    ### Confusion Matrix
    class_names = ['diff','same'] # (0,1,2)
    # https://wandb.ai/wandb/plots/reports/Confusion-Matrix--VmlldzozMDg1NTM
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})
      
    if args.logging_wrong_samples and val_total_acc > 0.91:
      ########### Logging Wrong Samples ##########
      # Save Wrong DataFrame
      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      # Loggin Wandb
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})
      ###########################
    
print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

New best model for val accuracy : 0.9961212121212121! saving the best model..

>>>> Validation loss: 0.01259141871224953, Acc: 0.9961212121212121

4fold best_val_acc_list : [0.9934772727272727, 0.9961212121212121]


# 5 fold

In [16]:
f = 5

print(f"---------------------------------- {f} fold----------------------------------")

run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
del_ids = list(range((f-1)*gap, f*gap))
training_ids = set(total_ids) - set(del_ids)

training_dset = dataset.select(list(training_ids))
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainloader = DataLoader(training_dset,
                          batch_size=16,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=16,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

[34m[1mwandb[0m: Currently logged in as: [33mnahyeonkang[0m. Use [1m`wandb login --relogin`[0m to force relogin


---------------------------------- 5 fold----------------------------------


## 1 epoch

In [None]:
f = 5
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()	
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)
  
  optimizer.zero_grad()
  
  outputs = model(input_ids, attention_mask) ## label을 안 넣어서 logits값만 출력
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  scheduler.step()
  for learning_rate in scheduler.get_lr():
    wandb.log({"learning_rate": learning_rate})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

------------------------------ 5 fold 1 epoch------------------------------



  0%|          | 0/33000 [00:00<?, ?it/s]

Epoch 1 #50 -- loss: 0.6967503786087036, acc: 0.495
Epoch 1 #100 -- loss: 0.6932619571685791, acc: 0.51875
Epoch 1 #150 -- loss: 0.6782697117328644, acc: 0.59375
Epoch 1 #200 -- loss: 0.6539136481285095, acc: 0.59375
Epoch 1 #250 -- loss: 0.45253886699676515, acc: 0.7975
Epoch 1 #300 -- loss: 0.3676878722012043, acc: 0.83375
Epoch 1 #350 -- loss: 0.29423546597361566, acc: 0.88375
Epoch 1 #400 -- loss: 0.2576303370296955, acc: 0.8925
Epoch 1 #450 -- loss: 0.25329060837626455, acc: 0.89
Epoch 1 #500 -- loss: 0.21139218602329493, acc: 0.91
Epoch 1 #550 -- loss: 0.19943900473415851, acc: 0.915
Epoch 1 #600 -- loss: 0.14964510945603252, acc: 0.94
Epoch 1 #650 -- loss: 0.20588469017297029, acc: 0.91625
Epoch 1 #700 -- loss: 0.17614690754562617, acc: 0.92875
Epoch 1 #750 -- loss: 0.19389984548091888, acc: 0.9225
Epoch 1 #800 -- loss: 0.1566071960143745, acc: 0.93875
Epoch 1 #850 -- loss: 0.1790329297631979, acc: 0.92625
Epoch 1 #900 -- loss: 0.14826076440513133, acc: 0.94
Epoch 1 #950 -- loss

In [17]:
f = 5
e = 1
best_val_loss, best_val_acc, = np.inf, 0
###### Validation
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# 틀린 데이터들을 wandb 기록하기 위함.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
    for v in validloader:
      input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)
      
      valid_outputs = model(input_ids, attention_mask)
      valid_output = valid_outputs.logits
      valid_loss = criterion(valid_output, valid_labels)
      
      valid_predict = valid_output.argmax(dim=-1)
      valid_predict = valid_predict.detach().cpu().numpy()
      valid_labels = valid_labels.detach().cpu().numpy()

      ###########################
      # valid eval 결과, 틀린 데이터들은 wandb에 Logging
      if args.logging_wrong_samples:
        wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
        if len(wrong_sample_index)>0:
          wrong_sample_text, wrong_sample_label, wrong_sample_pred, entailment_prob, contradiction_prob = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

          wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
          wrong_sample_dict['실제값'] += wrong_sample_label
          wrong_sample_dict['예측값'] += wrong_sample_pred
          wrong_sample_dict['diff_logit'] += entailment_prob
          wrong_sample_dict['same_logit'] += contradiction_prob
      ###########################

      valid_acc = accuracy_score(valid_labels, valid_predict)
      valid_perform += np.array([valid_loss.item(), valid_acc])

      all_valid_predict_lst += list(valid_predict)
      all_valid_labels_lst += list(valid_labels)
  
###### Model save
val_total_loss = valid_perform[0] / valid_batch_
val_total_acc = valid_perform[1] / valid_batch_
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    # 참고 : Model 추가 재학습을 위한 모델을 저장하는 코드
    # https://tutorials.pytorch.kr/beginner/saving_loading_models.html#checkpoint

    best_val_acc = val_total_acc

    ### Confusion Matrix
    class_names = ['diff','same'] # (0,1,2)
    # https://wandb.ai/wandb/plots/reports/Confusion-Matrix--VmlldzozMDg1NTM
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})
      
    if args.logging_wrong_samples and val_total_acc > 0.91:
      ########### Logging Wrong Samples ##########
      # Save Wrong DataFrame
      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      # Loggin Wandb
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})
      ###########################
    
print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

New best model for val accuracy : 0.9919469696969697! saving the best model..

>>>> Validation loss: 0.025486182730782965, Acc: 0.9919469696969697

5fold best_val_acc_list : [0.9919469696969697]


# Inference

In [18]:
def preprocess_script(code):
    
    codea = code['code1']
    codeb = code['code2']

    new_codea = deque()
    for line in codea.split('\n'):
        if line.lstrip().startswith('#'): # 주석으로 시작되는 행 skip
            continue
        line = line.rstrip()
        if '#' in line:
            line = line[:line.index('#')] # 주석 전까지 코드만 저장
        line = line.replace('\n','')      # 개행 문자를 모두 삭제함
        line = line.replace('    ','\t')  # 공백 4칸을 tab으로 변환

        if line == '': # 전처리 후 빈 라인은 skip
            continue

        new_codea.append(line)

    new_codea = '\n'.join(new_codea)
    new_codea = re.sub('("""[\w\W]*?""")', '<str>', new_codea)
    new_codea = re.sub("('''[\w\W]*?''')", '<str>', new_codea)
    new_codea = re.sub('/^(file|gopher|news|nntp|telnet|http?|https?|ftps?|sftp):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/',
                      '<url>',
                      new_codea)
    code['code1'] = new_codea

    new_codeb = deque()   
    for line in codeb.split('\n'):
        if line.lstrip().startswith('#'): # 주석으로 시작되는 행 skip
            continue
        line = line.rstrip()
        if '#' in line:
            line = line[:line.index('#')] # 주석 전까지 코드만 저장
        line = line.replace('\n','')      # 개행 문자를 모두 삭제함
        line = line.replace('    ','\t')  # 공백 4칸을 tab으로 변환

        if line == '': # 전처리 후 빈 라인은 skip
            continue

        new_codeb.append(line)

    new_codeb = '\n'.join(new_codeb)
    new_codeb = re.sub('("""[\w\W]*?""")', '<str>', new_codeb)
    new_codeb = re.sub("('''[\w\W]*?''')", '<str>', new_codeb)
    new_codeb = re.sub('/^(file|gopher|news|nntp|telnet|http?|https?|ftps?|sftp):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/',
                      '<url>',
                      new_codeb)
    
    code['code2'] = new_codeb
    return code


def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding='max_length', max_length=512, truncation=True)
    return outputs

In [19]:
testdataset = load_dataset("csv", data_files='/content/drive/MyDrive/test.csv')['train']
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")

preprocessed = testdataset.map(preprocess_script)
test_dataset = preprocessed.map(example_fn, remove_columns=['code1', 'code2'])
collator = DataCollatorWithPadding(tokenizer=tokenizer)

testloader = DataLoader(test_dataset,
                          batch_size=16,
                          shuffle=False,
                         collate_fn = collator
                          )

all_fold_logits = np.zeros((179700, 2))  # rows of df, target labels
for idx in tqdm(range(1, args.n_splits+1)):
  load_path = f'./models/{args.model_name}/{idx}-fold/best.pt'
  model.load_state_dict(torch.load(load_path,map_location=torch.device('cpu')))
  model.to(device)
  model.eval()
  progress_bar = tqdm(enumerate(testloader), total=len(testloader), leave=True, position=0,)
  for i, data in progress_bar:
    with torch.no_grad():
      logits = model(
                  data['input_ids'].to(device),
                  data['attention_mask'].to(device),
                  )
      logits=logits.logits
    if i==0:
      one_fold_logits = logits
    else:
      one_fold_logits = torch.cat([one_fold_logits,logits],dim=0) # (batchsize,3) + (batchsize,3) -> (batchsize+batchsize,3)

  # torch tensor를 저장하기 위한 numpy 변환
  one_fold_logits = one_fold_logits.squeeze(0).detach().cpu().numpy()
  # numpy array 저장
  np.save(f'./models/{args.model_name}/{idx}-fold/numpy_logits', one_fold_logits)
  # np_load = np.load(f'./models/{args.model_name}/{idx}-fold/numpy_logits.npy')
  all_fold_logits += one_fold_logits
  if idx == 1:
    all_fold_predictions = np.argmax(one_fold_logits, axis=1)
  else:
    one_fold_predictions = np.argmax(one_fold_logits, axis=1)
    all_fold_predictions = np.vstack([all_fold_predictions, one_fold_predictions])

soft_output = list(np.argmax(all_fold_logits, axis=1))
hard_output = ([max(list(Counter(lst).items()), key=lambda x:x[1])[0] for lst in all_fold_predictions.T])


Using custom data configuration default-13b174ecdaaff536


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-13b174ecdaaff536/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-13b174ecdaaff536/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_

  0%|          | 0/179700 [00:00<?, ?ex/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/11232 [00:00<?, ?it/s]

  0%|          | 0/11232 [00:00<?, ?it/s]

  0%|          | 0/11232 [00:00<?, ?it/s]

  0%|          | 0/11232 [00:00<?, ?it/s]

  0%|          | 0/11232 [00:00<?, ?it/s]

In [20]:
submission_path = "/content/drive/MyDrive/sample_submission.csv"

submissionsoft = pd.read_csv(submission_path)
submissionhard = pd.read_csv(submission_path)

In [21]:
submissionsoft['similar']=soft_output
submissionhard['similar']=hard_output
submissionsoft.to_csv('/content/drive/MyDrive/submissionsoft.csv', index=False)
submissionhard.to_csv('/content/drive/MyDrive/submissionhard.csv', index=False)