In [None]:
!pip install transformers
!pip install transformers datasets
!pip install easydict

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from tqdm import tqdm
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (Dataset,
                              DataLoader, 
                              RandomSampler, 
                              SequentialSampler, 
                              TensorDataset)
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import (AutoConfig, 
                          AutoTokenizer, 
                          RobertaForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback)
from transformers import AdamW
from transformers import (get_scheduler, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_linear_schedule_with_warmup)
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from tqdm.auto import tqdm
from datasets import load_metric, load_dataset, Dataset, concatenate_datasets
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import (accuracy_score, 
                             precision_recall_curve,
                             f1_score,
                             auc)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
import math
import easydict

In [None]:
def seed_everything(seed):
  """Seed every random number generator used in this notebook.

  Seeds Python's `random` module, NumPy's global generator and PyTorch
  (CPU, current CUDA device and all CUDA devices), then configures cuDNN
  with `deterministic = True` and `benchmark = False`.

  Args:
    seed: Seed value handed to every generator.
  """
  # Pure-Python and NumPy generators.
  random.seed(seed)
  np.random.seed(seed)

  # PyTorch generators (CPU first, then the CUDA variants).
  for seed_fn in (torch.manual_seed,
                  torch.cuda.manual_seed,
                  torch.cuda.manual_seed_all):
    seed_fn(seed)

  # cuDNN flags requested for reproducible runs.
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True


# Fix the seed once, before any data or model is created.
seed_everything(42)

In [None]:
def compute_metrics(pred):
  """Score a prediction bundle with micro-averaged F1, as a percentage.

  Args:
    pred: Object exposing `predictions` (per-class scores; the argmax over
      the last axis is taken as the predicted class) and `label_ids`
      (the reference labels).

  Returns:
    A dict with the single key 'micro f1 score' mapped to the F1 value
    multiplied by 100.
  """
  predicted_classes = pred.predictions.argmax(-1)
  micro_f1 = f1_score(
    pred.label_ids,
    predicted_classes,
    average="micro",
    labels=[0, 1, 2],  # label ids 0..2 are the ones considered by the metric
  )
  return {'micro f1 score': micro_f1 * 100.0}

In [None]:
# The data ships pre-split into train/validation CSV files; both are loaded
# and merged back into one dataset so that k-fold can draw its own splits.
train_dset, validation_dset = (
  load_dataset("csv", data_files=csv_path)['train']
  for csv_path in ("./train_data_lv1.csv", "./valid_data_lv1.csv")
)
rawdataset = concatenate_datasets([train_dset, validation_dset])

# GraphCodeBERT tokenizer, configured to truncate from the left side.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'

# Tokenize
def example_fn(examples):
  """Tokenize a `code1` / `code2` pair and attach its label when present.

  Args:
    examples: Mapping with 'code1' and 'code2' entries and, optionally,
      a 'similar' entry holding the label.

  Returns:
    The tokenizer output (truncated to at most 512 tokens), with a
    "labels" entry copied from 'similar' when that key exists.
  """
  encoded = tokenizer(
    examples['code1'],
    examples['code2'],
    padding=True,
    max_length=512,
    truncation=True,
  )
  # Unlabelled input: return the encoding as-is.
  if 'similar' not in examples:
    return encoded
  encoded["labels"] = examples["similar"]
  return encoded
# Tokenize every row of the merged dataset and drop the original CSV columns
# so that only the fields produced by example_fn remain.
dset = rawdataset.map(
  function=example_fn,
  remove_columns=train_dset.column_names,
)

In [None]:
def train(args):
  """Fine-tune GraphCodeBERT once per fold of a contiguous k-fold split.

  For fold i (0-based) the i-th contiguous slice of the module-level `dset`
  is held out for evaluation and all remaining rows are used for training.
  A fresh pretrained model is created for every fold and trained with the
  Hugging Face `Trainer` plus an early-stopping callback. Each fold writes
  its checkpoints and logs to its own directory
  (`<output_dir>_<fold>` / `<logging_dir>_<fold>`).

  Args:
    args: Attribute-style config. Fields read: `k_fold`, `output_dir`,
      `logging_dir`, `epochs`, `lr`, `train_batch_size`, `eval_batch_size`,
      `weight_decay`, `logging_steps`, `evaluation_strategy`, `eval_steps`
      and `gradient_accumulation_steps`. The object is not modified.

  Raises:
    ValueError: If `args.k_fold` is smaller than 2 or larger than the
      number of rows in `dset` (either would produce an empty split).
  """
  import gc  # local import: keeps this function independent of later cells

  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  config = AutoConfig.from_pretrained("microsoft/graphcodebert-base")
  config.num_labels = 2

  total_size = len(dset)
  if not 2 <= args.k_fold <= total_size:
    raise ValueError(
      'k_fold must be between 2 and len(dset)=%d, got %r' % (total_size, args.k_fold)
    )
  gap = total_size // args.k_fold  # rows per evaluation fold

  # NOTE(review): folds are contiguous slices of the concatenated
  # train + validation CSVs, without shuffling or stratification — confirm
  # that the row order is not sorted by label.
  for i in range(args.k_fold):
    model = RobertaForSequenceClassification.from_pretrained(
      "microsoft/graphcodebert-base", config=config
    ).to(device)

    print('\n%dth Training' %(i+1))

    # Per-fold directories, so one fold never overwrites another fold's
    # checkpoints or logs.
    output_dir = args.output_dir + '_' + str(i+1)
    logging_dir = args.logging_dir + '_' + str(i+1)

    # Build the training / evaluation split. The last fold also absorbs the
    # remainder rows, so every row is evaluated exactly once across folds.
    eval_start = i * gap
    eval_stop = total_size if i == args.k_fold - 1 else eval_start + gap
    eval_ids = list(range(eval_start, eval_stop))
    training_ids = list(range(eval_start)) + list(range(eval_stop, total_size))

    training_dset = dset.select(training_ids)
    eval_dset = dset.select(eval_ids)

    # Training arguments, set with reference to the GraphCodeBERT GitHub
    # repository (warm-up over the first fifth of training).
    # - warmup_ratio expresses "one fifth of the optimizer steps" directly;
    #   deriving a step count from the number of examples overshoots the
    #   real number of optimizer steps once batching and gradient
    #   accumulation are applied.
    # - save_steps follows eval_steps: load_best_model_at_end needs a
    #   checkpoint at every evaluation point to be able to restore the best
    #   one.
    # NOTE(review): metric_for_best_model is not set, so best-checkpoint
    # selection and early stopping presumably use the library default
    # (evaluation loss) rather than 'micro f1 score' — confirm intended.
    training_args = TrainingArguments(
      output_dir=output_dir,                              # per-fold output directory
      overwrite_output_dir=True,                          # overwrite output directory
      save_total_limit=5,                                 # max number of checkpoints kept
      save_steps=args.eval_steps,                         # checkpoint at every evaluation
      num_train_epochs=args.epochs,                       # total number of training epochs
      learning_rate=args.lr,                              # peak learning rate
      per_device_train_batch_size=args.train_batch_size,  # batch size per device during training
      per_device_eval_batch_size=args.eval_batch_size,    # batch size for evaluation
      warmup_ratio=0.2,                                   # warm up over the first 20% of steps
      weight_decay=args.weight_decay,                     # strength of weight decay
      logging_dir=logging_dir,                            # per-fold log directory
      logging_steps=args.logging_steps,                   # logging interval in steps
      evaluation_strategy=args.evaluation_strategy,       # evaluation strategy during training
      eval_steps=args.eval_steps,                         # evaluation interval in steps
      load_best_model_at_end=True,                        # needed for early stopping
      save_strategy='steps',                              # needed for early stopping
      logging_strategy='steps',                           # needed for early stopping
      gradient_accumulation_steps=args.gradient_accumulation_steps,
    )

    collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=512)

    trainer = Trainer(
      model=model,                      # the instantiated Transformers model to be trained
      args=training_args,               # training arguments, defined above
      train_dataset=training_dset,      # training split of this fold
      eval_dataset=eval_dset,           # held-out split of this fold
      data_collator=collator,           # pads each batch dynamically
      compute_metrics=compute_metrics,  # metric function -> micro f1
      callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    # -- Training
    print('Training Strats')
    trainer.train()

    # Drop this fold's model and trainer (which holds the optimizer state)
    # before the next fold loads a fresh model, so two models are never
    # resident at the same time.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Run configuration, exposed with attribute access (args.lr, args.k_fold, ...).
args = easydict.EasyDict(dict(
    # Where checkpoints and logs are written.
    output_dir='./DACON',
    logging_dir='./DACON',
    # Optimisation settings.
    lr=2e-5,
    epochs=3,
    train_batch_size=4,
    weight_decay=0.0,
    warmup_steps=0,
    gradient_accumulation_steps=2,
    # Evaluation / cross-validation settings.
    eval_batch_size=8,
    k_fold=5,
    evaluation_strategy='steps',
    # Step intervals for saving, logging and evaluating.
    save_steps=1000,
    logging_steps=1000,
    eval_steps=1000,
    max_steps=-1,
))

In [None]:
import gc
# Run a garbage-collection pass and empty PyTorch's CUDA cache before the
# training run is started.
gc.collect()
torch.cuda.empty_cache()

# Launch the k-fold training run with the settings held in `args`.
train(args)