In [1]:
import numpy as np
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_metric, load_dataset, load_from_disk
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer

In [2]:
# Load the train and validation splits from their on-disk dataset directories.
train_dataset, valid_dataset = map(
    load_from_disk,
    ('./data/train_dataset_lv1', './data/valid_dataset_lv1'),
)

In [3]:
MODEL = "microsoft/graphcodebert-base"
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sequence-classification model initialised from the pretrained checkpoint and
# moved to the selected device (`.to()` returns the module itself).
model = RobertaForSequenceClassification.from_pretrained(MODEL).to(device)

# Matching tokenizer; when an input is too long, tokens are dropped from the
# left so the end of the sequence is what gets kept.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.truncation_side = 'left'

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.o

In [4]:
# Metric object used by `metric_fn` (GLUE benchmark, SST-2 configuration).
_metric = load_metric(path="glue", config_name="sst2")
# Collator that pads the examples of each batch using the model's tokenizer.
_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
def metric_fn(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: A two-element object that unpacks into ``(predictions, labels)``.
            The predictions hold per-class scores along the last axis.

    Returns:
        Whatever ``_metric.compute`` returns for the arg-max class of every
        prediction compared against ``labels``.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=-1)
    return _metric.compute(references=references, predictions=predicted_classes)

In [6]:
args = TrainingArguments(
    output_dir='./models/',
    do_train=True,
    do_eval=True,
    # --- optimisation ---
    optim='adamw_torch',
    learning_rate=1e-5,
    num_train_epochs=3,
    # 4 examples per device, accumulated over 4 steps before each update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    # --- logging / evaluation / checkpointing (all step-based) ---
    disable_tqdm=False,
    logging_strategy="steps",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_total_limit=5,
    # NOTE: `metric_for_best_model` is deliberately left unset here; an
    # earlier "f1" setting was commented out.
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=_collator,
    compute_metrics=metric_fn,
    # Stop once 10 consecutive evaluations bring no improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

In [7]:
import gc
import os

from knockknock import discord_sender

# The Discord webhook is a secret, so it is read from the environment instead
# of being written into the notebook. Leave DISCORD_WEBHOOK_URL unset to train
# without notifications.
webhook_url = os.environ.get('DISCORD_WEBHOOK_URL', '')


def do_train():
    """Free cached memory and run ``trainer.train()``.

    The garbage collector runs first so that unreachable objects are released
    before PyTorch is asked to empty its CUDA cache.
    """
    gc.collect()
    torch.cuda.empty_cache()
    trainer.train()


# Only attach the notifier when a webhook is configured: with an empty URL the
# decorated call would fail while posting its start message, before training.
if webhook_url:
    do_train = discord_sender(webhook_url=webhook_url)(do_train)

In [8]:
# Start fine-tuning: do_train() clears cached memory and calls trainer.train().
do_train()

***** Running training *****
  Num examples = 300000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 28125


Step,Training Loss,Validation Loss,Accuracy
500,0.3325,0.195409,0.925467
1000,0.1592,0.124913,0.957133
1500,0.1313,0.101645,0.965767
2000,0.1121,0.084731,0.9702
2500,0.1058,0.103482,0.969367
3000,0.0946,0.105472,0.9704
3500,0.0904,0.135164,0.9627
4000,0.0774,0.077462,0.9779
4500,0.0761,0.081016,0.978167
5000,0.0713,0.07327,0.979967


***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-500
Configuration saved in ./models/checkpoint-500/config.json
Model weights saved in ./models/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [models/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-1000
Configuration saved in ./models/checkpoint-1000/config.json
Model weights saved in ./models/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [models/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  

KeyboardInterrupt: 

In [None]:
# SUBMISSION

In [None]:
import re
from collections import deque


# Compiled once at definition time. Raw strings keep the regex escapes
# (\w, \W, \.) from being parsed as invalid Python string escapes.
_TRIPLE_DOUBLE_QUOTED = re.compile(r'("""[\w\W]*?""")')
_TRIPLE_SINGLE_QUOTED = re.compile(r"('''[\w\W]*?''')")
# scheme://host.tld followed by everything up to whitespace or a quote.
_URL = re.compile(
    r"(?:file|gopher|news|nntp|telnet|https?|ftps?|sftp)://"
    r"(?:[a-z0-9-]+\.)+[a-z0-9]{2,4}[^\s'\"]*"
)


def preprocess_script(code):
    """Normalise a Python source string before tokenisation.

    Steps, in order:
      1. Drop lines that consist only of a ``#`` comment.
      2. Strip trailing whitespace, then cut each line at its first ``#``.
      3. Replace every run of four spaces with a tab.
      4. Drop lines that are empty after the steps above.
      5. Replace triple-quoted strings with ``<str>`` and URLs with ``<url>``.

    Args:
        code: The source code as a single string.

    Returns:
        The cleaned source, lines joined with ``\\n``.

    Notes:
        * A ``#`` inside a string literal is treated as a comment start, so
          such a line is truncated there.
        * The URL pattern previously carried JavaScript-style ``/.../``
          delimiters and therefore never matched; it now replaces URLs.
          NOTE(review): confirm the training data was (or will be) produced
          with the same preprocessing.
    """
    kept_lines = []

    for line in code.split('\n'):
        if line.lstrip().startswith('#'):  # skip comment-only lines
            continue
        line = line.rstrip()
        if '#' in line:
            line = line[:line.index('#')]  # keep only the code before the comment
        line = line.replace('    ', '\t')   # four spaces -> one tab

        if line == '':  # nothing left after cleaning
            continue

        kept_lines.append(line)

    new_code = '\n'.join(kept_lines)
    new_code = _TRIPLE_DOUBLE_QUOTED.sub('<str>', new_code)
    new_code = _TRIPLE_SINGLE_QUOTED.sub('<str>', new_code)
    new_code = _URL.sub('<url>', new_code)

    return new_code

In [None]:
MAX_LEN = 512


def example_fn(examples):
    """Clean and tokenize one code pair.

    Args:
        examples: Mapping with string entries ``'code1'`` and ``'code2'``
            and, optionally, a ``'similar'`` label.

    Returns:
        The tokenizer output for the cleaned pair, truncated to ``MAX_LEN``.
        When ``'similar'`` is present it is copied into ``"labels"``.
    """
    first_code = preprocess_script(examples['code1'])
    second_code = preprocess_script(examples['code2'])

    encoded = tokenizer(
        first_code,
        second_code,
        padding=True,
        max_length=MAX_LEN,
        truncation=True,
    )

    if 'similar' not in examples:
        return encoded

    encoded["labels"] = examples["similar"]
    return encoded

In [None]:
import gc
from pathlib import Path

import pandas as pd

# Free memory left over from earlier cells before running inference.
gc.collect()
torch.cuda.empty_cache()

TEST = "./data/test.csv"
SUB = "./data/sample_submission.csv"
SUBMISSION_PATH = Path("./submissions/submission_preprossed.csv")

# A single CSV passed as `data_files` is exposed under the "train" split.
test_dataset = load_dataset("csv", data_files=TEST)["train"]
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

# NOTE(review): the training log in this notebook ends with KeyboardInterrupt;
# confirm `trainer` holds the weights you intend to submit.
predictions = trainer.predict(test_dataset)

df = pd.read_csv(SUB)
df["similar"] = np.argmax(predictions.predictions, axis=-1)

# Create the output directory first so the write cannot fail on a missing
# folder after the (long) prediction run.
SUBMISSION_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(SUBMISSION_PATH, index=False)

In [None]:
import gc
# Final cleanup. Order matters: collect unreachable Python objects first, then
# ask PyTorch to release its unused cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()