#### make datasets

In [6]:
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric

In [7]:
# Checkpoint name handed to AutoTokenizer below.
MODEL = "microsoft/graphcodebert-base"
# CSV inputs for the level-1 training and validation sets.
TRAIN_LV1_INPUT = "./data/train_data_lv1.csv"
VALID_LV1_INPUT = "./data/valid_data_lv1.csv"
# Upper bound on the tokenized length (used as `max_length` when tokenizing).
MAX_LEN = 512

# Each CSV is loaded on its own; the rows are taken from the "train" key of
# the object returned by load_dataset, for the validation file as well.
train_dataset_lv1 = load_dataset(path="csv", data_files=TRAIN_LV1_INPUT)["train"]
valid_dataset_lv1 = load_dataset(path="csv", data_files=VALID_LV1_INPUT)["train"]

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Truncate over-long inputs from the left side instead of the default side.
tokenizer.truncation_side = "left"

Using custom data configuration default-9107b79668af0ad1
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-9107b79668af0ad1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-17a36eab66b2c9d1
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-17a36eab66b2c9d1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
import re
from collections import deque


def preprocess_script(code):
    """Normalize a Python source string before it is tokenized.

    Processing order:
      1. Lines whose first non-blank character is ``#`` are dropped.
      2. Every other line is cut at its first ``#`` and right-stripped.
      3. Each run of four spaces is replaced by a single tab.
      4. Lines that are empty after the steps above are dropped.
      5. Triple-quoted strings become ``<str>`` and URLs become ``<url>``.

    Known limitation: step 2 is purely textual, so a ``#`` that appears inside
    a string literal also truncates the line.

    Args:
        code: Source code as one newline-separated string.

    Returns:
        The cleaned source code as one newline-separated string.
    """
    kept_lines = []

    for line in code.split('\n'):
        if line.lstrip().startswith('#'):  # skip comment-only lines
            continue
        if '#' in line:
            line = line[:line.index('#')]  # keep only the code before the comment
        # Strip AFTER removing the comment so the whitespace that separated the
        # code from the comment does not survive (and become a trailing tab).
        line = line.rstrip()
        line = line.replace('    ', '\t')  # four spaces -> one tab

        if line == '':  # nothing left after cleaning
            continue

        kept_lines.append(line)

    new_code = '\n'.join(kept_lines)
    # Raw strings avoid invalid-escape warnings for \w, \W and \.
    new_code = re.sub(r'"""[\w\W]*?"""', '<str>', new_code)
    new_code = re.sub(r"'''[\w\W]*?'''", '<str>', new_code)
    # The previous pattern was wrapped in JavaScript-style /.../ delimiters and
    # anchored with ^...$, so it could never match. Match URLs anywhere in the
    # text and stop at whitespace or a quote so surrounding code is preserved.
    new_code = re.sub(
        r'\b(?:file|gopher|news|nntp|telnet|https?|ftps?|sftp)://'
        r'(?:[a-z0-9-]+\.)+[a-z0-9]{2,4}[^\s\'"]*',
        '<url>',
        new_code)

    return new_code

In [9]:
def example_fn(examples):
    """Tokenize a pair of code snippets and attach the label when present.

    Both 'code1' and 'code2' are cleaned with ``preprocess_script`` and passed
    to the module-level ``tokenizer`` as a sequence pair, truncated to
    ``MAX_LEN`` tokens.

    Args:
        examples: Mapping with 'code1' and 'code2' entries and, optionally,
            a 'similar' entry holding the label.

    Returns:
        The tokenizer output, with a "labels" entry copied from 'similar'
        when that key exists in ``examples``.
    """
    first_code = preprocess_script(examples['code1'])
    second_code = preprocess_script(examples['code2'])

    encoded = tokenizer(
        first_code,
        second_code,
        padding=True,
        max_length=MAX_LEN,
        truncation=True,
    )

    # Labels are optional so the same function also works on unlabeled rows.
    if 'similar' in examples:
        encoded["labels"] = examples["similar"]
    return encoded

# Tokenize every row and drop the raw text/label columns; example_fn has
# already copied the label into "labels".
train_dataset_lv1 = train_dataset_lv1.map(
    function=example_fn,
    remove_columns=['code1', 'code2', 'similar'],
)
valid_dataset_lv1 = valid_dataset_lv1.map(
    function=example_fn,
    remove_columns=['code1', 'code2', 'similar'],
)

  0%|          | 0/300000 [00:00<?, ?ex/s]

  0%|          | 0/30000 [00:00<?, ?ex/s]

In [10]:
# Persist the tokenized datasets so they can be reloaded without re-tokenizing.
train_dataset_lv1.save_to_disk(dataset_path='./data/train_dataset_lv1')
valid_dataset_lv1.save_to_disk(dataset_path='./data/valid_dataset_lv1')