## Import

In [1]:
import time
import re
import os
import random
from pprint import pprint
from itertools import combinations, product

import torch
from torch import nn
import torch.nn.functional as F
import torch_optimizer as optim
# import deepspeed
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    RobertaTokenizer,
    RobertaModel,
    RobertaForSequenceClassification,
    AutoModelWithLMHead,
    AutoModelForSeq2SeqLM,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    PreTrainedTokenizerFast, AddedToken
)
from datasets import load_dataset, load_metric
from tqdm.notebook import tqdm
from rank_bm25 import BM25Okapi

## Hyper parameters

In [2]:
TRAIN_DATA = "open/preprocess_bm25.csv"
VAL_DATA = "open/sample_train.csv"
SAMPLE_DATA = "open/sample_train.csv"
CODE_DATA_PATH = "open/code"
TEST_DATA = "open/test.csv"
SUBMISSION = 'open/sample_submission.csv'
TOKENIZER_MODEL = "microsoft/graphcodebert-base" # "microsoft/graphcodebert-base"
PRETRAINED_MODEL = "michiyasunaga/BioLinkBERT-base" # TODO: Large 모델 성능 실험
NUM_LABELS = 2
MAX_LEN = 512
BATCH = 32
NUM_WORKERS = 4
GRADIENT_CHECKPOINTING = True
GRADIENT_ACCUMULATION_STEPS = 4
EPOCHS = 5
INIT_LR = 5e-6
MAX_LR = 5e-3
WD = 1e-2
SEED = 42
TRAIN_TEST_SPLIT_RATIO = 0.1
TRAIN_SELECT_NUM = 200000
VAL_SELECT_NUM = int(TRAIN_SELECT_NUM * 0.1)
DEEPSPEED_CONFIG = "ds_config_zero2.json"
OUTPUT_DIR = "./results"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["MASTER_ADDR"] = "localhost"
# os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
# os.environ["RANK"] = "0"
# os.environ["LOCAL_RANK"] = "0"
# os.environ["WORLD_SIZE"] = "1"

## Fix seed

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True  # True 할 시 연산속도 감소. 마지막에 고정시킬 때 사용 권장.
torch.backends.cudnn.benchmark = False
os.environ["PYTHONHASHSEED"] = str(SEED)

## Functions

In [4]:
def preprocess_function(examples):
    for i in range(1, 3):
        for j in range(len(examples[f"code{i}"])):
            examples[f"code{i}"][j] = re.sub("#.*", "", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub('""".*?"""', "", examples[f"code{i}"][j], flags=re.S)
            examples[f"code{i}"][j] = re.sub("'''.*?'''", "", examples[f"code{i}"][j], flags=re.S)
            examples[f"code{i}"][j] = re.sub("b'.*?'", "b''", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub('b".*?"', 'b""', examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub("^from .*? import .*?\n", "", examples[f"code{i}"][j], flags=re.MULTILINE) # TODO: 이거 포함시켜서 preprocess 하면 성능 향상 되는지 확인하기
            examples[f"code{i}"][j] = re.sub("^import .*?\n", "", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub("@.*", "", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub("^\n", "", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub("^ *?\n", "", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub("    ", "\t", examples[f"code{i}"][j], flags=re.MULTILINE)
        
    outputs = tokenizer(examples['code1'], examples['code2'], max_length=MAX_LEN, return_token_type_ids=True, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

def metric_function(p):
    preds, labels = p
    output =  metric.compute(references=labels, predictions=np.argmax(preds, axis=-1))
    return output

def make_train_dataset_from_codefolder(path):
    scripts_list = []
    problem_nums = []

    for problem_folder in tqdm(os.listdir(path)):
        scripts = os.listdir(os.path.join(path, problem_folder))
        problem_num = scripts[0].split('_')[0]
        for script in scripts:
            script_file = os.path.join(path, problem_folder, script)
            with open(script_file, 'r', encoding='utf-8') as file:
                lines = file.read()
            lines = re.sub("#.*", "", lines, flags=re.MULTILINE)
            lines = re.sub('""".*?"""', "", lines, flags=re.S)
            lines = re.sub("'''.*?'''", "", lines, flags=re.S)
            lines = re.sub("b'.*?'", "b''", lines, flags=re.MULTILINE)
            lines = re.sub('b".*?"', 'b""', lines, flags=re.MULTILINE)
            lines = re.sub("^from .*? import .*?\n", "", lines, flags=re.MULTILINE) # TODO: 이거 포함시켜서 preprocess 하면 성능 향상 되는지 확인하기
            lines = re.sub("^import .*?\n", "", lines, flags=re.MULTILINE)
            lines = re.sub("@.*", "", lines, flags=re.MULTILINE)
            lines = re.sub("^\n", "", lines, flags=re.MULTILINE)
            lines = re.sub("^ *?\n", "", lines, flags=re.MULTILINE)
            lines = re.sub("    ", "\t", lines, flags=re.MULTILINE)
            scripts_list.append(lines)
        problem_nums.extend([problem_num]*len(scripts))

    df = pd.DataFrame(data = {'code':scripts_list, 'problem_num':problem_nums})
    print(f"Descirbe: \n{df.describe()}")
    print(f"Head: \n{df.head()}")
    print(f"Length: \n{len(df)}")

    df['tokens'] = df['code'].apply(tokenizer.tokenize)
    print(df['tokens'])
    df['len'] = df['tokens'].apply(len)
    print(f"Tokens Describe: \n{df.describe()}")

    ndf = df[df['len'] <= MAX_LEN].reset_index(drop=True)
    print(f"Max Length Clipping Describe: \n{ndf.describe()}")
    return ndf
    # train_df, val_df, _, _ = train_test_split(
    #     ndf,
    #     ndf['problem_num'],
    #     random_state=SEED,
    #     test_size=TRAIN_TEST_SPLIT_RATIO,
    #     stratify=ndf['problem_num'],
    # )
    # train_df = train_df.reset_index(drop=True)
    # val_df = val_df.reset_index(drop=True)
    # print("Done!")
    # return train_df, val_df

def preprocess_bm25(df, file_name="preprocess_bm25"):
    codes = df['code'].to_list()
    problems = df['problem_num'].unique().tolist()
    problems.sort()

    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = BM25Okapi(tokenized_corpus)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(problems):
        solution_codes = df[df['problem_num'] == problem]['code']
        positive_pairs = list(combinations(solution_codes.to_list(),2))

        solution_codes_indices = solution_codes.index.to_list()
        negative_pairs = []

        first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
        negative_code_scores = bm25.get_scores(first_tokenized_code)
        negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
        ranking_idx = 0

        for solution_code in solution_codes:
            negative_solutions = []
            while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
                high_score_idx = negative_code_ranking[ranking_idx]

                if high_score_idx not in solution_codes_indices:
                    negative_solutions.append(df['code'].iloc[high_score_idx])
                ranking_idx += 1

            for negative_solution in negative_solutions:
                negative_pairs.append((solution_code, negative_solution))

        total_positive_pairs.extend(positive_pairs)
        total_negative_pairs.extend(negative_pairs)

    pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
    pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

    neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
    neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

    pos_label = [1]*len(pos_code1)
    neg_label = [0]*len(neg_code1)

    pos_code1.extend(neg_code1)
    total_code1 = pos_code1
    pos_code2.extend(neg_code2)
    total_code2 = pos_code2
    pos_label.extend(neg_label)
    total_label = pos_label
    pair_data = pd.DataFrame(data={
        'code1':total_code1,
        'code2':total_code2,
        'similar':total_label
    })
    pair_data = pair_data.sample(frac=1).reset_index(drop=True)

    pair_data.to_csv(f'open/{file_name}.csv',index=False)

    
# def preprocess_bm25(df, file_name="preprocess_bm25"):
#     codes = df['code'].to_list()
#     problems = df['problem_num'].unique().tolist()
#     problems.sort()

#     tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
#     bm25 = BM25Okapi(tokenized_corpus)

#     total_positive_pairs = []
#     total_negative_pairs = []

#     for problem in tqdm(problems):
#         solution_codes = df[df['problem_num'] == problem]['code']
#         positive_pairs = list(combinations(solution_codes.to_list(),2))

#         solution_codes_indices = solution_codes.index.to_list()
#         negative_pairs = []

#         first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
#         negative_code_scores = bm25.get_scores(first_tokenized_code)
#         negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
#         ranking_idx = 0

#         for solution_code in solution_codes:
#             negative_solutions = []
#             while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
#                 high_score_idx = negative_code_ranking[ranking_idx]

#                 if high_score_idx not in solution_codes_indices:
#                     negative_solutions.append(df['code'].iloc[high_score_idx])
#                 ranking_idx += 1

#             for negative_solution in negative_solutions:
#                 negative_pairs.append((solution_code, negative_solution))

#         total_positive_pairs.extend(positive_pairs)
#         total_negative_pairs.extend(negative_pairs)

#     pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
#     pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

#     neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
#     neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

#     pos_label = [1]*len(pos_code1)
#     neg_label = [0]*len(neg_code1)

#     pos_code1.extend(neg_code1)
#     total_code1 = pos_code1
#     pos_code2.extend(neg_code2)
#     total_code2 = pos_code2
#     pos_label.extend(neg_label)
#     total_label = pos_label
#     pair_data = pd.DataFrame(data={
#         'code1':total_code1,
#         'code2':total_code2,
#         'similar':total_label
#     })
#     pair_data = pair_data.sample(frac=1).reset_index(drop=True)
#     pair_data.to_csv(f'open/{file_name}.csv',index=False)

## Load Train / Test dataset

In [5]:
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)#, config="tokenizer_config.json")
# print(model.config)
# tokenizer.save("tokenizer.json")
# tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer_config.json")

# print(tokenizer)
######################## PREPROCESS ################################
ndf = make_train_dataset_from_codefolder(CODE_DATA_PATH)
preprocess_bm25(ndf)

# train_df, val_df = make_train_dataset_from_codefolder(CODE_DATA_PATH)
# preprocess_bm25(train_df, "bm25_train_xlnet")
# preprocess_bm25(val_df, "bm25_val_xlnet")
# z
####################################################################
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

train_dataset = load_dataset("csv", data_files=TRAIN_DATA)['train']
# train_dataset = train_dataset.shuffle(seed=SEED).select(range(2000))
train_dataset = train_dataset.select(range(TRAIN_SELECT_NUM))
train_dataset = train_dataset.map(
    preprocess_function,
    remove_columns=['code1', 'code2', 'similar'],
    load_from_cache_file=False,
    batched=True
)

val_dataset = load_dataset("csv", data_files=VAL_DATA)['train']
# val_dataset = val_dataset.shuffle(seed=SEED).select(range(200))
# val_dataset = val_dataset.select(range(VAL_SELECT_NUM))
val_dataset = val_dataset.map(
    preprocess_function,
    remove_columns=['code1', 'code2', 'similar'],
    load_from_cache_file=False,
    batched=True
)
# train_dataset = train_dataset.train_test_split(TRAIN_TEST_SPLIT_RATIO, load_from_cache_file=False)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 10-fold cross-validation
# val_ds = load_dataset("csv", data_files=TRAIN_DATA, split=[f"train[{k}%:{k+10}%]" for k in range(0, 100, 10)])
# train_ds = load_dataset("csv", data_files=TRAIN_DATA, split=[f"train[:{k}%]+train[{k+10}%:]" for k in range(0, 100, 10)])

  0%|          | 0/300 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors


Descirbe: 
                                                     code problem_num
count                                               45101       45101
unique                                              45089         300
top     n, m = list(map(int, input().split()))\nc = li...  problem262
freq                                                    2         153
Head: 
                                                code problem_num
0  dp = [0, 1]\ns = input()\ns = s[::-1]\ns += "0...  problem219
1  m=str(raw_input())\nn=[int(i) for i in m]\nn.i...  problem219
2  s = input()\nINF = float('inf')\ndp = [[INF,IN...  problem219
3  n = input()[::-1]\ndp = [[0, 0] for i in range...  problem219
4  n = str(input())\nn_list = list(reversed(n))\n...  problem219
Length: 
45101
0        [dp, Ġ=, Ġ[, 0, ,, Ġ1, ], Ċ, s, Ġ=, Ġinput, ()...
1        [m, =, str, (, raw, _, input, ()), Ċ, n, =[, i...
2        [s, Ġ=, Ġinput, (), Ċ, IN, F, Ġ=, Ġfloat, (', ...
3        [n, Ġ=, Ġinput, (), [, ::, -, 1, ], Ċ, dp

  0%|          | 0/300 [00:00<?, ?it/s]

Using custom data configuration default-c6341f23fb91ff84


Downloading and preparing dataset csv/default to /home/djlee/.cache/huggingface/datasets/csv/default-c6341f23fb91ff84/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/djlee/.cache/huggingface/datasets/csv/default-c6341f23fb91ff84/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/200 [00:00<?, ?ba/s]

Using custom data configuration default-1d2e243c78cf06c7
Reusing dataset csv (/home/djlee/.cache/huggingface/datasets/csv/default-1d2e243c78cf06c7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

## Define Model and Train

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL) # TODO: RobertaForSequenceClassification 로 바꾸기

# TODO: Lamb 성능 확인하기
# optimizer = optim.Lamb(
#     model.parameters(),
#     lr=INIT_LR,
#     betas=(0.9, 0.999),
#     eps=1e-8,
#     weight_decay=WD,
# )
# scheduler = torch.optim.lr_scheduler.OneCycleLR(
#     optimizer,
#     max_lr=MAX_LR,
#     steps_per_epoch=round(len(train_dataset)/BATCH/GRADIENT_ACCUMULATION_STEPS),
#     epochs=EPOCHS,
# )
metric = load_metric("glue", "mrpc") # "glue", "sst2" , "stsb", "mrpc" or "accuracy"
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    do_train=True,
    do_eval=True,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # TODO: 높일 수록 배치 사이즈 증가하는 효과. 성능 더 좋아 질 수도
    gradient_checkpointing=True, # XLNET 미지원
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # deepspeed=DEEPSPEED_CONFIG,
    # auto_find_batch_size=True,
    dataloader_num_workers=NUM_WORKERS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # optimizers=(optimizer, scheduler), # TODO: 성능 확인
    compute_metrics=metric_function,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at michiyasunaga/BioLinkBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using amp half precision backend
***** Running training *****
  Num examples = 200000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 7810
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdjlee[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1038,0.092736,0.969894,0.969666
1,0.0429,0.052839,0.982471,0.982427
2,0.0233,0.070938,0.981358,0.981365
3,0.0117,0.070066,0.984085,0.984012
4,0.0048,0.086356,0.984418,0.984335


***** Running Evaluation *****
  Num examples = 17970
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1562
Configuration saved in ./results/checkpoint-1562/config.json
Model weights saved in ./results/checkpoint-1562/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1562/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1562/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 17970
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-3124
Configuration saved in ./results/checkpoint-3124/config.json
Model weights saved in ./results/checkpoint-3124/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-3124/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-3124/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 17970
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-4686
Configuration saved in ./results/checkpoint-4686/conf

TrainOutput(global_step=7810, training_loss=0.037283038055087786, metrics={'train_runtime': 28459.0861, 'train_samples_per_second': 35.138, 'train_steps_per_second': 0.274, 'total_flos': 2.6242867683792384e+17, 'train_loss': 0.037283038055087786, 'epoch': 5.0})

## Predict

In [7]:
# 모델 예측
test_dataset = load_dataset("csv", data_files=TEST_DATA)['train']
# test_dataset = test_dataset.shuffle(seed=SEED).select(range(40))
test_dataset = test_dataset.map(
    preprocess_function,
    remove_columns=['code1', 'code2'],
    load_from_cache_file=False,
    batched=True
)
predictions = trainer.predict(test_dataset)

df = pd.read_csv(SUBMISSION)
df['similar'] = np.argmax(predictions.predictions, axis=-1)
df.to_csv('./submission.csv', index=False)

Using custom data configuration default-390241dd22cb4626
Reusing dataset csv (/home/djlee/.cache/huggingface/datasets/csv/default-390241dd22cb4626/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/180 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32
