## Imports

In [1]:
import re
import os
import random
import torch
from pprint import pprint
from itertools import combinations

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from tqdm.notebook import tqdm
from rank_bm25 import BM25Okapi
import torch_optimizer as optim

## Hyperparameters

In [2]:
# Input / output locations.
CODE_DATA_PATH = "open/code"
SAMPLE_DATA = "open/sample_train.csv"
TRAIN_DATA = "open/preprocess_bm25.csv"
TEST_DATA = "open/test.csv"
SUBMISSION = 'open/sample_submission.csv'
OUTPUT_DIR = "./results"

# Model and tokenization.
PRETRAINED_MODEL = "michiyasunaga/LinkBERT-base"  # TODO: benchmark the Large variant
NUM_LABELS = 2
MAX_LEN = 512  # passed to the tokenizer as max_length (inputs are truncated to it)

# Optimization schedule.
EPOCHS = 5
BATCH = 32  # per-device batch size, used for both training and evaluation
GRADIENT_ACCUMULATION_STEPS = 4
LR = 2e-5
WD = 1e-2  # weight decay

# Reproducibility and validation split.
SEED = 42
TRAIN_TEST_SPLIT_RATIO = 0.1  # passed to train_test_split as the held-out size

## Fix seed

In [3]:
# Seed every random number generator this notebook touches with the same value.
os.environ["PYTHONHASHSEED"] = str(SEED)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# cuDNN: benchmark mode stays on and strict determinism stays off.
# Setting deterministic=True slows computation; recommended only for the final,
# fully pinned run.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

## Functions

In [4]:
# Tokenizer loaded from the PRETRAINED_MODEL checkpoint; preprocess_function below
# uses it to encode each (code1, code2) pair.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
def preprocess_function(examples):
    """Strip comments/docstrings from a code pair and tokenize it.

    Accepts both calling conventions of ``Dataset.map``: with ``batched=False``
    each column is a single string, with ``batched=True`` each column is a list
    of strings. The previous version applied ``re.sub`` to the column directly
    and therefore raised ``TypeError`` for batched input.

    Args:
        examples: Mapping with "code1" and "code2" (``str`` or list of ``str``)
            and, optionally, "similar" holding the label(s).

    Returns:
        The tokenizer output for the cleaned pair(s); when "similar" is present
        its value is copied into the output under "labels".
    """
    def _clean(code):
        # Remove "#" comments that start at column 0 (indented ones are kept).
        code = re.sub(r"^#.*", "", code, flags=re.MULTILINE)
        # Remove triple-double-quoted blocks, including multi-line ones.
        code = re.sub(r'""".*?"""', "", code, flags=re.S)
        # Remove lines that are now completely empty.
        return re.sub(r"^\n", "", code, flags=re.MULTILINE)

    cleaned = {}
    for key in ("code1", "code2"):
        column = examples[key]
        if isinstance(column, str):
            cleaned[key] = _clean(column)
        else:
            # Batched call: clean every snippet of the column individually.
            cleaned[key] = [_clean(code) for code in column]

    outputs = tokenizer(cleaned["code1"], cleaned["code2"], padding=True, max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

# Metric object loaded once at module level; metric_function below delegates to
# its compute() method.
metric = load_metric("glue", "sst2") # "glue", "sst2" or "accuracy"
def metric_function(p):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        p: Pair ``(predictions, labels)``. The predicted class is taken as the
            argmax of ``predictions`` over its last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes and labels.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# def make_train_dataset_from_codefolder(path):
#     scripts_list = []
#     problem_nums = []

#     for problem_folder in tqdm(os.listdir(path)):
#         scripts = os.listdir(os.path.join(path, problem_folder))
#         problem_num = scripts[0].split('_')[0]
#         for script in scripts:
#             script_file = os.path.join(path, problem_folder, script)
#             with open(script_file, 'r', encoding='utf-8') as file:
#                 lines = file.read()
#             scripts_list.append(lines)
#         problem_nums.extend([problem_num]*len(scripts))

#     df = pd.DataFrame(data = {'code':scripts_list, 'problem_num':problem_nums})
#     print(f"Descirbe: \n{df.describe()}")
#     print(f"Head: \n{df.head()}")
#     print(f"Length: \n{len(df)}")

#     df['tokens'] = df['code'].apply(tokenizer.tokenize)
#     df['len'] = df['tokens'].apply(len)
#     print(f"Tokens Describe: \n{df.describe()}")

#     ndf = df[df['len'] <= 512].reset_index(drop=True)
#     print(f"Max Length Clipping Describe: \n{ndf.describe()}")
#     print("Done!")
#     return ndf
    
# def preprocess_bm25(df, file_name="preprocess_bm25"):
#     codes = df['code'].to_list()
#     problems = df['problem_num'].unique().tolist()
#     problems.sort()

#     tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
#     bm25 = BM25Okapi(tokenized_corpus)

#     total_positive_pairs = []
#     total_negative_pairs = []

#     for problem in tqdm(problems):
#         solution_codes = df[df['problem_num'] == problem]['code']
#         positive_pairs = list(combinations(solution_codes.to_list(),2))

#         solution_codes_indices = solution_codes.index.to_list()
#         negative_pairs = []

#         first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
#         negative_code_scores = bm25.get_scores(first_tokenized_code)
#         negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
#         ranking_idx = 0

#         for solution_code in solution_codes:
#             negative_solutions = []
#             while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
#                 high_score_idx = negative_code_ranking[ranking_idx]

#                 if high_score_idx not in solution_codes_indices:
#                     negative_solutions.append(df['code'].iloc[high_score_idx])
#                 ranking_idx += 1

#             for negative_solution in negative_solutions:
#                 negative_pairs.append((solution_code, negative_solution))

#         total_positive_pairs.extend(positive_pairs)
#         total_negative_pairs.extend(negative_pairs)

#     pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
#     pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

#     neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
#     neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

#     pos_label = [1]*len(pos_code1)
#     neg_label = [0]*len(neg_code1)

#     pos_code1.extend(neg_code1)
#     total_code1 = pos_code1
#     pos_code2.extend(neg_code2)
#     total_code2 = pos_code2
#     pos_label.extend(neg_label)
#     total_label = pos_label
#     pair_data = pd.DataFrame(data={
#         'code1':total_code1,
#         'code2':total_code2,
#         'similar':total_label
#     })
#     pair_data = pair_data.sample(frac=1).reset_index(drop=True)
#     pair_data.to_csv(f'open/{file_name}.csv',index=False)

## Load Train / Test dataset

In [5]:
# One-off generation of the pair CSV read below (needs the commented-out helpers
# in the Functions cell to be re-enabled first).
# df = make_train_dataset_from_codefolder(CODE_DATA_PATH)
# preprocess_bm25(df)

MAX_TRAIN_PAIRS = 40000  # upper bound on the number of pairs kept for this run

dataset = load_dataset("csv", data_files=TRAIN_DATA)['train']
# NOTE(review): the cache file name is fixed and does not encode TRAIN_DATA or the
# preprocessing logic; an existing "sample_train" file is reused as-is, so delete
# it whenever the input CSV or preprocess_function changes.
dataset = dataset.map(
    preprocess_function,
    remove_columns=['code1', 'code2', 'similar'],
    load_from_cache_file=True,
    cache_file_name="sample_train",
    batched=True
)
dataset = dataset.shuffle(seed=SEED)
# Clamp the subset size so a CSV with fewer rows than MAX_TRAIN_PAIRS does not raise.
dataset = dataset.select(range(min(MAX_TRAIN_PAIRS, len(dataset))))
# Seed the split as well; without it the validation set differs between runs.
dataset = dataset.train_test_split(test_size=TRAIN_TEST_SPLIT_RATIO, seed=SEED)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# 10-fold cross-validation
# val_ds = load_dataset("csv", data_files=TRAIN_DATA, split=[f"train[{k}%:{k+10}%]" for k in range(0, 100, 10)])
# train_ds = load_dataset("csv", data_files=TRAIN_DATA, split=[f"train[:{k}%]+train[{k+10}%:]" for k in range(0, 100, 10)])

Using custom data configuration default-1a11ee059614c6a4
Reusing dataset csv (/home/djlee/.cache/huggingface/datasets/csv/default-1a11ee059614c6a4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at sample_train
Loading cached shuffled indices for dataset at cache-8c47d3c8f099c300.arrow


## Define Model and Train

In [None]:
# Sequence-pair classifier on top of the pretrained encoder; the classification
# head is newly initialized and trained below.
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=NUM_LABELS)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    weight_decay=WD,
    do_train=True,
    do_eval=True,
    # Save, log and evaluate once per epoch; the three strategies must agree for
    # load_best_model_at_end to pick a checkpoint.
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # TODO: raising this acts like a larger batch size; may improve results
    gradient_checkpointing=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    label_smoothing_factor=0.1,
    # Pass the notebook seed explicitly; otherwise Trainer uses its own default
    # and changing SEED would not affect training.
    seed=SEED,
    # TODO: benchmark the Lamb optimizer.
    # NOTE(review): `optim` expects an optimizer name, not an instance; a custom
    # optimizer is normally handed to Trainer via `optimizers=(optimizer, scheduler)` - confirm.
    # optim=optim.Lamb(model.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-6, weight_decay=WD)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_function
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at michiyasunaga/LinkBERT-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using amp half precision backend
***** Running training *****
  Num examples = 36000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 1405
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdjlee[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
0,0.35,0.287115,0.94575
1,0.2696,0.272454,0.95625


***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-281
Configuration saved in ./results/checkpoint-281/config.json
Model weights saved in ./results/checkpoint-281/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-281/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-281/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-562
Configuration saved in ./results/checkpoint-562/config.json
Model weights saved in ./results/checkpoint-562/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-562/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-562/special_tokens_map.json


## Inference

In [None]:
# Model inference: encode the test pairs exactly like the training pairs.
test_dataset = load_dataset("csv", data_files=TEST_DATA, split="train")
test_dataset = test_dataset.map(preprocess_function, remove_columns=['code1', 'code2'])

predictions = trainer.predict(test_dataset)
# Highest-scoring class per test pair.
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Fill the sample submission's "similar" column and write it out.
df = pd.read_csv(SUBMISSION).assign(similar=predicted_labels)
df.to_csv('./submission.csv', index=False)