## Import

In [1]:
import logging
import logging.config
import re
import os
import random
import datetime
from pprint import pprint
from itertools import combinations

import torch
from torch.utils.data import DataLoader
from torch.utils.checkpoint import checkpoint_sequential

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, get_scheduler
from datasets import load_dataset, load_metric
from accelerate import Accelerator

from tqdm.auto import tqdm
from rank_bm25 import BM25Okapi
import torch_optimizer as optim

## Hyper parameters

In [2]:
TRAIN_DATA = "open/preprocess_bm25.csv"
SAMPLE_DATA = "open/sample_train.csv"
CODE_DATA_PATH = "open/code"
TEST_DATA = "open/test.csv"
SUBMISSION = 'open/sample_submission.csv'
PRETRAINED_MODEL = "michiyasunaga/LinkBERT-base" # TODO: Large 모델 성능 실험
NUM_LABELS = 2
MAX_LEN = 512
BATCH = 32
GRADIENT_ACCUMULATION_STEPS = 4
GRADIENT_CHECKPOINTING = True
EPOCHS = 5
LR = 2e-5
WD = 1e-2
SEED = 42
TRAIN_TEST_SPLIT_RATIO = 0.1
OUTPUT_DIR = "./results"
SAVE_MODEL = f"{PRETRAINED_MODEL}_{datetime.datetime.now().strftime('%H:%M:%S:%m')}"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

## Logging

In [3]:
config = {
    "version": 1,
    "formatters": {
        "simple": {"format": "[%(asctime)s] %(message)s", "datefmt": "%Y-%m-%d %H:%M:%S"},
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "simple",
            "level": "INFO",
        },
        "file": {
            "class": "logging.FileHandler",
            "filename": f"{datetime.datetime.now().strftime('%H:%M:%S:%m')}.log",
            "formatter": "simple",
            "level": "INFO",
        },
    },
    "root": {"handlers": ["console", "file"], "level": "INFO"},
    "loggers": {"parent": {"level": "INFO"}, "parent.child": {"level": "DEBUG"},},
}

logging.config.dictConfig(config)
logger = logging.getLogger()

## Fix seed

In [4]:
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = False  # True 할 시 연산속도 감소. 마지막에 고정시킬 때 사용 권장.
torch.backends.cudnn.benchmark = True

## Class & Functions

### Utils

In [5]:
class AverageMeter(object):

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
        
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
def preprocess_function(examples):
    for i in range(1, 3):
        for j in range(len(examples[f"code{i}"])):
            examples[f"code{i}"][j] = re.sub(r"^#.*", "", examples[f"code{i}"][j], flags=re.MULTILINE)
            examples[f"code{i}"][j] = re.sub(r'""".*?"""', "", examples[f"code{i}"][j], flags=re.S)
            examples[f"code{i}"][j] = re.sub(r"^\n", "", examples[f"code{i}"][j], flags=re.MULTILINE)
    outputs = tokenizer(examples['code1'], examples['code2'], padding="max_length", max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

def save_model(save_name, model, optimizer, epoch, train_loss):
    torch.save({
        "epoch": epoch,
        "total_epoch": EPOCHS,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": train_loss,
    }, f"{save_name}.pt")

### Preprocess script to csv

In [6]:
# def make_train_dataset_from_codefolder(path):
#     scripts_list = []
#     problem_nums = []

#     for problem_folder in tqdm(os.listdir(path)):
#         scripts = os.listdir(os.path.join(path, problem_folder))
#         problem_num = scripts[0].split('_')[0]
#         for script in scripts:
#             script_file = os.path.join(path, problem_folder, script)
#             with open(script_file, 'r', encoding='utf-8') as file:
#                 lines = file.read()
#             scripts_list.append(lines)
#         problem_nums.extend([problem_num]*len(scripts))

#     df = pd.DataFrame(data = {'code':scripts_list, 'problem_num':problem_nums})
#     logger.info(f"Descirbe: \n{df.describe()}")
#     logger.info(f"Head: \n{df.head()}")
#     logger.info(f"Length: \n{len(df)}")

#     df['tokens'] = df['code'].apply(tokenizer.tokenize)
#     df['len'] = df['tokens'].apply(len)
#     logger.info(f"Tokens Describe: \n{df.describe()}")

#     ndf = df[df['len'] <= 512].reset_index(drop=True)
#     logger.info(f"Max Length Clipping Describe: \n{ndf.describe()}")
#     logger.info("Done!")
#     return ndf
    
# def preprocess_bm25(df, file_name="preprocess_bm25"):
#     codes = df['code'].to_list()
#     problems = df['problem_num'].unique().tolist()
#     problems.sort()

#     tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
#     bm25 = BM25Okapi(tokenized_corpus)

#     total_positive_pairs = []
#     total_negative_pairs = []

#     for problem in tqdm(problems):
#         solution_codes = df[df['problem_num'] == problem]['code']
#         positive_pairs = list(combinations(solution_codes.to_list(),2))

#         solution_codes_indices = solution_codes.index.to_list()
#         negative_pairs = []

#         first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
#         negative_code_scores = bm25.get_scores(first_tokenized_code)
#         negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
#         ranking_idx = 0

#         for solution_code in solution_codes:
#             negative_solutions = []
#             while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
#                 high_score_idx = negative_code_ranking[ranking_idx]

#                 if high_score_idx not in solution_codes_indices:
#                     negative_solutions.append(df['code'].iloc[high_score_idx])
#                 ranking_idx += 1

#             for negative_solution in negative_solutions:
#                 negative_pairs.append((solution_code, negative_solution))

#         total_positive_pairs.extend(positive_pairs)
#         total_negative_pairs.extend(negative_pairs)

#     pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
#     pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

#     neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
#     neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

#     pos_label = [1]*len(pos_code1)
#     neg_label = [0]*len(neg_code1)

#     pos_code1.extend(neg_code1)
#     total_code1 = pos_code1
#     pos_code2.extend(neg_code2)
#     total_code2 = pos_code2
#     pos_label.extend(neg_label)
#     total_label = pos_label
#     pair_data = pd.DataFrame(data={
#         'code1':total_code1,
#         'code2':total_code2,
#         'similar':total_label
#     })
#     pair_data = pair_data.sample(frac=1).reset_index(drop=True)
#     pair_data.to_csv(f'open/{file_name}.csv',index=False)

### Trainer

In [7]:
def trainer(model, optimizer, dataloader):
    train_loss = AverageMeter()
    model.train()
    with tqdm(dataloader, total=len(dataloader), unit="batch") as train_bar:
        for idx, batch in enumerate(train_bar, start=1):
            outputs = model(**batch)
            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            accelerator.backward(loss)
            if (idx % GRADIENT_ACCUMULATION_STEPS == 0) or (idx == len(dataloader)):
                optimizer.step()
                optimizer.zero_grad()
                train_loss.update(loss.item(), BATCH)
                train_bar.set_postfix(train_loss=loss.item())
    return train_loss.avg

def valid(model, dataloader):
    metric = load_metric("accuracy")
    val_loss = AverageMeter()
    model.eval()
    with tqdm(dataloader, total=len(dataloader), unit="batch") as val_bar:
        for idx, batch in enumerate(val_bar):
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            if (idx % GRADIENT_ACCUMULATION_STEPS == 0) or (idx == len(dataloader)):
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                metric.add_batch(predictions=predictions, references=batch["labels"])
                val_loss.update(loss.item(), BATCH)
                val_bar.set_postfix(val_loss=loss.item())
    accuracy = metric.compute()["accuracy"]
    return val_loss.avg, accuracy

def predict(model, dataloader):
    pred_list = []
    model.eval()
    for batch in tqdm(dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1).tolist()
        pred_list.extend(predictions)
    return pred_list

## Load Train / Val / Test dataset

In [8]:
# preprocess code folder to csv
# df = make_train_dataset_from_codefolder(CODE_DATA_PATH)
# preprocess_bm25(df)

dataset = load_dataset("csv", data_files=TRAIN_DATA)['train']
dataset = dataset.shuffle(seed=SEED).select(range(40000))
dataset = dataset.map(
    preprocess_function,
    remove_columns=['code1', 'code2', 'similar'],
    load_from_cache_file=False,
    batched=True
)
dataset = dataset.train_test_split(
    TRAIN_TEST_SPLIT_RATIO,
    load_from_cache_file=False
)
dataset.set_format("torch")

train_dataloader = DataLoader(
    dataset["train"],
    shuffle=True,
    batch_size=BATCH,
    pin_memory=True,
)

val_dataloader = DataLoader(
    dataset["test"],
    batch_size=BATCH,
    pin_memory=True,
)

test_dataset = load_dataset("csv", data_files=TEST_DATA)['train']
test_dataset = test_dataset.map(
    preprocess_function,
    remove_columns=['pair_id', 'code1', 'code2'],
    load_from_cache_file=True,
    batched=True
)
test_dataset.set_format("torch")
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH,
    pin_memory=True,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

## Define Model

In [9]:
accelerator = Accelerator(
    fp16=True,
    log_with="wandb"
)

model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=NUM_LABELS
)

optimizer = optim.Lamb(
    model.parameters(),
    lr=LR,
    betas=(0.9, 0.999),
    eps=1e-6,
    weight_decay=WD
)

# scheduler = get_scheduler(
#     name="linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=EPOCHS*len(train_dataloader)
# )

if GRADIENT_CHECKPOINTING:
    model.gradient_checkpointing_enable()

model, optimizer, train_dataloader, val_dataloader, test_dataloader = accelerator.prepare(model, optimizer, train_dataloader, val_dataloader, test_dataloader)

## Train

In [None]:
best_accuracy = 0
for epoch in range(EPOCHS):
    train_loss = trainer(model, optimizer, train_dataloader)
    val_loss, val_accuracy = valid(model, val_dataloader)
    if val_accuracy > best_accuracy:
        save_model("best", model, optimizer, epoch, train_loss)
    save_model("last", model, optimizer, epoch, train_loss)
    logger.info(f"epoch:{epoch}/{EPOCHS} | train -> loss:{train_loss}")
    logger.info(f"epoch:{epoch}/{EPOCHS} | validation -> loss:{val_loss} | acc: {val_accuracy}")

  0%|          | 0/1125 [00:00<?, ?batch/s]

  0%|          | 0/125 [00:00<?, ?batch/s]

[2022-05-22 19:04:48] epoch:0/5 | train -> loss:0.1455546964144876
[2022-05-22 19:04:48] epoch:0/5 | validation -> loss:0.1161816418170929 | acc: 0.7861328125


  0%|          | 0/1125 [00:00<?, ?batch/s]

  0%|          | 0/125 [00:00<?, ?batch/s]

[2022-05-22 19:22:22] epoch:1/5 | train -> loss:0.1012621762904715
[2022-05-22 19:22:22] epoch:1/5 | validation -> loss:0.09480004012584686 | acc: 0.8359375


  0%|          | 0/1125 [00:00<?, ?batch/s]

  0%|          | 0/125 [00:00<?, ?batch/s]

[2022-05-22 19:39:59] epoch:2/5 | train -> loss:0.08172469950736837
[2022-05-22 19:39:59] epoch:2/5 | validation -> loss:0.07689379900693893 | acc: 0.87109375


  0%|          | 0/1125 [00:00<?, ?batch/s]

## Predict

In [None]:
predictions = predict(model, test_dataloader)

df = pd.read_csv(SUBMISSION)
df['similar'] = predictions
df.to_csv('./submission.csv', index=False)