In [1]:
import os
from tqdm import tqdm

In [2]:
code_folder = "./code"
# One sub-directory per problem is expected under code_folder.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to make
# everything derived from it (row order, seeded split) reproducible. Plain files and
# hidden entries such as ".ipynb_checkpoints" are skipped because they are not problems.
problem_folders = sorted(
    entry
    for entry in os.listdir(code_folder)
    if not entry.startswith(".") and os.path.isdir(os.path.join(code_folder, entry))
)

In [3]:
def preprocess_script(script):
    """Read a source file and return a comment-free, compacted version of it.

    Args:
        script: Path of the UTF-8 encoded source file to read.

    Returns:
        A single string in which the surviving lines are joined with "\n".
        For every line of the file:
          * lines whose first non-blank character is "#" are dropped;
          * trailing whitespace is removed;
          * everything from the first "#" onwards is cut off (this is a plain
            text search, so a "#" inside a string literal is cut as well);
          * each run of four spaces is replaced by one tab;
          * lines that end up empty are dropped.
    """
    kept = []
    with open(script, "r", encoding="utf-8") as handle:
        for raw in handle:
            # Whole-line comments disappear entirely.
            if raw.lstrip().startswith("#"):
                continue
            # partition() yields the full text when no "#" is present.
            text = raw.rstrip().partition("#")[0]
            text = text.replace("\n", "").replace("    ", "\t")
            if text != "":
                kept.append(text)
    return "\n".join(kept)

# Parallel lists: preproc_scripts[i] is the cleaned source text and problem_nums[i]
# is the problem identifier it belongs to.
preproc_scripts = list()
problem_nums = list()

# Both directory listings are sorted because os.listdir() order is arbitrary; without
# this the row order (and therefore any seeded split made later) changes between machines.
for problem_folder in tqdm(sorted(problem_folders)):
    folder_path = os.path.join(code_folder, problem_folder)
    scripts = sorted(os.listdir(folder_path))
    if not scripts:
        # An empty folder has nothing to label; scripts[0] would raise IndexError.
        continue

    # The problem identifier is the file-name prefix before the first underscore.
    problem_num = scripts[0].split("_")[0]
    for script in scripts:
        script_file = os.path.join(folder_path, script)
        preprocessed_script = preprocess_script(script_file)
        preproc_scripts.append(preprocessed_script)
    problem_nums.extend([problem_num] * len(scripts))

100%|█████████████████████████████████████████| 300/300 [00:08<00:00, 35.13it/s]


In [6]:
import pandas as pd
# One row per preprocessed script, labelled with the problem it solves.
df = pd.DataFrame({"code": preproc_scripts, "problem_num": problem_nums})

In [7]:
# Count how many scripts were collected for each problem.
temp_dict = {}
for prob_num in df["problem_num"]:
    temp_dict[prob_num] = temp_dict.get(prob_num, 0) + 1

# Inspect temp_dict to see the per-problem counts.

In [8]:
# Number of rows in df, i.e. how many preprocessed scripts were collected.
len(df)

45101

In [9]:
from transformers import AutoTokenizer

# Tokenize every script with the GraphCodeBERT tokenizer and record the token count,
# so that over-long scripts can be identified.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
df["tokens"] = df["code"].map(tokenizer.tokenize)
df["len"] = df["tokens"].map(len)
df.describe()

Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,len
count,45101.0
mean,160.123789
std,500.930345
min,5.0
25%,61.0
50%,108.0
75%,200.0
max,97566.0


In [10]:
# Keep only scripts of at most 512 tokens and renumber the rows from 0.
ndf = df.loc[df["len"].le(512)].reset_index(drop=True)
ndf.describe()

Unnamed: 0,len
count,43647.0
mean,137.920842
std,104.933475
min,5.0
25%,60.0
50%,104.0
75%,187.0
max,512.0


In [12]:
# Load the test pairs; the cells below read its "code1" and "code2" columns.
data_frame = pd.read_csv("./data/test.csv")

In [13]:
# Length in characters (not tokens) of the first snippet of every test pair.
data_frame["code1_len"] = data_frame["code1"].map(len)
data_frame["code1_len"].describe()

count    179700.000000
mean        392.408347
std         923.698933
min          20.000000
25%         153.000000
50%         255.000000
75%         489.000000
max      203699.000000
Name: code1_len, dtype: float64

In [14]:
# Length in characters (not tokens) of the second snippet of every test pair.
data_frame["code2_len"] = data_frame["code2"].map(len)
data_frame["code2_len"].describe()

count    179700.000000
mean        390.010367
std        1333.079216
min          15.000000
25%         146.000000
50%         254.000000
75%         477.000000
max      203669.000000
Name: code2_len, dtype: float64

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the scripts for validation, stratified by problem so that every
# problem keeps the same share of scripts in both parts. The seed fixes the split.
train_df, valid_df, train_label, valid_label = train_test_split(
    ndf,
    ndf["problem_num"],
    test_size=0.1,
    stratify=ndf["problem_num"],
    random_state=42,
)

# Renumber the rows of both frames from 0 so that row labels equal row positions.
train_df, valid_df = (part.reset_index(drop=True) for part in (train_df, valid_df))

In [16]:
from rank_bm25 import BM25Okapi
from itertools import combinations

#### Train negative pair 구성

In [57]:
# Build the labelled pair table for training and write it to ./data/train_data.csv.
#   * positive pairs (similar = 1): every 2-combination of solutions to the same problem;
#   * negative pairs (similar = 0): each solution paired with solutions to *other*
#     problems, taken in descending BM25 score against the problem's first solution.
# BM25 rankings are row positions, while the membership test uses row labels, so this
# cell relies on train_df having been re-indexed from 0 (labels == positions).
codes = train_df["code"].to_list()
problems = sorted(train_df["problem_num"].unique().tolist())

tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
bm25 = BM25Okapi(tokenized_corpus)

total_positive_pairs = list()
total_negative_pairs = list()

for problem in tqdm(problems):
    solution_codes = train_df[train_df["problem_num"] == problem]["code"]
    positive_pairs = list(combinations(solution_codes.to_list(), 2))
    if not positive_pairs:
        # Fewer than two solutions: there is no positive pair and therefore no
        # query snippet; positive_pairs[0] would raise IndexError.
        continue

    # A set gives O(1) membership tests inside the ranking scan below.
    solution_codes_indices = set(solution_codes.index.to_list())
    negative_pairs = list()

    # Rank the whole corpus once per problem, most similar first.
    first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
    negative_code_scores = bm25.get_scores(first_tokenized_code)
    negative_code_ranking = negative_code_scores.argsort()[::-1]
    ranking_idx = 0  # shared cursor: each solution continues where the previous stopped

    # Gives roughly as many negatives as positives for the problem.
    negatives_per_solution = len(positive_pairs) // len(solution_codes)

    for solution_code in solution_codes:
        negative_solutions = list()
        # The second condition stops cleanly if the ranking runs out of candidates
        # instead of indexing past its end.
        while (
            len(negative_solutions) < negatives_per_solution
            and ranking_idx < len(negative_code_ranking)
        ):
            high_score_idx = negative_code_ranking[ranking_idx]
            if high_score_idx not in solution_codes_indices:
                negative_solutions.append(codes[high_score_idx])
            ranking_idx += 1

        negative_pairs.extend(
            (solution_code, negative_solution) for negative_solution in negative_solutions
        )

    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

# Positives first, then negatives; the labels follow the same order.
all_pairs = total_positive_pairs + total_negative_pairs
total_code1 = [first for first, _ in all_pairs]
total_code2 = [second for _, second in all_pairs]
total_label = [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs)

pair_data = pd.DataFrame(data={
    "code1" : total_code1,
    "code2" : total_code2,
    "similar" : total_label
})
# Seeded shuffle so that re-running the cell writes the same file.
pair_data = pair_data.sample(frac=1, random_state=42).reset_index(drop=True)

pair_data.to_csv("./data/train_data.csv", index=False)

100%|█████████████████████████████████████████| 300/300 [14:45<00:00,  2.95s/it]


#### Validation negative pair 구성

In [58]:
# Build the labelled pair table for validation and write it to ./data/valid_data.csv.
# Positive pairs are all 2-combinations of solutions to one problem; negative pairs
# combine each solution with solutions to other problems, chosen by descending BM25
# score against the problem's first solution.
# valid_df must be indexed 0..n-1: BM25 rankings are positions, the exclusion set
# below holds row labels.
codes = valid_df["code"].to_list()
problems = sorted(valid_df["problem_num"].unique().tolist())

tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
bm25 = BM25Okapi(tokenized_corpus)

total_positive_pairs = list()
total_negative_pairs = list()

for problem in tqdm(problems):
    solution_codes = valid_df[valid_df["problem_num"] == problem]["code"]
    positive_pairs = list(combinations(solution_codes.to_list(), 2))
    if not positive_pairs:
        # A problem with a single validation solution yields no pair at all;
        # skipping it avoids the IndexError on positive_pairs[0].
        continue

    # Set membership keeps the scan over the ranking linear.
    solution_codes_indices = set(solution_codes.index.to_list())
    negative_pairs = list()

    first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
    negative_code_scores = bm25.get_scores(first_tokenized_code)
    negative_code_ranking = negative_code_scores.argsort()[::-1]  # best match first
    ranking_idx = 0  # cursor shared by all solutions of this problem

    negatives_per_solution = len(positive_pairs) // len(solution_codes)

    for solution_code in solution_codes:
        negative_solutions = list()
        # Stop when the quota is met or when no candidates are left to examine.
        while (
            len(negative_solutions) < negatives_per_solution
            and ranking_idx < len(negative_code_ranking)
        ):
            high_score_idx = negative_code_ranking[ranking_idx]
            if high_score_idx not in solution_codes_indices:
                negative_solutions.append(codes[high_score_idx])
            ranking_idx += 1

        negative_pairs.extend(
            (solution_code, negative_solution) for negative_solution in negative_solutions
        )

    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

# Column lists: positives first, negatives after, labels in the same order.
all_pairs = total_positive_pairs + total_negative_pairs
total_code1 = [first for first, _ in all_pairs]
total_code2 = [second for _, second in all_pairs]
total_label = [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs)

pair_data = pd.DataFrame(data={
    "code1" : total_code1,
    "code2" : total_code2,
    "similar" : total_label
})
# Fixed seed: the validation file is identical on every re-run.
pair_data = pair_data.sample(frac=1, random_state=42).reset_index(drop=True)

pair_data.to_csv("./data/valid_data.csv", index=False)

100%|█████████████████████████████████████████| 300/300 [00:55<00:00,  5.39it/s]


#### Train Part

In [22]:
import torch
from transformers import AutoModel, RobertaForSequenceClassification

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained GraphCodeBERT encoder wrapped with a sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model.to(device)

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
# dir(model)

In [17]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
import numpy as np
from datasets import load_dataset, load_metric

2022-05-26 16:47:52.018573: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [18]:
MODEL = "microsoft/graphcodebert-base"
INPUT = "./data/train_data.csv"
VAL_INPUT = "./data/valid_data.csv"
MAX_LEN = 512  # upper bound on the token length of an encoded (code1, code2) pair

# load_dataset("csv", ...) exposes a single CSV file under the "train" split.
dataset = load_dataset("csv", data_files=INPUT)['train']
val_dataset = load_dataset("csv", data_files=VAL_INPUT)["train"]
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def example_fn(examples):
    """Encode code1/code2 as one paired input for the classifier.

    Works both on a single row (dict of scalars) and on a batch (dict of lists),
    so it can be passed to ``Dataset.map`` with or without ``batched=True``.

    Args:
        examples: Mapping with "code1" and "code2", and optionally "similar".

    Returns:
        The tokenizer output, with "labels" copied from "similar" when present.
        Pairs longer than MAX_LEN tokens are truncated. No padding is stored:
        padding is left to the data collator at batch-assembly time.
    """
    # padding=False stores the same features as padding=True did for one row at a
    # time, and avoids padding every row to the longest row of a map() batch.
    outputs = tokenizer(examples['code1'], examples['code2'], padding=False, max_length=MAX_LEN, truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

# batched=True tokenizes many rows per call instead of one call per row, which
# matters for the multi-million-row training file.
dataset = dataset.map(example_fn, batched=True, remove_columns=['code1', 'code2', 'similar'])
val_dataset = val_dataset.map(example_fn, batched=True, remove_columns=["code1", "code2", "similar"])

# model = RobertaForSequenceClassification.from_pretrained(MODEL)
# NOTE (original author, not verified here): unlike BertForSequenceClassification,
# RobertaForSequenceClassification has no pooler by default, so it can be used for
# sentence similarity.


Using custom data configuration default-ce35a410e25f9cbd
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-ce35a410e25f9cbd/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-cc9a8ee57a12244a
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-cc9a8ee57a12244a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5133767 [00:00<?, ?ex/s]

  0%|          | 0/59389 [00:00<?, ?ex/s]

In [20]:
# The collator turns the individual examples drawn from a dataset into one batch,
# padding them to a common length.
# It does not extend the model's input capacity: each (code1, code2) pair is already
# truncated to MAX_LEN tokens in total by the tokenizer call in example_fn, so the two
# snippets share that budget rather than getting 512 tokens each.
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# GLUE / SST-2 metric object used to score predictions during evaluation.
_metric = load_metric("glue", "sst2")

def metric_fn(p):
    """Score one evaluation pass.

    Args:
        p: A (predictions, labels) pair; predictions hold one score per class
           along the last axis.

    Returns:
        Whatever ``_metric.compute`` returns for the arg-max class predictions.
    """
    logits, references = p
    predicted_classes = np.argmax(logits, axis=-1)
    return _metric.compute(predictions=predicted_classes, references=references)

In [23]:
# Training configuration.
# Evaluation and checkpointing both happen every 500 steps, so each evaluation has a
# matching checkpoint for load_best_model_at_end to restore. metric_for_best_model is
# left unset, so the library default is used to pick the best checkpoint.
#
# save_total_limit caps how many checkpoints stay on disk. Without it a checkpoint is
# kept for every 500 steps of the whole run; the run recorded below died with a
# RuntimeError while writing a checkpoint, which is consistent with the disk filling
# up -- TODO confirm on the training machine.
args = TrainingArguments(
    './runs/',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    do_train=True,
    do_eval=True,
    save_strategy="steps",
    logging_strategy="steps",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    learning_rate=1e-5,
    load_best_model_at_end=True,
)

# Early stopping gives up after 20 consecutive evaluations without improvement.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=_collator,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=metric_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=20)],
)

In [230]:
# Clear the GPU cache: collect unreachable Python objects first, then ask torch to
# release its cached CUDA memory, before starting the training run.
import gc
gc.collect()
torch.cuda.empty_cache()

trainer.train()

***** Running training *****
  Num examples = 5133767
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1925163


Step,Training Loss,Validation Loss,Accuracy
500,0.0064,0.502069,0.945495
1000,0.0181,0.316342,0.950681
1500,0.0512,0.319337,0.949351
2000,0.0432,0.330152,0.953965
2500,0.057,0.276938,0.95447
3000,0.0553,0.254684,0.956406
3500,0.0567,0.258361,0.955968
4000,0.045,0.285095,0.957214
4500,0.0433,0.264958,0.955345
5000,0.0445,0.272171,0.95319


***** Running Evaluation *****
  Num examples = 59389
  Batch size = 32
Saving model checkpoint to ./runs/checkpoint-500
Configuration saved in ./runs/checkpoint-500/config.json
Model weights saved in ./runs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./runs/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./runs/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 32
Saving model checkpoint to ./runs/checkpoint-1000
Configuration saved in ./runs/checkpoint-1000/config.json
Model weights saved in ./runs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./runs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./runs/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 32
Saving model checkpoint to ./runs/checkpoint-1500
Configuration saved in ./runs/checkpoint-1500/config.json
Model weights saved in ./runs/che

RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 678533440 vs 678533328

In [24]:
# Restore the fine-tuned classifier from a saved checkpoint and place it on `device`.
model = RobertaForSequenceClassification.from_pretrained("./runs/checkpoint-53500").to(device)

# Same step-based schedule as the training run: evaluate every 500 steps, with
# saving, logging and evaluation all driven by step count.
args = TrainingArguments(
    output_dir='./runs/',
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
)

# Trainer wrapped around the restored model; early stopping allows 20 evaluations
# without improvement.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    data_collator=_collator,
    tokenizer=tokenizer,
    compute_metrics=metric_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=20)],
)

loading configuration file ./runs/checkpoint-53500/config.json
Model config RobertaConfig {
  "_name_or_path": "microsoft/graphcodebert-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ./runs/checkpoint-53500/pytorch_model.bin
All model checkpoint weights were use

In [25]:
import gc
import os

import pandas as pd

# Free Python garbage and cached CUDA memory before the inference pass.
gc.collect()
torch.cuda.empty_cache()

TEST = "./data/test.csv"
SUB = "./data/sample_submission.csv"
SUBMISSION_PATH = "./submissions/submission.csv"

# Create the output directory up front: to_csv() does not create missing directories,
# and failing there would throw away the whole prediction pass.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)

# The test file has no "similar" column, so example_fn adds no labels here.
test_dataset = load_dataset("csv", data_files=TEST)["train"]
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

predictions = trainer.predict(test_dataset)

# NOTE: this rebinds `df`, which earlier cells use for the table of scripts.
df = pd.read_csv(SUB)
# The predicted class is the index of the larger of the per-class scores.
df["similar"] = np.argmax(predictions.predictions, axis=-1)
df.to_csv(SUBMISSION_PATH, index=False)

Using custom data configuration default-e82830c223845fb2
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-e82830c223845fb2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32


In [42]:
# Show the first two direct sub-modules of the classifier, separated by a dashed rule.
model_list = [child for child in model.children()]
print(model_list[0], "-" * 100, model_list[1], sep="\n")

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop