In [1]:
import os
from tqdm import tqdm

In [2]:
code_folder = "./code"
problem_folders = os.listdir(code_folder)

In [3]:
def preprocess_script(script):
    with open(script, "r", encoding="utf-8") as file:
        lines = file.readlines()
        preproc_lines = list()
        for line in lines:
            if line.lstrip().startswith("#"):
                continue
            line = line.rstrip()
            if "#" in line:
                line = line[:line.index("#")]
            line = line.replace("\n", "")
            line = line.replace("    ", "\t")
            if line == "":
                continue
            preproc_lines.append(line)
        preprocessed_script = "\n".join(preproc_lines)
    return preprocessed_script

preproc_scripts = list()
problem_nums = list()

for problem_folder in tqdm(problem_folders):
    scripts = os.listdir(os.path.join(code_folder, problem_folder))
    problem_num = scripts[0].split("_")[0]
    for script in scripts:
        script_file = os.path.join(code_folder, problem_folder, script)
        preprocessed_script = preprocess_script(script_file)
        
        preproc_scripts.append(preprocessed_script)
    problem_nums.extend([problem_num]*len(scripts))

100%|████████████████████████████████████████| 300/300 [00:02<00:00, 142.85it/s]


In [4]:
import pandas as pd
df = pd.DataFrame(data = {"code":preproc_scripts, "problem_num":problem_nums})

In [5]:
temp_dict = dict()
for code, prob_num in zip(df["code"], df["problem_num"]):
    if prob_num in temp_dict:
        temp_dict[prob_num] += 1
    else:
        temp_dict[prob_num] = 1

print(temp_dict)

{'problem092': 150, 'problem098': 150, 'problem266': 150, 'problem097': 150, 'problem290': 150, 'problem127': 150, 'problem220': 150, 'problem148': 151, 'problem133': 150, 'problem044': 150, 'problem226': 150, 'problem071': 150, 'problem094': 150, 'problem272': 150, 'problem115': 150, 'problem068': 152, 'problem212': 150, 'problem193': 151, 'problem073': 151, 'problem186': 150, 'problem210': 150, 'problem057': 150, 'problem161': 150, 'problem149': 151, 'problem227': 150, 'problem268': 150, 'problem244': 150, 'problem014': 151, 'problem105': 150, 'problem284': 150, 'problem024': 150, 'problem232': 150, 'problem246': 150, 'problem196': 150, 'problem039': 150, 'problem077': 150, 'problem124': 150, 'problem271': 150, 'problem179': 150, 'problem183': 150, 'problem125': 151, 'problem280': 150, 'problem020': 150, 'problem106': 151, 'problem275': 150, 'problem205': 151, 'problem103': 150, 'problem169': 150, 'problem209': 150, 'problem146': 150, 'problem060': 151, 'problem008': 150, 'problem116

In [6]:
len(df)

45101

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
df["tokens"] = df["code"].apply(tokenizer.tokenize)
df["len"] = df["tokens"].apply(len)
df.describe()

Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,len
count,45101.0
mean,160.123789
std,500.930345
min,5.0
25%,61.0
50%,108.0
75%,200.0
max,97566.0


In [8]:
ndf = df[df["len"] <= 512].reset_index(drop=True)
ndf.describe()

Unnamed: 0,len
count,43647.0
mean,137.920842
std,104.933475
min,5.0
25%,60.0
50%,104.0
75%,187.0
max,512.0


In [9]:
data_frame = pd.read_csv("./test.csv")

In [10]:
data_frame["code1_len"] = data_frame["code1"].apply(len)
data_frame["code1_len"].describe()

count    179700.000000
mean        392.408347
std         923.698933
min          20.000000
25%         153.000000
50%         255.000000
75%         489.000000
max      203699.000000
Name: code1_len, dtype: float64

In [11]:
data_frame["code2_len"] = data_frame["code2"].apply(len)
data_frame["code2_len"].describe()

count    179700.000000
mean        390.010367
std        1333.079216
min          15.000000
25%         146.000000
50%         254.000000
75%         477.000000
max      203669.000000
Name: code2_len, dtype: float64

In [12]:
from sklearn.model_selection import train_test_split

train_df, valid_df, train_label, valid_label = train_test_split(
    ndf,
    ndf["problem_num"],
    random_state=42,
    test_size=0.1,
    stratify=ndf["problem_num"],
)

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [13]:
from rank_bm25 import BM25Okapi
from itertools import combinations

#### Train negative pair 구성

In [57]:
codes = train_df["code"].to_list()
problems = train_df["problem_num"].unique().tolist()
problems.sort()

tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
bm25 = BM25Okapi(tokenized_corpus)

total_positive_pairs = list()
total_negative_pairs = list()

for problem in tqdm(problems):
    solution_codes = train_df[train_df["problem_num"] == problem]["code"]
    positive_pairs = list(combinations(solution_codes.to_list(), 2))
    
    solution_codes_indices = solution_codes.index.to_list()
    negative_pairs = list()
    
    first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
    negative_code_scores = bm25.get_scores(first_tokenized_code)
    negative_code_ranking = negative_code_scores.argsort()[::-1]
    ranking_idx = 0
    
    for solution_code in solution_codes:
        negative_solutions = list()
        while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
            high_score_idx = negative_code_ranking[ranking_idx]
            
            if high_score_idx not in solution_codes_indices:
                negative_solutions.append(train_df["code"].iloc[high_score_idx])
            ranking_idx += 1
        
        for negative_solution in negative_solutions:
            negative_pairs.append((solution_code, negative_solution))
    
    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

pos_code1 = list(map(lambda x : x[0], total_positive_pairs))
pos_code2 = list(map(lambda x : x[1], total_positive_pairs))

neg_code1 = list(map(lambda x : x[0], total_negative_pairs))
neg_code2 = list(map(lambda x : x[1], total_negative_pairs))

pos_label = [1] * len(pos_code1)
neg_label = [0] * len(neg_code1)

pos_code1.extend(neg_code1)
total_code1 = pos_code1
pos_code2.extend(neg_code2)
total_code2 = pos_code2
pos_label.extend(neg_label)
total_label = pos_label
pair_data = pd.DataFrame(data={
    "code1" : total_code1,
    "code2" : total_code2,
    "similar" : total_label
})
pair_data = pair_data.sample(frac=1).reset_index(drop=True)

pair_data.to_csv("./data/train_data.csv", index=False)

100%|█████████████████████████████████████████| 300/300 [14:45<00:00,  2.95s/it]


#### Validation negative pair 구성

In [58]:
codes = valid_df["code"].to_list()
problems = valid_df["problem_num"].unique().tolist()
problems.sort()

tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
bm25 = BM25Okapi(tokenized_corpus)

total_positive_pairs = list()
total_negative_pairs = list()

for problem in tqdm(problems):
    solution_codes = valid_df[valid_df["problem_num"] == problem]["code"]
    positive_pairs = list(combinations(solution_codes.to_list(), 2))
    
    solution_codes_indices = solution_codes.index.to_list()
    negative_pairs = list()
    
    first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
    negative_code_scores = bm25.get_scores(first_tokenized_code)
    negative_code_ranking = negative_code_scores.argsort()[::-1]
    ranking_idx = 0
    
    for solution_code in solution_codes:
        negative_solutions = list()
        while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
            high_score_idx = negative_code_ranking[ranking_idx]
            
            if high_score_idx not in solution_codes_indices:
                negative_solutions.append(valid_df["code"].iloc[high_score_idx])
            ranking_idx += 1
        
        for negative_solution in negative_solutions:
            negative_pairs.append((solution_code, negative_solution))
    
    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

pos_code1 = list(map(lambda x : x[0], total_positive_pairs))
pos_code2 = list(map(lambda x : x[1], total_positive_pairs))

neg_code1 = list(map(lambda x : x[0], total_negative_pairs))
neg_code2 = list(map(lambda x : x[1], total_negative_pairs))

pos_label = [1] * len(pos_code1)
neg_label = [0] * len(neg_code1)

pos_code1.extend(neg_code1)
total_code1 = pos_code1
pos_code2.extend(neg_code2)
total_code2 = pos_code2
pos_label.extend(neg_label)
total_label = pos_label
pair_data = pd.DataFrame(data={
    "code1" : total_code1,
    "code2" : total_code2,
    "similar" : total_label
})
pair_data = pair_data.sample(frac=1).reset_index(drop=True)

pair_data.to_csv("./data/valid_data.csv", index=False)

100%|█████████████████████████████████████████| 300/300 [00:55<00:00,  5.39it/s]


#### Train Part

In [62]:
import torch
from transformers import AutoModel, RobertaForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModel.from_pretrained("microsoft/graphcodebert-base")
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model.to(device)

loading configuration file https://huggingface.co/microsoft/graphcodebert-base/resolve/main/config.json from cache at /home/piai/.cache/huggingface/transformers/8edef9fb59cf1f2670191d673b13a719a79361a2ae12cc806f942649b8b90db8.62db6c94b05689b7cb238a1a38840e19d1014fc755a9e328ab74a6c672db2d3d
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weight

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [None]:
# dir(model)

In [63]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
import numpy as np
from datasets import load_dataset, load_metric

In [16]:
MODEL = "microsoft/graphcodebert-base"
INPUT = "./data/train_data.csv"
VAL_INPUT = "./data/valid_data.csv"
MAX_LEN = 512

dataset = load_dataset("csv", data_files=INPUT)['train']
val_dataset = load_dataset("csv", data_files=VAL_INPUT)["train"]
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN,truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

dataset = dataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])
val_dataset = val_dataset.map(example_fn, remove_columns=["code1", "code2", "similar"])
    
# model = RobertaForSequenceClassification.from_pretrained(MODEL) # RobertaForSequenceClassification 는 BertForSequenceClassification 와 달리 pooler가 없는게 기본이기 때문에 문장 유사도에 사용 가능.


Using custom data configuration default-ce35a410e25f9cbd
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-ce35a410e25f9cbd/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-cc9a8ee57a12244a
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-cc9a8ee57a12244a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5133767 [00:00<?, ?ex/s]

  0%|          | 0/59389 [00:00<?, ?ex/s]

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [212]:
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

def metric_fn(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.argmax(preds, axis=-1))
    return output

args = TrainingArguments(
    './runs/',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    do_train=True,
    do_eval=True,
    #save_strategy="epoch",
    save_strategy="steps",
    #logging_strategy="epoch",
    logging_strategy="steps",
    #evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    #metric_for_best_model= "f1",
    load_best_model_at_end=True,
)

trainer = Trainer(
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics= metric_fn,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [213]:
# gpu cashe clear
import gc
gc.collect()
torch.cuda.empty_cache()

trainer.train()

***** Running training *****
  Num examples = 5133767
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1925163


Step,Training Loss,Validation Loss,Accuracy
500,0.0405,0.861998,0.908367
1000,0.2728,0.401158,0.896631
1500,0.2131,0.296671,0.92369
2000,0.2003,0.245116,0.93253
2500,0.1975,0.259828,0.938659
3000,0.1689,0.271056,0.931267


***** Running Evaluation *****
  Num examples = 59389
  Batch size = 16
Saving model checkpoint to ./runs/checkpoint-500
Configuration saved in ./runs/checkpoint-500/config.json
Model weights saved in ./runs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./runs/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./runs/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 16
Saving model checkpoint to ./runs/checkpoint-1000
Configuration saved in ./runs/checkpoint-1000/config.json
Model weights saved in ./runs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./runs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./runs/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 16
Saving model checkpoint to ./runs/checkpoint-1500
Configuration saved in ./runs/checkpoint-1500/config.json
Model weights saved in ./runs/che

TrainOutput(global_step=3000, training_loss=0.18219283294677735, metrics={'train_runtime': 5153.5255, 'train_samples_per_second': 2988.498, 'train_steps_per_second': 373.562, 'total_flos': 6054395050455840.0, 'train_loss': 0.18219283294677735, 'epoch': 0.0})

In [214]:
import pandas as pd

TEST = "./data/test.csv"
SUB = "./data/sample_submission.csv"

test_dataset = load_dataset("csv", data_files=TEST)["train"]
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

predictions = trainer.predict(test_dataset)

df = pd.read_csv(SUB)
df["similar"] = np.argmax(predictions.predictions, axis=-1)
df.to_csv("./submissions/submission.csv", index=False)



Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-e82830c223845fb2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-e82830c223845fb2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 16
