In [19]:
from google.colab import drive
drive.mount("/gdrive")
%cd /gdrive/MyDrive/dacon/dacon_code_similarity

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/dacon/dacon_code_similarity


In [None]:
#!unzip data/open.zip -d data/
!pip install transformers datasets

In [4]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

MODEL = "klue/roberta-base"
INPUT = "data/sample_train.csv"
MAX_LEN = 256
dataset = load_dataset("csv", data_files=INPUT)['train']
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN,truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

dataset = dataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])
dataset = dataset.train_test_split(0.1)

_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

def metric_fn(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.argmax(preds, axis=-1))
    return output
    
model = RobertaForSequenceClassification.from_pretrained(MODEL) # RobertaForSequenceClassification 는 BertForSequenceClassification 와 달리 pooler가 없는게 기본이기 때문에 문장 유사도에 사용 가능.

args = TrainingArguments(
    'runs/',
    per_device_train_batch_size=32,
    num_train_epochs=10,
    do_train=True,
    do_eval=True,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
)

trainer = Trainer(
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=metric_fn)

trainer.train()

Using custom data configuration default-45f4fb0b2a7ce81d
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-45f4fb0b2a7ce81d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-45f4fb0b2a7ce81d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-45cb500a9aa956ac.arrow
Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3944,0.279758,0.90039
2,0.1907,0.256358,0.915971
3,0.1125,0.119039,0.962159
4,0.0721,0.17072,0.963272
5,0.0413,0.119727,0.966055
6,0.0256,0.18196,0.966611
7,0.0176,0.124562,0.973845
8,0.0104,0.100349,0.979967
9,0.0051,0.112896,0.982749
10,0.0026,0.116165,0.982749


***** Running Evaluation *****
  Num examples = 1797
  Batch size = 8
Saving model checkpoint to runs/checkpoint-506
Configuration saved in runs/checkpoint-506/config.json
Model weights saved in runs/checkpoint-506/pytorch_model.bin
tokenizer config file saved in runs/checkpoint-506/tokenizer_config.json
Special tokens file saved in runs/checkpoint-506/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1797
  Batch size = 8
Saving model checkpoint to runs/checkpoint-1012
Configuration saved in runs/checkpoint-1012/config.json
Model weights saved in runs/checkpoint-1012/pytorch_model.bin
tokenizer config file saved in runs/checkpoint-1012/tokenizer_config.json
Special tokens file saved in runs/checkpoint-1012/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1797
  Batch size = 8
Saving model checkpoint to runs/checkpoint-1518
Configuration saved in runs/checkpoint-1518/config.json
Model weights saved in runs/checkpoint-1518/pytorch_model.bin
t

TrainOutput(global_step=5060, training_loss=0.08721778126573373, metrics={'train_runtime': 2424.8582, 'train_samples_per_second': 66.697, 'train_steps_per_second': 2.087, 'total_flos': 2.12764754916864e+16, 'train_loss': 0.08721778126573373, 'epoch': 10.0})

In [13]:
import pandas as pd

TEST = "data/test.csv"
SUB = "data/sample_submission.csv"

test_dataset = load_dataset("csv", data_files=TEST)['train']
test_dataset = test_dataset.map(example_fn, remove_columns=['code1', 'code2'])

predictions = trainer.predict(test_dataset)

df = pd.read_csv(SUB)
df['similar'] = np.argmax(predictions.predictions, axis=-1)
df.to_csv('./submissions/submission.csv', index=False)

Using custom data configuration default-119b339a9ccf4b98
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-119b339a9ccf4b98/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['pair_id', 'code1', 'code2'],
    num_rows: 179700
})


  0%|          | 0/179700 [00:00<?, ?ex/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 8
