In [1]:
# Show which GPU (if any) is attached to this runtime, plus driver/CUDA versions and memory use.
!nvidia-smi

Thu Jun  9 14:50:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
# Mount Google Drive at /content/drive; %%capture hides the mount output.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
%%capture
!pip install transformers datasets 

---

In [4]:
import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment',  None)
import gc
import torch
from datasets import Dataset, list_metrics, load_from_disk, load_metric, concatenate_datasets
from transformers import (
    AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, RobertaForSequenceClassification, 
    AdamW, get_cosine_schedule_with_warmup
    )


In [5]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hub id of the pretrained checkpoint; the tokenizer here and the classifier later are both loaded from it.
MODEL = "microsoft/codebert-base"
# NOTE(review): `do_lowercase` looks like a misspelling of the BERT-style `do_lower_case`
# option; presumably it has no effect on this tokenizer — confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained(MODEL, do_lowercase=True)

# Switch the working directory to the Drive data folder: every later cell uses
# relative paths (train.pkl, tokenized_*/, ckpt/, submission.csv).
%cd /content/drive/MyDrive/Dacon/Code_Similarity/data
%ls

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

/content/drive/MyDrive/Dacon/Code_Similarity/data
[0m[01;34mckpt[0m/                  submission.csv  [01;34mtokenized_test[0m/   train.pkl
code.zip               test.csv        [01;34mtokenized_train[0m/  val.pkl
sample_submission.csv  test.pkl        [01;34mtokenized_val[0m/


In [6]:
# Pad and truncate on the left-hand side of each sequence.
tokenizer.padding_side = tokenizer.truncation_side = 'left'

In [7]:
# Load the pre-built train/validation/test pair frames (relative to the data folder).
train, val, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train.pkl', 'val.pkl', 'test.pkl')
)
# Submission template that the predictions are written into at the end.
sample_submission = pd.read_csv('sample_submission.csv')

In [None]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,code1,code2,label
0,i = 1\nwhile 1:\n\tx = int(raw_input())\n\tif ...,"n = int(input())\ndic = {}\nfor i in range(1,n...",0
1,"a,b = map(int,raw_input().split())\nprint a/b\...","n,k = map(int,input().split())\na = list(map(i...",0
2,s=input()\nk=int(input())\ns_double = s*2\nn =...,s = list(input())\nk = int(input())\nn=len(s)...,1


In [None]:
# Preview the first three rows of the validation frame.
val.head(3)

Unnamed: 0,code1,code2,label
0,"n,k=map(int,input().split())\nr,s,p=map(int,in...","n, a, b = input().split(' ')\nn = int(n)\na = ...",0
1,"n = int(input())\nnumbers = list(map(int, inpu...","n ,a, b = map(int,input().split())\nmod = pow(...",0
2,"nml = input().split()\nn, m, l = map(int, nml)...","n,m,l=map(int,input().split())\na=[]\nb=[]\nfo...",1


In [None]:
# Preview the first three rows of the test frame.
test.head(3)

Unnamed: 0,code1,code2
0,def main():\n s = input()\n if s.count('a') ...,"n,k = map(int,input().split())\na = list(map(i..."
1,"n,k,q = map(int,input().split())\npoints = [0]...","n, k, q = map(int,input().split())\nif k > q:\..."
2,n = int(input())\nlst = [(i+1) for i in range(...,s = input()\nt = input()\nlength_s = len(s)\nl...


In [None]:
# Preview the first three rows of the submission template.
sample_submission.head(3)

Unnamed: 0,pair_id,similar
0,1,-1
1,2,-1
2,3,-1


---

In [8]:
MAX_LEN = 512  # maximum number of tokens kept per (code1, code2) pair


def tokenizing(data):
    """Tokenize one code pair and carry its label along when present.

    Args:
        data: mapping with the keys 'code1' and 'code2', and optionally 'label'.

    Returns:
        The tokenizer output for the pair, truncated to ``MAX_LEN`` tokens.
        When ``data`` has a 'label' entry it is copied into the output under
        the key "labels".
    """
    outputs = tokenizer(data['code1'], data['code2'], padding=True, max_length=MAX_LEN, truncation=True)

    # The guard must test the key that is actually read below. The previous
    # check looked for 'similar', so this branch was dead code.
    if 'label' in data:
        outputs["labels"] = data["label"]

    return outputs

In [None]:
# Wrap each frame in a Dataset and tokenize it, dropping the raw source columns.
ds_train, ds_val, ds_test = (
    Dataset.from_pandas(frame).map(tokenizing, remove_columns=['code1', 'code2'])
    for frame in (train, val, test)
)

# Cache the tokenized splits on disk so later sessions can reload them.
for split, target_dir in (
    (ds_train, "tokenized_train"),
    (ds_val, "tokenized_val"),
    (ds_test, "tokenized_test"),
):
    split.save_to_disk(target_dir)

  0%|          | 0/150000 [00:00<?, ?ex/s]

  0%|          | 0/30000 [00:00<?, ?ex/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

---

In [9]:
# Reload the tokenized splits that were cached on disk.
ds_train, ds_val, ds_test = map(
    load_from_disk,
    ("tokenized_train", "tokenized_val", "tokenized_test"),
)

In [10]:
# Dropout overrides applied on top of the pretrained checkpoint's configuration.
dropout_overrides = dict(
    attention_probs_dropout_prob=0.12,
    hidden_dropout_prob=0.12,
    classifier_dropout=0.05,
)

# Two-class sequence classifier initialised from the MODEL checkpoint.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,
    **dropout_overrides,
)

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

In [11]:
# Collator built around the shared tokenizer; handed to the Trainer below.
_collator = DataCollatorWithPadding(tokenizer)
# Accuracy metric object used by METRIC.
_metric = load_metric("accuracy")

def METRIC(p):
    """Score an evaluation pass with the module-level ``_metric``.

    Args:
        p: a pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``_metric.compute`` returns for the arg-max class of each
        logits row compared against ``labels``.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=-1)
    return _metric.compute(predictions=predicted, references=gold)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [12]:
# ---- Training hyper-parameters ----
total_samples = len(ds_train)
batch_size = 16
n_epochs = 6

# Optimizer steps per epoch, rounded UP. The Trainer keeps the final partial
# batch by default, so floor division would make the LR schedule shorter than
# the real run whenever total_samples is not a multiple of batch_size.
# Assumes a single device and no gradient accumulation, as configured below.
steps_per_epoch = -(-total_samples // batch_size)
num_warmup_steps = steps_per_epoch * 2          # warm up over the first two epochs
num_total_steps = steps_per_epoch * n_epochs

# ---- Optimizer and cosine schedule with linear warm-up ----
opt = AdamW(model.parameters(), lr=5e-5, weight_decay=0.1, no_deprecation_warning=True)
sch = get_cosine_schedule_with_warmup(optimizer=opt,
                                      num_warmup_steps=num_warmup_steps,
                                      num_training_steps=num_total_steps)

# ---- Trainer configuration: log, evaluate and checkpoint once per epoch ----
args = TrainingArguments(
    'ckpt/',
    do_eval=True,
    do_train=True,
    load_best_model_at_end = True,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = n_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    label_smoothing_factor=0.1
)

# The custom optimizer/scheduler pair is passed in explicitly via `optimizers`.
Trained_Model = Trainer(
        model=model,
        args=args,
        data_collator=_collator,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tokenizer,
        optimizers=(opt, sch),
        compute_metrics=METRIC,
        )

In [13]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start training from scratch. Passing `resume_from_checkpoint=True`
# unconditionally fails on a fresh run because there is nothing to resume from.
import os
from transformers.trainer_utils import get_last_checkpoint

_ckpt_dir = Trained_Model.args.output_dir
# get_last_checkpoint lists the directory, so only call it when the directory exists.
_last_ckpt = get_last_checkpoint(_ckpt_dir) if os.path.isdir(_ckpt_dir) else None
Trained_Model.train(resume_from_checkpoint=_last_ckpt)

***** Running training *****
  Num examples = 150000
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 56250


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3738,0.258226,0.969033
2,0.2606,0.268831,0.9696
3,0.2551,0.235905,0.981267
4,0.2323,0.224476,0.987667
5,0.2134,0.216521,0.991067
6,0.2043,0.213595,0.993367


***** Running Evaluation *****
  Num examples = 30000
  Batch size = 16
Saving model checkpoint to ckpt/checkpoint-9375
Configuration saved in ckpt/checkpoint-9375/config.json
Model weights saved in ckpt/checkpoint-9375/pytorch_model.bin
tokenizer config file saved in ckpt/checkpoint-9375/tokenizer_config.json
Special tokens file saved in ckpt/checkpoint-9375/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 16
Saving model checkpoint to ckpt/checkpoint-18750
Configuration saved in ckpt/checkpoint-18750/config.json
Model weights saved in ckpt/checkpoint-18750/pytorch_model.bin
tokenizer config file saved in ckpt/checkpoint-18750/tokenizer_config.json
Special tokens file saved in ckpt/checkpoint-18750/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 16
Saving model checkpoint to ckpt/checkpoint-28125
Configuration saved in ckpt/checkpoint-28125/config.json
Model weights saved in ckpt/checkpoint-28125/

TrainOutput(global_step=56250, training_loss=0.2565738888888889, metrics={'train_runtime': 49164.2459, 'train_samples_per_second': 18.306, 'train_steps_per_second': 1.144, 'total_flos': 2.344916025353424e+17, 'train_loss': 0.2565738888888889, 'epoch': 6.0})

In [14]:
# Run the trained model over the tokenized test split.
predictions = Trained_Model.predict(test_dataset=ds_test)

***** Running Prediction *****
  Num examples = 179700
  Batch size = 16


In [15]:
# Take the highest-scoring class per pair, store it in the 'similar' column,
# and write the submission file without the index.
predicted_labels = np.argmax(predictions.predictions, axis=-1)
sample_submission['similar'] = predicted_labels
sample_submission.to_csv('submission.csv', index=False)