In [1]:
import numpy as np
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_metric, load_dataset, load_from_disk
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer

In [2]:
# --- Configuration -----------------------------------------------------------
MODEL = "microsoft/graphcodebert-base"   # checkpoint the tokenizer is loaded from
TRAIN_INPUT = "./data/train_data.csv"
VALID_INPUT = "./data/valid_data.csv"
MAX_LEN = 512                            # max_length passed to the tokenizer in example_fn

# --- Raw data ----------------------------------------------------------------
# Each CSV is loaded on its own, so its rows live in the "train" split of the
# loaded dataset; request that split directly for both files.
train_dataset = load_dataset("csv", data_files=TRAIN_INPUT, split="train")
valid_dataset = load_dataset("csv", data_files=VALID_INPUT, split="train")

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Truncate from the left side of a sequence when an encoded pair is too long.
tokenizer.truncation_side = "left"

def example_fn(examples):
    """Tokenize ``code1``/``code2`` pairs and attach labels when available.

    Accepts either a single example (string fields) or a batch of examples
    (list fields, as passed by ``Dataset.map(..., batched=True)``).

    Padding is deliberately NOT applied here. The ``DataCollatorWithPadding``
    given to the ``Trainer`` pads each mini-batch dynamically, so padding at
    tokenization time is redundant: it is a no-op for a single example and
    would store over-padded rows when called on a batch.

    Args:
        examples: mapping with ``code1`` and ``code2`` entries and, optionally,
            a ``similar`` entry holding the label(s).

    Returns:
        The tokenizer output, with a ``labels`` entry copied from ``similar``
        when that column is present.
    """
    outputs = tokenizer(
        examples['code1'],
        examples['code2'],
        padding=False,        # dynamic padding is left to the data collator
        max_length=MAX_LEN,
        truncation=True,
    )
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs


# Tokenize in batches: the tokenizer processes many pairs per call, which is
# much faster than one call per row on a multi-million-row dataset.
_RAW_COLUMNS = ['code1', 'code2', 'similar']
train_dataset = train_dataset.map(example_fn, batched=True, remove_columns=_RAW_COLUMNS)
valid_dataset = valid_dataset.map(example_fn, batched=True, remove_columns=_RAW_COLUMNS)

Using custom data configuration default-42082c856f8fca10


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-42082c856f8fca10/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-42082c856f8fca10/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-8eafd8204a1f3d6a


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-8eafd8204a1f3d6a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-8eafd8204a1f3d6a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5133767 [00:00<?, ?ex/s]

  0%|          | 0/59389 [00:00<?, ?ex/s]

In [3]:
# Same checkpoint name as in the data-preparation cell (re-declared here).
MODEL = "microsoft/graphcodebert-base"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the checkpoint into a sequence-classification model and move it to the
# selected device. The load warning below reports that the classifier weights
# are newly initialized rather than taken from the checkpoint.
model = RobertaForSequenceClassification.from_pretrained(MODEL).to(device)

# The tokenizer is re-created with the same settings as in the earlier cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.truncation_side = 'left'

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier

In [4]:
# Padding collator built from the tokenizer; passed to the Trainer as data_collator.
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Metric object used by metric_fn: the "glue" metric loaded with its "sst2" configuration.
_metric = load_metric("glue", "sst2")

In [5]:
def metric_fn(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: a (predictions, labels) pair; the predictions are reduced to class
            ids by taking the argmax over the last axis.

    Returns:
        Whatever ``_metric.compute`` returns for those class ids and labels.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=-1)
    return _metric.compute(predictions=predicted_ids, references=gold)

In [6]:
args = TrainingArguments(
    # Output location for checkpoints.
    output_dir='./models/',

    # What to run.
    do_train=True,
    do_eval=True,

    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=1e-5,
    optim='adamw_torch',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # 4 samples x 4 accumulation steps per device

    # Evaluation, logging and checkpointing are all step-based.
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_strategy="steps",
    save_strategy="steps",
    save_total_limit=5,

    # Reload the best checkpoint when training finishes.
    # metric_for_best_model is left unset (an earlier "f1" setting was disabled).
    load_best_model_at_end=True,

    # Keep progress bars enabled.
    disable_tqdm=False,
)

# Wire the model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    # Data.
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # Batching / tokenization helpers.
    tokenizer=tokenizer,
    data_collator=_collator,
    # Evaluation metric and early stopping (patience of 10).
    compute_metrics=metric_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

In [7]:
import gc
import os

from knockknock import discord_sender

# The Discord webhook URL is a credential: anyone who has it can post to the
# channel. It must not live in the notebook, so it is read from the
# DISCORD_WEBHOOK_URL environment variable instead.
# NOTE: the URL that used to be hardcoded in this cell should be treated as
# leaked and regenerated in Discord.
webhook_url = os.environ.get("DISCORD_WEBHOOK_URL")


def do_train():
    """Release reclaimable Python/CUDA memory, then run ``trainer.train()``."""
    gc.collect()
    torch.cuda.empty_cache()
    trainer.train()


if webhook_url:
    # Wrap the training entry point so knockknock sends Discord notifications.
    do_train = discord_sender(webhook_url=webhook_url)(do_train)
else:
    # No webhook configured: still train, just without notifications.
    print("DISCORD_WEBHOOK_URL is not set; training will run without Discord notifications.")

In [8]:
# Kick off training through do_train(), which clears memory and calls trainer.train().
do_train()

***** Running training *****
  Num examples = 5133767
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 481290


Step,Training Loss,Validation Loss,Accuracy
500,0.3684,0.313474,0.881409
1000,0.1801,0.275722,0.905471
1500,0.1533,0.184461,0.934011
2000,0.1229,0.199417,0.930728
2500,0.1136,0.309453,0.917476
3000,0.1076,0.155413,0.951843
3500,0.0953,0.18674,0.947061
4000,0.0876,0.146518,0.956187
4500,0.0951,0.140978,0.961138
5000,0.0854,0.141808,0.960346


***** Running Evaluation *****
  Num examples = 59389
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-500
Configuration saved in ./models/checkpoint-500/config.json
Model weights saved in ./models/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-1000
Configuration saved in ./models/checkpoint-1000/config.json
Model weights saved in ./models/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-1500
Configuration saved in ./models/checkpoint-1500/config.json
Model wei

In [9]:
# NOTE(review): MAX_LEN and example_fn are also defined in the data-preparation
# cell; this later definition is the one in effect when the test set is mapped.
# Consider keeping a single definition.
MAX_LEN = 512


def example_fn(examples):
    """Tokenize ``code1``/``code2`` pairs and attach labels when available.

    Padding is deliberately NOT applied here. The ``DataCollatorWithPadding``
    used by the ``Trainer`` pads each mini-batch dynamically, so padding at
    tokenization time is redundant: it is a no-op for a single example and
    would store over-padded rows if this function were called on a batch.

    Args:
        examples: mapping with ``code1`` and ``code2`` entries and, optionally,
            a ``similar`` entry holding the label(s).

    Returns:
        The tokenizer output, with a ``labels`` entry copied from ``similar``
        when that column is present (the test data is mapped without it).
    """
    outputs = tokenizer(
        examples['code1'],
        examples['code2'],
        padding=False,        # dynamic padding is left to the data collator
        max_length=MAX_LEN,
        truncation=True,
    )
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

In [10]:
import gc
import os

import pandas as pd

# Release reclaimable Python/CUDA memory before running inference.
gc.collect()
torch.cuda.empty_cache()

TEST = "./data/test.csv"
SUB = "./data/sample_submission.csv"
OUT = "./submissions/submission.csv"

# Create the output directory up front so a missing folder cannot make the
# final write fail after the (long) prediction run has already finished.
os.makedirs(os.path.dirname(OUT), exist_ok=True)

# Load and tokenize the test pairs; the raw text columns are dropped afterwards.
test_dataset = load_dataset("csv", data_files=TEST)["train"]
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

predictions = trainer.predict(test_dataset)

# Fill the sample submission with the predicted class ids.
# NOTE(review): values are assigned by position, which assumes the rows of
# sample_submission.csv are in the same order as test.csv -- confirm.
df = pd.read_csv(SUB)
if len(df) != len(predictions.predictions):
    raise ValueError(
        f"Row count mismatch: {SUB} has {len(df)} rows but "
        f"{len(predictions.predictions)} predictions were produced."
    )
df["similar"] = np.argmax(predictions.predictions, axis=-1)
df.to_csv(OUT, index=False)

Using custom data configuration default-034bfdc908dab38a
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-034bfdc908dab38a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-034bfdc908dab38a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-32e57adf0ac1f496.arrow
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 64


In [11]:
# Final cleanup: run Python's garbage collector, then ask PyTorch to release
# its cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()