In [1]:
import numpy as np
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_metric, load_dataset, load_from_disk
import torch
from transformers import RobertaForSequenceClassification, AutoTokenizer

In [2]:
# Load the training and validation splits that were previously saved to disk.
# The generator yields them in order, so the unpacking targets line up with
# the paths listed below.
train_dataset, valid_dataset = (
    load_from_disk(split_dir)
    for split_dir in ('./data/train_dataset_lv2', './data/valid_dataset_lv2')
)

In [3]:
# Hub identifier of the base model; in this cell it is only used to fetch the tokenizer.
MODEL = "microsoft/graphcodebert-base"

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The classifier weights come from a local checkpoint directory, not from MODEL.
# `Module.to` returns the module itself, so the move can be chained onto the load.
model = RobertaForSequenceClassification.from_pretrained("./models/checkpoint-8000").to(device)

# Over-long inputs are truncated from the left-hand side.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.truncation_side = 'left'

In [4]:
# Metric object consumed by `metric_fn` (GLUE benchmark, SST-2 configuration).
_metric = load_metric(path="glue", config_name="sst2")

# Batch collator handed to the Trainer; it pads using the tokenizer loaded above.
_collator = DataCollatorWithPadding(tokenizer)

In [5]:
def metric_fn(p):
    """Score one evaluation pass with the module-level `_metric`.

    Args:
        p: An object that unpacks into ``(predictions, labels)``.

    Returns:
        Whatever ``_metric.compute`` returns for the predicted class ids
        (argmax over the last axis of the predictions) against the labels.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=-1)
    return _metric.compute(predictions=predicted_ids, references=gold)

In [6]:
# Settings for the fine-tuning run, grouped by concern.
args = TrainingArguments(
    # --- checkpoints ---
    output_dir='./models/',
    save_strategy="steps",
    save_total_limit=5,
    load_best_model_at_end=True,
    # `metric_for_best_model` is left unset (an earlier "f1" setting was disabled).
    # NOTE(review): the training log only reports accuracy, so "f1" would not be
    # available from the current metric; confirm which metric should pick the best model.
    # --- optimisation ---
    do_train=True,
    num_train_epochs=3,
    learning_rate=1e-5,
    optim='adamw_torch',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # --- evaluation ---
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=32,
    # --- logging ---
    logging_strategy="steps",
    disable_tqdm=False,
)

# Wire the model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=_collator,
    compute_metrics=metric_fn,
    # Early stopping with a patience of 10 evaluation rounds.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

In [7]:
import gc
import os
from knockknock import discord_sender

# A Discord webhook URL is a credential: anyone who has it can post to the
# channel. Read it from the environment instead of storing it in the notebook.
# Set DISCORD_WEBHOOK_URL before starting the kernel to receive notifications.
# NOTE(review): a URL that was ever committed in a notebook should be treated
# as leaked and regenerated in Discord.
webhook_url = os.environ.get("DISCORD_WEBHOOK_URL")


def do_train():
    """Free cached memory, then run `trainer.train()`.

    Garbage collection and `torch.cuda.empty_cache()` are called first so the
    run starts with as much free memory as possible.
    """
    gc.collect()
    torch.cuda.empty_cache()
    trainer.train()


# Attach the knockknock Discord notifier only when a webhook is configured;
# without one, training still runs, just without notifications.
if webhook_url:
    do_train = discord_sender(webhook_url=webhook_url)(do_train)

In [8]:
# Run the training entry point defined in the previous cell.
do_train()

***** Running training *****
  Num examples = 90000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 8436


Step,Training Loss,Validation Loss,Accuracy
500,0.4743,2.927912,0.504889
1000,0.0869,4.304398,0.517667
1500,0.0509,4.695354,0.509889
2000,0.0371,4.95265,0.492667
2500,0.0276,4.749534,0.496556
3000,0.0237,4.936773,0.495111
3500,0.0171,5.043801,0.503667
4000,0.0208,5.294828,0.497444
4500,0.0155,5.073262,0.504333
5000,0.0152,5.143164,0.498667


***** Running Evaluation *****
  Num examples = 9000
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-500
Configuration saved in ./models/checkpoint-500/config.json
Model weights saved in ./models/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9000
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-1000
Configuration saved in ./models/checkpoint-1000/config.json
Model weights saved in ./models/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./models/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 9000
  Batch size = 64
Saving model checkpoint to ./models/checkpoint-1500
Configuration saved in ./models/checkpoint-1500/config.json
Model weight

In [9]:
# Upper bound on the tokenized length of a code pair.
MAX_LEN = 512


def example_fn(examples):
    """Tokenize a ``code1`` / ``code2`` pair with the module-level tokenizer.

    The pair is encoded together, truncated to at most ``MAX_LEN`` tokens and
    padded. If the input has a ``similar`` field, it is copied into the result
    under ``labels``.

    Args:
        examples: Mapping with ``code1`` and ``code2`` entries, and optionally
            ``similar``.

    Returns:
        The tokenizer output, with ``labels`` added when ``similar`` is present.
    """
    encoded = tokenizer(
        examples['code1'],
        examples['code2'],
        truncation=True,
        max_length=MAX_LEN,
        padding=True,
    )
    if 'similar' not in examples:
        return encoded
    encoded["labels"] = examples["similar"]
    return encoded

In [10]:
import gc
import os
gc.collect()
torch.cuda.empty_cache()

import pandas as pd

TEST = "./data/test.csv"
SUB = "./data/sample_submission.csv"
SUBMISSION_OUT = "./submissions/submission_lv2.csv"

# Create the output directory before the long prediction pass, so a missing
# folder cannot make the final `to_csv` fail after all the work is done.
os.makedirs(os.path.dirname(SUBMISSION_OUT), exist_ok=True)

# Tokenize the test pairs; the raw code columns are dropped once encoded.
test_dataset = load_dataset("csv", data_files=TEST)["train"]
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

# Read the submission template first and check its row count now: a mismatch
# would otherwise only show up when assigning the predictions below.
df = pd.read_csv(SUB)
if len(df) != len(test_dataset):
    raise ValueError(
        f"{SUB} has {len(df)} rows but {TEST} has {len(test_dataset)} examples"
    )

predictions = trainer.predict(test_dataset)

# The predicted class is the argmax over the last axis of the model outputs.
df["similar"] = np.argmax(predictions.predictions, axis=-1)
df.to_csv(SUBMISSION_OUT, index=False)

Using custom data configuration default-034bfdc908dab38a
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-034bfdc908dab38a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-034bfdc908dab38a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-32e57adf0ac1f496.arrow
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 64
