<a href="https://colab.research.google.com/github/iljf/NLU_project_team1/blob/main/KLUE_roberta_base_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KLUE/RoBERTa-base Baseline Model

In [None]:
# Install the two third-party packages imported below (sentence-transformers, datasets).
!pip install sentence-transformers datasets

In [None]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [None]:
# Emit INFO-and-above log records, prefixed with a timestamp, through
# sentence-transformers' LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [None]:
# Name of the pretrained checkpoint that is passed to models.Transformer below.
model_name = "klue/roberta-base"

In [None]:
# Training hyperparameters.
train_batch_size, num_epochs = 32, 4

# Output directory: the model name (with "/" replaced by "-") followed by the
# current timestamp, so each run gets its own path.
model_save_path = "-".join(
    [
        "output/training_klue_sts_" + model_name.replace("/", "-"),
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    ]
)

In [None]:
# Wrap the pretrained checkpoint as a sentence-transformers Transformer module.
embedding_model = models.Transformer(model_name)

In [None]:
# Pooling layer sized to the encoder's word-embedding dimension, with only the
# mean-over-tokens mode enabled (CLS-token and max-token modes are switched off).
pooler = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

In [None]:
# Chain the transformer encoder and the pooling layer into one SentenceTransformer.
model = SentenceTransformer(modules=[embedding_model, pooler])

In [None]:
# Load the "sts" configuration of the "klue" dataset; its "train" and
# "validation" splits are consumed when the samples are built below.
datasets = load_dataset("klue", "sts")

In [None]:
# Show which splits were loaded.
datasets.keys()

In [None]:
# Peek at the first training record to see its fields.
datasets["train"][0]

In [None]:
# Load the "sts" configuration of the "kor_nlu" dataset; only its "test" split
# is used, as the final evaluation set.
testsets = load_dataset("kor_nlu", "sts")

In [None]:
# Show which splits were loaded.
testsets.keys()

In [None]:
# Peek at the first test record to see its fields.
testsets["test"][0]

In [None]:
# Convert the raw dataset records into sentence-transformers InputExample
# objects: KLUE "train" -> train_samples, KLUE "validation" -> dev_samples,
# KorNLU "test" -> test_samples.
train_samples = []
dev_samples = []
test_samples = []

for phase in ["train", "validation"]:
    examples = datasets[phase]

    for example in examples:
        # Rescale the gold similarity score by 1/5 (presumably from a 0-5
        # scale to [0, 1] -- confirm against the dataset card).
        score = float(example["labels"]["label"]) / 5.0

        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )

        if phase == "validation":
            dev_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

for example in testsets["test"]:
    # Skip records where either sentence is missing or empty. The append must
    # only happen for valid records: appending unconditionally would re-add
    # the previously built InputExample (a duplicate, or for the very first
    # record a leftover validation example) and distort the test metrics.
    if not (example["sentence1"] and example["sentence2"]):
        continue

    score = float(example["score"]) / 5.0

    inp_example = InputExample(
        texts=[example["sentence1"], example["sentence2"]],
        label=score,
    )
    test_samples.append(inp_example)

In [None]:
# Sanity check: sentence pair and rescaled label of the first training sample.
train_samples[0].texts, train_samples[0].label

In [None]:
# Sanity check: sentence pair and rescaled label of the first test sample.
test_samples[0].texts, test_samples[0].label

In [None]:
# Shuffled training batches. The batch size comes from the train_batch_size
# hyperparameter so that changing it in the config cell actually takes effect
# (it was previously hard-coded to 32 here).
train_dataloader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=train_batch_size,
)
# Training objective: cosine-similarity loss computed with the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Evaluator built from the validation samples; passed to model.fit() as the
# in-training evaluator, under the name "sts-dev".
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")

In [None]:
# Total optimisation steps = batches per epoch * number of epochs; the first
# 30% of them are used for learning-rate warm-up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.3)  # 30% of all training steps for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

In [None]:
# Fine-tune the model on the training objective, running the dev evaluator
# every 1000 steps and writing the result to model_save_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
)

In [None]:
# Reload the model from the training output directory, then build the
# evaluator for the held-out test samples.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name="sts-test",
)

In [None]:
# Score the reloaded model on the test samples; output_path is passed so the
# evaluator can write its results next to the saved model.
test_evaluator(model, output_path=model_save_path)