In [3]:
!pip install sentence-transformers==2.7.0 datasets==2.19.0 faiss-cpu==1.8.0 -qqq

In [4]:
from sentence_transformers import SentenceTransformer, models
# Backbone encoder loaded from the pretrained KLUE RoBERTa checkpoint.
transformer_model = models.Transformer('klue/roberta-base')

# Mean pooling over token embeddings; its input width must match the encoder's
# word-embedding dimension, so it is read from the encoder itself.
pooling_layer = models.Pooling(
    word_embedding_dimension=transformer_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)

# Chain encoder -> pooling into a single sentence-embedding model.
embedding_model = SentenceTransformer(
    modules=[transformer_model, pooling_layer],
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from datasets import load_dataset
# Load the KLUE STS 'train' split and its 'validation' split; the latter is
# kept aside under the name klue_sts_test.
klue_sts_train, klue_sts_test = (
    load_dataset('klue', 'sts', split=split_name)
    for split_name in ('train', 'validation')
)

# Show the first training record to inspect the schema.
klue_sts_train[0]

Downloading readme: 22.5kB [00:00, 11.0MB/s]
Downloading data: 100%|██████████| 1.52M/1.52M [00:00<00:00, 2.37MB/s]
Downloading data: 100%|██████████| 68.8k/68.8k [00:00<00:00, 154kB/s]
Generating train split: 100%|██████████| 11668/11668 [00:00<00:00, 524900.94 examples/s]
Generating validation split: 100%|██████████| 519/519 [00:00<00:00, 187158.78 examples/s]


{'guid': 'klue-sts-v1_train_00000',
 'source': 'airbnb-rtt',
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1}}

In [6]:
# Hold out 10% of the training data as a validation set (seeded split).
# NOTE: klue_sts_train is rebound to the remaining 90% below, so re-running
# this cell without reloading the data would split the already-reduced set again.
klue_sts_split = klue_sts_train.train_test_split(test_size=0.1, seed=42)
klue_sts_train = klue_sts_split['train']
klue_sts_eval = klue_sts_split['test']

In [7]:
from sentence_transformers import InputExample

def prepare_sts_examples(dataset, max_score: float = 5.0):
    """Wrap STS records in ``InputExample`` objects with rescaled labels.

    The raw similarity score of every record is divided by ``max_score`` so
    that, for scores on a 0..``max_score`` scale, labels end up between 0 and 1.

    Args:
        dataset: Iterable of records. Each record must provide ``'sentence1'``,
            ``'sentence2'`` and a nested ``['labels']['label']`` numeric score.
        max_score: Divisor applied to each raw score. Defaults to 5.0, the
            value this notebook previously hard-coded.

    Returns:
        list: One ``InputExample`` per record, in input order, with
        ``texts=[sentence1, sentence2]`` and ``label=score / max_score``.
        An empty ``dataset`` yields an empty list.
    """
    return [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=record['labels']['label'] / max_score,
        )
        for record in dataset
    ]

In [8]:
# Convert the train / validation / test splits into InputExample lists.
train_examples, eval_examples, test_examples = map(
    prepare_sts_examples,
    (klue_sts_train, klue_sts_eval, klue_sts_test),
)

In [9]:
from torch.utils.data import DataLoader
# Serve the training examples shuffled, in mini-batches of 16.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

In [10]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# One similarity evaluator per held-out set: the validation examples and the
# test examples, built the same way.
eval_evaluator, test_evaluator = (
    EmbeddingSimilarityEvaluator.from_input_examples(examples)
    for examples in (eval_examples, test_examples)
)

In [11]:
# Baseline: score the model on the test examples before any fine-tuning is run,
# to have a reference point for the trained model.
test_evaluator(embedding_model)

np.float64(0.36460670798564826)

In [12]:
# 유사한 문장 데이터로 임베딩 모델 학습하기

from sentence_transformers import losses

num_epochs = 4
model_name = 'klue/roberta-base'
# Output directory for the trained model; the '/' in the model id is replaced
# with '-' so the id becomes a single path component.
model_save_path = f"output/training_sts_{model_name.replace('/', '-')}"

# Loss computed from the cosine similarity of each sentence pair's embeddings.
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Train the embedding model.
# NOTE(review): the progress output shows 657 iterations per epoch, so an
# evaluation interval of 1000 steps may never be reached inside an epoch —
# confirm this is intended.
embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100,
    evaluator=eval_evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
)

Iteration:   0%|          | 0/657 [00:03<?, ?it/s]
Epoch:   0%|          | 0/4 [00:03<?, ?it/s]


KeyboardInterrupt: 