In [11]:
#======================================================================================================
# Sentence-BERT training and evaluation example
# => Implemented with the sentence-transformers package (install it first: pip install -U sentence-transformers)
#
# 도큐먼트 : https://www.sbert.net/index.html
# 소스참고 : https://github.com/BM-K/KoSentenceBERT-ETRI

# pip install -U sentence-transformers
#======================================================================================================

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import sys
import os
import gzip
import csv

from myutils import seed_everything, GPU_info, mlogging

# Create the shared logger via the project helper. The cell output shows a
# "logfilepath:..." line, presumably printed by this call — confirm in myutils.
logger = mlogging(loggername="s-bert", logfilname="s-bert")
# Ask the project helper for the compute device; the cell output reports "device: cuda:0".
device = GPU_info()
# Fix the random seed to 111 — presumably seeds every RNG the run uses; verify in myutils.
seed_everything(111)

logfilepath:s-bert_2022-03-03.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [37]:
# --- Training hyper-parameters ---
num_epochs = 1
train_batch_size = 16

# --- Data files (TSV) ---
train_file = "korpora/kornli/snli_1.0_train.ko.tsv"
eval_file = "korpora/korsts/tune_dev.tsv"
test_file = "korpora/korsts/tune_test.tsv"

# --- Model locations ---
# Pre-trained BERT checkpoint to start from, and the directory the trained S-BERT is written to.
model_name = "model/bmc_fpt_kowiki20200920.train_model_0225"
model_save_path = "model/bmc_fpt_kowiki20200920.train_model_0225-s-bert-nli-0303"
# Alternative (disabled): derive a time-stamped output directory instead of the fixed one above.
#model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Load the transformer together with its tokenizer.
# => NOTE: the vocabulary files (vocab.txt, *.json) and the model files
#    (config.json, pytorch_model.bin) must live in the same directory.
word_embedding_model = models.Transformer(model_name, max_seq_length=128)

Some weights of the model checkpoint at model/bmc_fpt_kowiki20200920.train_model_0225 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at model/bmc_fpt_kowiki20200920.train_model_0225 and are newly initialized: ['bert

In [18]:
# Bare expression: displays the transformer module's repr as the cell output for a quick check.
word_embedding_model

Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 

In [19]:
# Step 2: choose how token embeddings are pooled into one fixed-size sentence vector.
# Three strategies are available: the CLS token, the mean of the token embeddings,
# or the element-wise max of the token embeddings. Mean pooling is used here.
embedding_dim = word_embedding_model.get_word_embedding_dimension()  # model dimension (768 for this checkpoint)
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_cls_token=False,   # do not use the CLS token
    pooling_mode_max_tokens=False,  # do not use the max over token embeddings
    pooling_mode_mean_tokens=True,  # average the token embeddings
)

In [20]:
# Bare expression: displays the pooling module's repr so the active pooling flags can be checked.
pooling_model

Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})

In [21]:
# Build the S-BERT model by chaining the transformer encoder and the pooling layer, in that order.
sbert_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=sbert_modules)

In [22]:
# Bare expression: displays the assembled SentenceTransformer pipeline as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [25]:
# Load the NLI training pairs.
# => Every usable row "sentence1<TAB>sentence2<TAB>label" becomes
#    InputExample(texts=[sentence1, sentence2], label=<int class id>).
logger.info("Read AllNLI train dataset")

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = []
skipped_rows = 0  # rows dropped because their label is not a key of label2int

# Iterate over the file object directly so the whole file is never held in memory at once.
with open(train_file, "rt", encoding="utf-8") as fIn:
    for line_no, line in enumerate(fIn, start=1):
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise break the 3-way unpack below.
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # Fail loudly, but say where: the bare unpack error gives no file/line context.
            raise ValueError(
                "{}:{}: expected 3 tab-separated columns, got {}".format(train_file, line_no, len(fields))
            )
        s1, s2, label = fields
        label = label2int.get(label.strip())
        if label is None:
            # A header row or a row without a usable gold label cannot be mapped to a class id.
            skipped_rows += 1
            continue
        train_samples.append(InputExample(texts=[s1, s2], label=label))

if skipped_rows:
    logger.info("Skipped {} rows with an unknown label in {}".format(skipped_rows, train_file))

2022-03-03 15:25:16,298 - s-bert - INFO - Read AllNLI train dataset


In [26]:
# Define the dataset, the data loader and the loss function.
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

# The softmax loss is configured with the sentence-embedding size and one class per NLI label.
sentence_dim = model.get_sentence_embedding_dimension()
num_nli_labels = len(label2int)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=sentence_dim,
    num_labels=num_nli_labels,
)

In [30]:
# Read the STS benchmark file and use it as the development set.
# Each row is "sentence1<TAB>sentence2<TAB>score", where the score rates the similarity
# of the two sentences (5.0 is the maximum = very similar).
logger.info("Read STSbenchmark dev dataset")
dev_samples = []

# Iterate over the file object directly instead of loading every line with readlines().
with open(eval_file, 'rt', encoding='utf-8') as fIn:
    for line_no, line in enumerate(fIn, start=1):
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise break the 3-way unpack below.
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # Fail loudly, but say where: the bare unpack error gives no file/line context.
            raise ValueError(
                "{}:{}: expected 3 tab-separated columns, got {}".format(eval_file, line_no, len(fields))
            )
        s1, s2, score = fields
        score = float(score.strip()) / 5.0  # divide by 5 so the label falls in the 0~1 range
        dev_samples.append(InputExample(texts=[s1, s2], label=score))

# The evaluator embeds both sentences of each pair, takes the cosine similarity of the two
# embeddings, and compares it with the gold score.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                                 batch_size=train_batch_size,
                                                                 name='sts-dev')

2022-03-03 15:29:15,338 - s-bert - INFO - Read STSbenchmark dev dataset


In [32]:

# Warm up the learning rate over the first 10% of all training steps.
total_train_steps = len(train_dataset) * num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
logger.info("Warmup-steps: {}".format(warmup_steps))

# Train the model; the dev evaluator is run every `evaluation_steps` steps and the
# model is written to `output_path`.
fit_options = dict(
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_options)


2022-03-03 15:30:38,966 - s-bert - INFO - Warmup-steps: 3439


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

In [39]:
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
# => Reload the trained S-BERT model that was saved to disk and measure its quality.
##############################################################################

# Each row of the test file is "sentence1<TAB>sentence2<TAB>score".
test_samples = []
# Iterate over the file object directly instead of loading every line with readlines().
with open(test_file, 'rt', encoding='utf-8') as fIn:
    for line_no, line in enumerate(fIn, start=1):
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise break the 3-way unpack below.
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # Fail loudly, but say where: the bare unpack error gives no file/line context.
            raise ValueError(
                "{}:{}: expected 3 tab-separated columns, got {}".format(test_file, line_no, len(fields))
            )
        s1, s2, score = fields
        score = float(score.strip()) / 5.0  # same scaling as the dev set: divide the score by 5
        test_samples.append(InputExample(texts=[s1, s2], label=score))

print("\n")
print("======================TEST===================")
print("\n\n")
# Rebind `model` to the copy loaded from disk so the saved artefact itself is what gets evaluated.
model = SentenceTransformer(model_save_path)
print(f"model save path > {model_save_path}")
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test', show_progress_bar=True)
# Kept as the last bare expression so the returned score is displayed as the cell output.
test_evaluator(model, output_path=model_save_path)






model save path > model/bmc_fpt_kowiki20200920.train_model_0225-s-bert-nli-0303


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

0.7059602810337253