# KLUE 의미 유사도 (STS)
STS(Semantic Textual Similarity) 의 목표는 입력으로 주어진 두 문장간의 의미 동등성을 수치로 표현하는 것

KLUE Benchmark 링크 : 
[https://klue-benchmark.com](https://klue-benchmark.com)


* Input : 2개의 문장
* Output : 두 문장의 유사도

전형적인 Regression 문제

데이터 구조
- id (string) : ID
- source (string)
- sentence1 (string)
- sentence2 (string)
    - labels (dict)
        - label: round by real_label
        - real-label [0~5]
        - binary-label [0,1]



# 필요 라이브러리 Import

In [1]:
!pip install scikit-learn
!pip install transformers
!pip install datasets
!pip install scikit-learn
!pip install xgboost
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 29.3 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 67.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading

In [2]:
from datasets import load_dataset
import numpy as np
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from transformers import BertTokenizerFast
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
import os

# Data 확인하기

In [3]:
data = load_dataset('klue', 'sts')
data



Downloading builder script:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

Downloading and preparing dataset klue/sts (download: 1.29 MiB, generated: 2.82 MiB, post-processed: Unknown size, total: 4.11 MiB) to /root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e...


Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset klue downloaded and prepared to /root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
        num_rows: 11668
    })
    validation: Dataset({
        features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
        num_rows: 519
    })
})

## binary-label 을 label로 변경

In [4]:
data = data.flatten()
data = data.rename_column('labels.binary-label','label')
data

DatasetDict({
    train: Dataset({
        features: ['guid', 'source', 'sentence1', 'sentence2', 'labels.label', 'labels.real-label', 'label'],
        num_rows: 11668
    })
    validation: Dataset({
        features: ['guid', 'source', 'sentence1', 'sentence2', 'labels.label', 'labels.real-label', 'label'],
        num_rows: 519
    })
})

In [5]:
train_data = data['train']
train_data

Dataset({
    features: ['guid', 'source', 'sentence1', 'sentence2', 'labels.label', 'labels.real-label', 'label'],
    num_rows: 11668
})

# 데이터를 모델이 입력하기 위해 다운로드하고 변환

In [6]:
def prepare_data():
    # set tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('klue/bert-base')
    # download dataset 
    klue_df = load_dataset('klue', 'sts')

    # make binary-label to label
    klue_df = klue_df.flatten()
    klue_df = klue_df.rename_column('labels.binary-label','label')

    # dataset이 train과 validation
    train_valid_df = klue_df['train']
    test_df = klue_df['validation']

    # train 90%, test 10%
    train_valid = train_valid_df.train_test_split(train_size=0.2, shuffle=False)
    train_df, valid_df = train_valid['train'], train_valid['test']

    # set data to numpy format
    train_input, train_label = data_np(train_df, tokenizer)
    valid_input, valid_label = data_np(valid_df, tokenizer)
    test_input, test_label = data_np(test_df, tokenizer)

    return train_input, valid_input, test_input, train_label, valid_label, test_label

In [7]:
def data_np(df, tokenizer):
    # sentence1 tokenizing
    data1 = tokenizer(
            df['sentence1'],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            max_length=128
        )
    sentence1 = data1['input_ids'].numpy()

    # sentence2 tokenizing
    data2 = tokenizer(
            df['sentence2'],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            max_length=128
        )
    sentence2 = data2['input_ids'].numpy()

    # count commun token
    communs = []
    for i in range(len(sentence1)):
        sent1 = np.delete(sentence1[i],[0])
        sent2 = np.delete(sentence2[i],[0])
        commun = len(np.intersect1d(sent1,sent2))
        communs.append(commun)

    # stack and change to Numpy format Dataset
    input = np.concatenate((sentence1, sentence2), axis=1)
    input = np.column_stack((input, communs))
    labels = df['label']
    return input, labels

# XGBoost 모델 선언 & 학습

In [8]:
# prepare data to [train, validation, test] sets
train_input, valid_input, test_input, train_label, valid_label, test_label = prepare_data()

# data preprocessing for XGBoost
train_data = xgb.DMatrix(train_input, label=train_label)
valid_data = xgb.DMatrix(valid_input, label=valid_label)
data_list = [(train_data, 'train'), (valid_data, 'valid')]

# setting parameters
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'rmse'

# train XGBoost model
bst = xgb.train(params, train_data, num_boost_round = 1000, evals = data_list, early_stopping_rounds=100)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]



  0%|          | 0/2 [00:00<?, ?it/s]

[0]	train-rmse:0.406091	valid-rmse:0.421577
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[1]	train-rmse:0.347425	valid-rmse:0.374767
[2]	train-rmse:0.307847	valid-rmse:0.347311
[3]	train-rmse:0.281438	valid-rmse:0.332646
[4]	train-rmse:0.260773	valid-rmse:0.322903
[5]	train-rmse:0.24643	valid-rmse:0.316664
[6]	train-rmse:0.235725	valid-rmse:0.312189
[7]	train-rmse:0.227219	valid-rmse:0.309217
[8]	train-rmse:0.217476	valid-rmse:0.307922
[9]	train-rmse:0.213536	valid-rmse:0.307128
[10]	train-rmse:0.206866	valid-rmse:0.306538
[11]	train-rmse:0.195867	valid-rmse:0.306588
[12]	train-rmse:0.190422	valid-rmse:0.307089
[13]	train-rmse:0.187696	valid-rmse:0.30637
[14]	train-rmse:0.182508	valid-rmse:0.306565
[15]	train-rmse:0.180064	valid-rmse:0.306782
[16]	train-rmse:0.172995	valid-rmse:0.307479
[17]	train-rmse:0.168491	valid-rmse:0.30744
[18]	train-rmse:0.163581	valid-rmse:0.307618
[19]	train-r

## XGBoost 모델 평가

In [9]:
# 모델 평가 함수
def evaluate(predict, label):
    # evaluate prediction
    mae = mean_absolute_error(label, predict) 
    mse = mean_squared_error(label, predict)
    # set threshold for evaluate Accuracy
    preds = np.where(predict >= 0.5, 1, 0)
    acc = accuracy_score(label,preds)
    print('MAE:', mae,'\tMSE: ', mse, '\tAccuracy: ', acc)

In [10]:
# predict test data and evaluate
test_data = xgb.DMatrix(test_input)
test_predict = bst.predict(test_data)
evaluate(test_predict, test_label)

MAE: 0.6116279082628691 	MSE:  0.5366507366796779 	Accuracy:  0.394990366088632


# CatBoost 모델 선언 & 학습

In [11]:
# prepare data to [train, validation, test] sets
train_input, valid_input, test_input, train_label, valid_label, test_label = prepare_data()

# data preprocessing for catBoost
train_pool = Pool(train_input, label=train_label)
valid_pood = Pool(valid_input, label=valid_label)

# setting parameters
model = CatBoostRegressor(  iterations=1000,
                            metric_period=100,
                            early_stopping_rounds=100
                          )
                        
# train catBoost model
model.fit(train_pool, eval_set=valid_pood)



  0%|          | 0/2 [00:00<?, ?it/s]

Learning rate set to 0.058165
0:	learn: 0.4876025	test: 0.4876282	best: 0.4876282 (0)	total: 67.4ms	remaining: 1m 7s




100:	learn: 0.2835484	test: 0.3044730	best: 0.3044730 (100)	total: 1.7s	remaining: 15.2s
200:	learn: 0.2496656	test: 0.3015058	best: 0.3011463 (182)	total: 3.37s	remaining: 13.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3011462737
bestIteration = 182

Shrink model to first 183 iterations.


<catboost.core.CatBoostRegressor at 0x7f37cc109d90>

## CatBoost 모델 평가

In [12]:
# predict test data and evaluate
test_pool = Pool(test_input)
test_predict = model.predict(test_pool)
evaluate(test_predict, test_label)

MAE: 0.5917366078050185 	MSE:  0.42034128231233375 	Accuracy:  0.3371868978805395
