# KLUE 의미 유사도 (STS)
STS(Semantic Textual Similarity) 의 목표는 입력으로 주어진 두 문장간의 의미 동등성을 수치로 표현하는 것

KLUE Benchmark 링크 : 
[https://klue-benchmark.com](https://klue-benchmark.com)


* Input : 1개의 문장
* Output : 두 문장의 유사도

전형적인 Classification 문제

데이터 구조
- id (string) : ID
- title (string)
- label (string)
- url (string): source lint
- data (date): publish date



# 필요 라이브러리 Import

In [1]:
!pip install scikit-learn
!pip install transformers
!pip install datasets
!pip install scikit-learn
!pip install xgboost
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.6 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 52.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading 

In [4]:
from datasets import load_dataset
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier, Pool, metrics
from transformers import BertTokenizerFast
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score
import os

# Data 확인하기

In [3]:
data = load_dataset('klue', 'ynat')
data



Downloading builder script:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

Downloading and preparing dataset klue/ynat (download: 4.70 MiB, generated: 11.59 MiB, post-processed: Unknown size, total: 16.29 MiB) to /root/.cache/huggingface/datasets/klue/ynat/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e...


Downloading data:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45678 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9107 [00:00<?, ? examples/s]

Dataset klue downloaded and prepared to /root/.cache/huggingface/datasets/klue/ynat/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

In [8]:
data['train'][0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

# 데이터를 모델이 입력하기 위해 다운로드하고 변환

In [11]:
def prepare_data():
    # set tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('klue/bert-base')
    # download dataset 
    klue_df = load_dataset('klue', 'ynat')

    # dataset이 train과 validation
    train_valid_df = klue_df['train']
    test_df = klue_df['validation']

    # train 90%, test 10%
    train_valid = train_valid_df.train_test_split(train_size=0.2, shuffle=False)
    train_df, valid_df = train_valid['train'], train_valid['test']

    # set data to numpy format
    train_input, train_label = data_np(train_df, tokenizer)
    valid_input, valid_label = data_np(valid_df, tokenizer)
    test_input, test_label = data_np(test_df, tokenizer)

    return train_input, valid_input, test_input, train_label, valid_label, test_label

In [13]:
def data_np(df, tokenizer):
    # sentence1 tokenizing
    data = tokenizer(
            df['title'],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            max_length=128
        )
    input = data['input_ids'].numpy()
    labels = df['label']
    return input, labels

# XGBoost 모델 선언 & 학습

In [35]:
# prepare data to [train, validation, test] sets
train_input, valid_input, test_input, train_label, valid_label, test_label = prepare_data()

# One hot encode label
train_label = np.eye(np.unique(train_label, axis=0).shape[0])[train_label]
valid_label = np.eye(np.unique(valid_label, axis=0).shape[0])[valid_label]

# data preprocessing for XGBoost
train_data = xgb.DMatrix(train_input, train_label)
valid_data = xgb.DMatrix(valid_input, valid_label)
data_list = [(train_data, 'train'), (valid_data, 'valid')]

# setting parameters
params = {}
params['num_class'] = 7
params['objective'] = 'multi:softmax'

# train XGBoost model
bst = xgb.train(params, train_data, num_boost_round = 1000, evals = data_list, early_stopping_rounds=100)



  0%|          | 0/2 [00:00<?, ?it/s]

[0]	train-merror:0.142857	valid-merror:0.142873
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 100 rounds.
[1]	train-merror:0.142748	valid-merror:0.1429
[2]	train-merror:0.142748	valid-merror:0.142928
[3]	train-merror:0.142638	valid-merror:0.142873
[4]	train-merror:0.14231	valid-merror:0.142928
[5]	train-merror:0.142419	valid-merror:0.142928
[6]	train-merror:0.142529	valid-merror:0.1429
[7]	train-merror:0.1422	valid-merror:0.142982
[8]	train-merror:0.141872	valid-merror:0.142955
[9]	train-merror:0.141762	valid-merror:0.142955
[10]	train-merror:0.141653	valid-merror:0.14301
[11]	train-merror:0.141325	valid-merror:0.14301
[12]	train-merror:0.141325	valid-merror:0.14301
[13]	train-merror:0.140777	valid-merror:0.14301
[14]	train-merror:0.140011	valid-merror:0.143092
[15]	train-merror:0.139464	valid-merror:0.143119
[16]	train-merror:0.139245	valid-merror:0.143146
[17]	train-merror:0.138916	valid-merro

## XGBoost 모델 평가

In [37]:
from sklearn.metrics import accuracy_score

# predict test data and evaluate
test_data = xgb.DMatrix(test_input)
test_predict = bst.predict(test_data)
accuracy_score(test_predict, test_label)

0.06160096628966729

# CatBoost 모델 선언 & 학습

In [42]:
# prepare data to [train, validation, test] sets
train_input, valid_input, test_input, train_label, valid_label, test_label = prepare_data()

# data preprocessing for catBoost
train_pool = Pool(train_input, label=train_label)
valid_pood = Pool(valid_input, label=valid_label)

# setting parameters
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    logging_level='Silent',
    iterations=1000,
    early_stopping_rounds=100
)
                        
# train catBoost model
model.fit(train_pool, eval_set=valid_pood)



  0%|          | 0/2 [00:00<?, ?it/s]

<catboost.core.CatBoostClassifier at 0x7f9003abba10>

## CatBoost 모델 평가

In [43]:
# predict test data and evaluate
test_pool = Pool(test_input)
test_predict = model.predict(test_pool)
accuracy_score(test_predict, test_label)

0.2894476776106292