# STS(Semantic Textual Similarity) Model

# 데이터 전처리 및 EDA
- 데이터 결측치 및 중복문장 제거
- 데이터 특수문자 제거
- 데이터 특징
 - 0에 편향된 분포를 가진 Label
 - binary-label은 real-label이 3 이상이면 1, 3 미만이면 0으로 표기된다

In [None]:
!pip install sentence-transformers transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 25.9 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.17-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 55.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 62.4 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_6

In [None]:
# wandb api키 입력
!pip install wandb
!wandb login

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import wandb
import tarfile
import pandas as pd
import seaborn as sns
import re
import numpy as np
import random
import torch
import requests

from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
from torch.nn.utils import clip_grad_norm_
from sklearn.model_selection import train_test_split
from functools import partial
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from scipy.stats import pearsonr

In [None]:
cd /content/drive/MyDrive/NLP

/content/drive/MyDrive/NLP


In [None]:
# Klue dataset 다운
!wget https://aistages-prod-server-public.s3.amazonaws.com/app/Competitions/000067/data/klue-sts-v1.1.tar.gz

In [None]:
# Extract the downloaded KLUE-STS archive into the project folder on Drive.
# The context manager closes the archive even when extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive; only use it
# on archives that come from a trusted source.
with tarfile.open('/content/drive/MyDrive/NLP/klue-sts-v1.1.tar.gz') as tar_file:
    tar_file.extractall(path='/content/drive/MyDrive/NLP')

In [None]:
# Load the dataset: the KLUE-STS train split goes into `df1`, the dev split into `test`.
df1 = pd.read_json('/content/drive/MyDrive/NLP/klue-sts-v1.1/klue-sts-v1.1_train.json')
test = pd.read_json('/content/drive/MyDrive/NLP/klue-sts-v1.1/klue-sts-v1.1_dev.json')

In [None]:
df1.head()

Unnamed: 0,guid,source,sentence1,sentence2,labels,annotations
0,klue-sts-v1_train_00000,airbnb-rtt,숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.,숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.,"{'label': 3.7, 'real-label': 3.714285714285714...","{'agreement': '0:0:0:2:5:0', 'annotators': ['0..."
1,klue-sts-v1_train_00001,policy-sampled,위반행위 조사 등을 거부·방해·기피한 자는 500만원 이하 과태료 부과 대상이다.,시민들 스스로 자발적인 예방 노력을 한 것은 아산 뿐만이 아니었다.,"{'label': 0.0, 'real-label': 0.0, 'binary-labe...","{'agreement': '5:0:0:0:0:0', 'annotators': ['1..."
2,klue-sts-v1_train_00002,paraKQC-sampled,회사가 보낸 메일은 이 지메일이 아니라 다른 지메일 계정으로 전달해줘.,사람들이 주로 네이버 메일을 쓰는 이유를 알려줘,"{'label': 0.30000000000000004, 'real-label': 0...","{'agreement': '4:2:0:0:0:0', 'annotators': ['1..."
3,klue-sts-v1_train_00003,policy-sampled,"긴급 고용안정지원금은 지역고용대응 등 특별지원금, 지자체별 소상공인 지원사업, 취업...","고용보험이 1차 고용안전망이라면, 국민취업지원제도는 2차 고용안전망입니다.","{'label': 0.6000000000000001, 'real-label': 0....","{'agreement': '4:2:1:0:0:0', 'annotators': ['1..."
4,klue-sts-v1_train_00004,airbnb-rtt,"호스트의 답장이 늦으나, 개선될 것으로 보입니다.",호스트 응답이 늦었지만 개선될 것으로 보입니다.,"{'label': 4.7, 'real-label': 4.714285714285714...","{'agreement': '0:0:0:0:2:5', 'annotators': ['1..."


In [None]:
# 중복문장 확인
df1.duplicated(['sentence1', 'sentence2']).sum()

7

In [None]:
# 결측치 확인
df1.isnull().sum()

guid           0
source         0
sentence1      0
sentence2      0
labels         0
annotations    0
dtype: int64

In [None]:
# Drop duplicated sentence pairs, keeping the first occurrence and renumbering the index.
# The result has to be assigned back to `df1`; assigning it to any other name leaves the
# duplicates in the frame that every later cell works on.
df1 = df1.drop_duplicates(['sentence1','sentence2'], keep='first', ignore_index=True)

In [None]:
# Each `labels` entry is a dict with three label variants: 'label', 'real-label' and 'binary-label'.
# binary-label is 1 for a similar pair and 0 for a dissimilar pair.
df1.labels[0]

{'binary-label': 1, 'label': 3.7, 'real-label': 3.714285714285714}

In [None]:
# Split the labels: expand the per-row label dicts into one column per label variant.
labels = pd.DataFrame(df1['labels'].tolist())
print(len(labels))
labels

11668


Unnamed: 0,label,real-label,binary-label
0,3.7,3.714286,1
1,0.0,0.000000,0
2,0.3,0.333333,0
3,0.6,0.571429,0
4,4.7,4.714286,1
...,...,...,...
11663,4.0,4.000000,1
11664,0.0,0.000000,0
11665,3.7,3.666667,1
11666,4.7,4.714286,1


In [None]:
# Number of rows per binary-label class (0 and 1).
labels['binary-label'].value_counts()

0    6066
1    5602
Name: binary-label, dtype: int64

In [None]:
# 학습에 필요한 column만 남기기
df1 = df1[['sentence1', 'sentence2']].join(labels[['binary-label', 'real-label']])
df1.head()

Unnamed: 0,sentence1,sentence2,binary-label,real-label
0,숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.,숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.,1,3.714286
1,위반행위 조사 등을 거부·방해·기피한 자는 500만원 이하 과태료 부과 대상이다.,시민들 스스로 자발적인 예방 노력을 한 것은 아산 뿐만이 아니었다.,0,0.0
2,회사가 보낸 메일은 이 지메일이 아니라 다른 지메일 계정으로 전달해줘.,사람들이 주로 네이버 메일을 쓰는 이유를 알려줘,0,0.333333
3,"긴급 고용안정지원금은 지역고용대응 등 특별지원금, 지자체별 소상공인 지원사업, 취업...","고용보험이 1차 고용안전망이라면, 국민취업지원제도는 2차 고용안전망입니다.",0,0.571429
4,"호스트의 답장이 늦으나, 개선될 것으로 보입니다.",호스트 응답이 늦었지만 개선될 것으로 보입니다.,1,4.714286


In [None]:
# Distribution of real-label in unit-wide score bins.
# The last bin is closed on the right so that pairs scored exactly 5.0 are counted;
# with `< 5.0` they fall into no bin and the bin sizes do not add up to len(df1).
la1 = df1.loc[(df1['real-label'] >= 0) & (df1['real-label'] < 1.0)]
la2 = df1.loc[(df1['real-label'] >= 1.0) & (df1['real-label'] < 2.0)]
la3 = df1.loc[(df1['real-label'] >= 2.0) & (df1['real-label'] < 3.0)]
la4 = df1.loc[(df1['real-label'] >= 3.0) & (df1['real-label'] < 4.0)]
la5 = df1.loc[(df1['real-label'] >= 4.0) & (df1['real-label'] <= 5.0)]

print(f' Score 1미만인 데이터: {len(la1)}\n Score 2미만인 데이터: {len(la2)}\n Score 3미만인 데이터: {len(la3)}\n Score 4미만인 데이터: {len(la4)}\n Score 5이하인 데이터: {len(la5)}')

 Score 1미만인 데이터: 4350
 Score 2미만인 데이터: 906
 Score 3미만인 데이터: 810
 Score 4미만인 데이터: 2852
 Score 5미만인 데이터: 2705


In [None]:
# binary-label distribution derived from the score bins.
# Add the bin sizes directly: `la1+la2+la3` performs an index-aligned element-wise
# addition of whole DataFrames and only happens to produce the right length.
print(f' Score 3미만인 데이터: {len(la1) + len(la2) + len(la3)}\n Score 3이상인 데이터: {len(la4) + len(la5)}')

 Score 3미만인 데이터: 6066
 Score 3이상인 데이터: 5557


In [None]:
# 테스트셋 중복 확인
test.duplicated(['sentence1', 'sentence2']).sum()

0

In [None]:
# 결측치 확인
test.isnull().sum()

guid           0
source         0
sentence1      0
sentence2      0
labels         0
annotations    0
dtype: int64

In [None]:
# Convert the labels of the test split: flatten the label dicts and keep the
# sentence pair plus the binary / real labels.
labels2 = pd.DataFrame(test['labels'].tolist())
test = test[['sentence1', 'sentence2']].join(labels2[['binary-label', 'real-label']])

## KorNLU Datasets 추가

[출처](https://github.com/kakaobrain/KorNLUDatasets/)

In [None]:
# KorNLUdataset 다운
!git clone https://github.com/kakaobrain/KorNLUDatasets/

fatal: destination path 'KorNLUDatasets' already exists and is not an empty directory.


In [None]:
path = '/content/drive/MyDrive/NLP/KorNLUDatasets/KorSTS/sts-train.tsv'

In [None]:
# "\t+" is a regular-expression separator, which only the Python parsing engine supports.
# Requesting that engine explicitly keeps the parsing the same and avoids the warning
# pandas emits when it has to fall back from the default C engine.
df2 = pd.read_csv(path, sep="\t+", engine="python")

  return func(*args, **kwargs)


In [None]:
# 스코어로 되어있는 형태
df2

In [None]:
# 중복값 확인
df2.duplicated(['sentence1', 'sentence2']).sum()

In [None]:
# 중복값 제거
df2 = df2.drop_duplicates(['sentence1','sentence2'], keep='first', ignore_index=True)

In [None]:
# Same rule as for the KLUE data: a score below 3 becomes 0, anything else becomes 1.
df2['binary-label'] = np.where(df2['score'] < 3, 0, 1)

In [None]:
# 스코어 확인
la11 = len(df2.loc[(df2['score'] >= 0) & (df2['score'] < 3.0)])
la11

In [None]:
# 분류 확인
df2['binary-label'].value_counts()

In [None]:
# Rename the score column so it matches the KLUE frame's 'real-label'.
df2 = df2.rename(columns={'score': 'real-label'})

In [None]:
# 필요한 column만 남기기
df2 = df2[['sentence1', 'sentence2', 'binary-label', 'real-label']]
df2

In [None]:
# 두 데이터 합치기
df = pd.concat([df1,df2],axis=0, join='inner', ignore_index=True)

In [None]:
# 확인 11661 + 5700
df.shape

In [None]:
# Distribution of real-label in the merged data, in unit-wide score bins.
# The last bin is closed on the right so that pairs scored exactly 5.0 are counted;
# with `< 5.0` they fall into no bin and the bin sizes do not add up to len(df).
la1 = df.loc[(df['real-label'] >= 0) & (df['real-label'] < 1.0)]
la2 = df.loc[(df['real-label'] >= 1.0) & (df['real-label'] < 2.0)]
la3 = df.loc[(df['real-label'] >= 2.0) & (df['real-label'] < 3.0)]
la4 = df.loc[(df['real-label'] >= 3.0) & (df['real-label'] < 4.0)]
la5 = df.loc[(df['real-label'] >= 4.0) & (df['real-label'] <= 5.0)]

print(f' Score 1미만인 데이터: {len(la1)}\n Score 2미만인 데이터: {len(la2)}\n Score 3미만인 데이터: {len(la3)}\n Score 4미만인 데이터: {len(la4)}\n Score 5이하인 데이터: {len(la5)}')

In [None]:
# binary-label distribution of the merged data, derived from the score bins.
# Add the bin sizes directly: `la1+la2+la3` performs an index-aligned element-wise
# addition of whole DataFrames and only happens to produce the right length.
print(f' Score 3미만인 데이터: {len(la1) + len(la2) + len(la3)}\n Score 3이상인 데이터: {len(la4) + len(la5)}')

In [None]:
# Histogram of the real-label distribution.
# The seaborn settings and the figure size are applied before drawing; setting them
# after the plot call is too late to affect this figure.
sns.set(rc = {'figure.figsize':(10,10)})
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above instead of on an implicit current axes.
sns.distplot(df['real-label'], hist=True, kde=False, ax=ax)
ax.set_xlabel('label')
ax.set_ylabel('value')

plt.show()

In [None]:
# Bar chart of the binary-label class frequencies.
# The column is passed by keyword because seaborn deprecated positional data arguments.
sns.countplot(x=df['binary-label'], palette= "RdPu")

In [None]:
# 데이터 전처리 
import html
import regex as re
from bs4 import BeautifulSoup


def preprocess(sentence):
    """Clean one raw sentence.

    HTML entities are unescaped and markup is stripped; afterwards newlines,
    double quotes and every character that is not an ASCII letter, a digit or
    a Hangul syllable are each replaced by one space (runs of spaces are not
    collapsed).
    """
    plain_text = BeautifulSoup(html.unescape(sentence), 'html.parser').text
    without_newlines = plain_text.replace("\n", " ")
    without_quotes = re.sub('"', ' ', without_newlines)
    return re.sub("[^a-zA-Z0-9가-힣]", " ", without_quotes)



def preprocess_train(df):
    """Return a new frame with both sentence columns cleaned by `preprocess`.

    Args:
        df: frame with 'sentence1', 'sentence2', 'real-label' and 'binary-label' columns.

    Returns:
        A DataFrame with the columns 'sentence1', 'sentence2', 'real-label',
        'binary-label' (in that order) and a fresh default index; the label
        columns are copied unchanged.
    """
    # Pull out all four columns first so a missing column fails before any cleaning work.
    raw_first = df['sentence1'].tolist()
    raw_second = df['sentence2'].tolist()
    scores = df['real-label'].tolist()
    classes = df['binary-label'].tolist()

    return pd.DataFrame({
        'sentence1': [preprocess(text) for text in raw_first],
        'sentence2': [preprocess(text) for text in raw_second],
        'real-label': scores,
        'binary-label': classes,
    })

In [None]:
df = preprocess_train(df)

In [None]:
# 데이터셋 9:1 비율로 train과 val 데이터 나누기
train, val = train_test_split(df, test_size=0.1, random_state = 42)

In [None]:
# Reset the index of every split to 0..n-1 without keeping the old index as a column.
train_data = train.reset_index(drop=True)
valid_data = val.reset_index(drop=True)
test_data = test.reset_index(drop=True)

In [None]:
# Device setup: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
print(device)

# available GPUs : 1
GPU name : Tesla P100-PCIE-16GB
cuda


In [None]:
# 모델, 토크나이저 불러오기
model = RobertaForSequenceClassification.from_pretrained("klue/roberta-base", num_labels = 1)
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

# 모델링

In [None]:
# Builds an (input, target) style Dataset from a DataFrame.
class CustomDataset(Dataset):
    """Map-style dataset yielding ``((sentence1, sentence2), real-label)`` samples.

    Args:
        data: DataFrame with 'sentence1', 'sentence2' and 'real-label' columns.

    Attributes:
        data: the frame passed in.
        input: list of (sentence1, sentence2) tuples in row order.
        label: the 'real-label' column as a pandas Series.
    """

    def __init__(self, data) -> None:
        self.data = data
        self.input = list(zip(self.data['sentence1'], self.data['sentence2']))
        self.label = self.data['real-label']

    def __len__(self):
        return len(self.label)

    def __getitem__(self, index):
        # `self.input` is a plain list and therefore positional, so the label must be
        # fetched by position as well. `self.label[index]` would look the row up by its
        # index *label* and return the wrong row (or raise KeyError) whenever the frame
        # does not carry a default 0..n-1 index.
        return self.input[index], self.label.iloc[index]

In [None]:
def custom_collate_fn(batch, max_length):
    """Collate ``((sentence1, sentence2), label)`` samples into model inputs.

    Args:
        batch: sequence of samples as produced by the dataset.
        max_length: every encoded pair is padded or truncated to this many tokens.

    Returns:
        A tuple of (the encoding returned by the module-level `tokenizer` as
        PyTorch tensors, a tensor holding the labels).
    """
    pairs, targets = zip(*batch)
    sentence_pairs = [(pair[0], pair[1]) for pair in pairs]

    encoded = tokenizer.batch_encode_plus(
        sentence_pairs,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    return encoded, torch.tensor(targets)

In [None]:
# Rebuild the test frame with a fresh 0..n-1 index and wrap it in a dataset.
test_data = test.reset_index(drop=True)
test_dataset = CustomDataset(test_data)

In [None]:
train_dataset = CustomDataset(train_data)
valid_dataset = CustomDataset(valid_data)
test_dataset = CustomDataset(test_data)

In [None]:
def save_checkpoint(model, optimizer, scheduler, epoch, loss,
                    ckpt_dir='/content/drive/MyDrive/AI09', prefix='model'):
    """Write the training state of one epoch to ``<ckpt_dir>/<prefix>.ckpt.<epoch>``.

    Args:
        model, optimizer, scheduler: objects whose ``state_dict()`` is stored.
        epoch: epoch number; stored in the checkpoint and used as the file suffix.
        loss: loss value stored alongside the state dicts.
        ckpt_dir: target directory; created if it does not exist yet.
        prefix: file name prefix. The defaults reproduce the original fixed path.
    """
    import os  # local import keeps this cell self-contained

    # torch.save() does not create missing directories, so make sure the target exists.
    os.makedirs(ckpt_dir, exist_ok=True)
    file_name = f'{ckpt_dir}/{prefix}.ckpt.{epoch}'

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss' : loss
        },
        file_name
    )

    print(f"Saving epoch {epoch} checkpoint at {file_name}")

In [None]:
def validate(model, dataloader):
    """Evaluate `model` on `dataloader` and log the results to wandb.

    Every batch is moved to the module-level `device` and run without gradients.
    The running average loss is printed and logged every 10 steps; at the end the
    mean loss, the F1 score of the "score >= 3" binarization and the Pearson
    correlation (both scaled to percent) are logged.

    Args:
        model: model that returns `.loss` and `.logits` when called with `labels`.
        dataloader: yields (encoded inputs, label tensor) batches.

    Returns:
        (mean validation loss, F1 in percent, Pearson correlation in percent).

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.eval()

    total_loss = 0
    batch_count = 0
    batch_loss = 0
    num_batches = 0
    # Per-batch arrays are collected and concatenated once at the end; appending to a
    # growing array inside the loop copies all previous predictions on every step.
    pred_chunks = []
    label_chunks = []

    for step, batch in enumerate(dataloader):
        num_batches += 1
        batch_count += 1
        batch = tuple(item.to(device) for item in batch)

        batch_input, batch_label = batch

        with torch.no_grad():
            outputs = model(**batch_input, labels = batch_label.float())

        loss = outputs.loss
        # reshape(-1) always gives a 1-D vector. A bare squeeze() turns a final batch of
        # size 1 into a 0-d tensor, which cannot be joined with the other batches.
        pred = outputs.logits.reshape(-1)

        pred_chunks.append(pred.detach().cpu().numpy())
        label_chunks.append(batch_label.detach().cpu().numpy())

        batch_loss += loss.item()
        total_loss += loss.item()

        if (step % 10) == 0 and step != 0:
            print(f"Step : {step}, valid Loss : {batch_loss / batch_count:.4f}")
            wandb.log({'valid_loss': batch_loss / batch_count})
            batch_loss = 0
            batch_count = 0

    if num_batches == 0:
        raise ValueError("validate() received a dataloader that yields no batches")

    pred_list = np.concatenate(pred_chunks)
    label_list = np.concatenate(label_chunks)

    # Binarize at the same threshold used for binary-label (score >= 3 -> 1).
    fone_pred = np.where(pred_list >=3, 1, 0)
    fone_label = np.where(label_list >=3, 1, 0)
    # f1_score expects (y_true, y_pred).
    fone = f1_score(fone_label, fone_pred) * 100
    p_score = pearsonr(pred_list, label_list)[0] * 100

    total_valid_loss = total_loss / num_batches

    # NOTE(review): the "total_f1_score " key carries a trailing space; it is kept as is
    # so existing wandb charts keep receiving the metric — confirm before renaming.
    wandb.log({'total_valid_loss': total_valid_loss, "total_f1_score ": fone, "total_pearsonr" : p_score})

    return total_valid_loss, fone, p_score

In [None]:
def train(model, optimizer, scheduler, train_dataloader, valid_dataloader, epochs):
    """Fine-tune `model` for `epochs` epochs, validating and checkpointing after each one.

    Args:
        model: model that returns `.loss` when called with `labels`.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        train_dataloader: yields (encoded inputs, label tensor) training batches.
        valid_dataloader: passed to `validate` at the end of every epoch.
        epochs: number of passes over `train_dataloader`.

    Side effects: registers the model with wandb, logs metrics to wandb, prints
    progress, and calls `save_checkpoint` after every epoch.

    NOTE(review): other cells of this notebook bind the name `train` to a DataFrame
    (the result of train_test_split); running such a cell after this definition
    replaces the function.
    """

    # Let wandb record gradients and parameters every 10 steps.
    wandb.watch(model, log="all", log_freq = 10)

    for epoch in range(epochs):
        print(f'****** Starting To Train Epoch #{epoch} ******')

        total_loss = 0   # summed over the whole epoch
        batch_loss = 0   # summed since the last progress log
        batch_count = 0  # batches since the last progress log

        model.to(device)
        # validate() switches the model to eval mode, so re-enable train mode each epoch.
        model.train()


        for step, batch in enumerate(train_dataloader):
            batch_count += 1
            batch = tuple(item.to(device) for item in batch)

            batch_input, batch_label = batch
            model.zero_grad()

            # Labels are cast to float before being handed to the model.
            outputs = model(**batch_input, labels = batch_label.float())
            loss = outputs.loss

            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            # Clip the global gradient norm to 1.0 before the update.
            clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()


            # Every 10 steps: log the average loss since the previous log and the current LR.
            if (step % 10) == 0 and step != 0:
                wandb.log({'train_loss': batch_loss / batch_count, 'train_lr': optimizer.param_groups[0]['lr']})
                print(f"Epoch: {epoch}, Step : {step}, LR : {optimizer.param_groups[0]['lr']}, Avg Loss : {batch_loss / batch_count:.4f}")
                batch_loss, batch_count = 0,0

        # Epoch summary; `step + 1` is the number of batches seen in this epoch.
        wandb.log({'total_train_loss': total_loss / (step + 1), 'total_train_lr': optimizer.param_groups[0]['lr'], "epoch" : (epoch + 1)})
        print(f"Epoch {epoch} total_train_loss : {total_loss/(step+1):.4f}")
        print(f"***** Finish To Train Epoch {epoch} *****\n")

        print(f"*****Epoch {epoch} Valid Start*****")
        total_valid_loss, fone, p_score = validate(model, valid_dataloader)
        print('total_valid_loss : ', total_valid_loss, "val_f1_score : ",  fone,  "val_pearsonr :",  p_score)
        print(f"Epoch {epoch} total_Valid Loss : {total_valid_loss:.4f}")
        print(f"*****Epoch {epoch} Valid Finish*****\n")
        # One checkpoint per epoch, tagged with that epoch's validation loss.
        save_checkpoint(model, optimizer, scheduler,  epoch, total_valid_loss)



    print("Train Finished")

In [None]:
# wandb sweep definition: Bayesian search minimizing total_valid_loss.
# Every hyper-parameter is a categorical choice with a single candidate value.
sweep_config = {
    "name": "AI09_v0",
    "method": "bayes",
    "metric": {"name": "total_valid_loss", "goal": "minimize"},
    "parameters": {
        param_name: {"distribution": "categorical", "values": [only_value]}
        for param_name, only_value in (
            ("epochs", 4),
            ("learning_rate", 1e-5),
            ("eps", 1e-8),
            ("train_batch_size", 8),
            ("valid_batch_size", 16),
            ("weight_decay", 0),
            ("warm_up_ratio", 0),
            ("max_length", 128),
            ("grad_norm", 1.0),
        )
    },
    "early_terminate": {"type": "hyperband", "min_iter": 2, "eta": 2},
}

In [None]:
def initializer(config=None):
    """Entry point of one wandb sweep run: build optimizer and scheduler, then train.

    The learning rate, Adam epsilon, weight decay, warm-up ratio and epoch count
    are read from the run's wandb config, so values chosen by the sweep are
    actually used. A key missing from the config falls back to the value that
    was previously hard-coded here (lr 1e-5, eps 1e-8, weight decay 0, warm-up
    ratio 0) or, for the epoch count, to the notebook-level `epochs`.

    The batch sizes and `max_length` of the sweep config are not applied here,
    because this function uses the module-level data loaders.

    NOTE(review): the module-level `model` is trained in place, so a second run
    in the same kernel continues from the previous run's weights — confirm
    whether a fresh model per run is intended.
    """
    wandb.init(config=config)
    run_config = wandb.config

    learning_rate = run_config.get("learning_rate", 1e-5)
    adam_eps = run_config.get("eps", 1e-8)
    weight_decay = run_config.get("weight_decay", 0)
    warm_up_ratio = run_config.get("warm_up_ratio", 0)
    num_epochs = run_config.get("epochs")
    if num_epochs is None:
        num_epochs = epochs  # notebook-level fallback

    # Parameters whose name contains one of these markers are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = AdamW(
                      optimizer_grouped_parameters,
                      lr = learning_rate,
                      eps = adam_eps
                      )
    # The scheduler is stepped once per batch, so its horizon is epochs * batches.
    num_training_steps = num_epochs * len(train_dataloader)

    scheduler = get_linear_schedule_with_warmup(
                                                optimizer=optimizer,
                                                num_warmup_steps= int(num_training_steps * warm_up_ratio),
                                                num_training_steps = num_training_steps
                                                )

    train(model, optimizer, scheduler, train_dataloader, valid_dataloader, num_epochs)

In [None]:
# Data loaders. Training batches (size 8) are drawn in random order; validation and
# test batches (size 16) are read in dataset order. Every loader tokenizes through
# custom_collate_fn with max_length fixed to 128.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=8,
    collate_fn=partial(custom_collate_fn, max_length=128),
)
valid_dataloader = DataLoader(
    valid_dataset,
    sampler=SequentialSampler(valid_dataset),
    batch_size=16,
    collate_fn=partial(custom_collate_fn, max_length=128),
)
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=16,
    collate_fn=partial(custom_collate_fn, max_length=128),
)

## 추론 검증

In [None]:
def test(model, dataloader):
    """Evaluate `model` on `dataloader` and print loss, F1 and Pearson correlation.

    Mirrors `validate` without the wandb logging: batches are moved to the
    module-level `device` and run without gradients; the F1 score is computed on
    the "score >= 3" binarization, and F1 and Pearson are scaled to percent.

    Args:
        model: model that returns `.loss` and `.logits` when called with `labels`.
        dataloader: yields (encoded inputs, label tensor) batches.

    Raises:
        ValueError: if the dataloader yields no batches.

    NOTE(review): the notebook also binds the name `test` to the dev-split
    DataFrame in an earlier cell; this definition replaces that binding.
    """
    model.to(device)
    model.eval()

    total_loss = 0
    num_batches = 0
    # Collected per batch and concatenated once, instead of re-copying a growing array.
    pred_chunks = []
    label_chunks = []

    for batch in dataloader:
        num_batches += 1
        batch = tuple(item.to(device) for item in batch)

        batch_input, batch_label = batch

        with torch.no_grad():
            # Cast the labels to float, exactly as train() and validate() do.
            outputs = model(**batch_input, labels = batch_label.float())

        # reshape(-1) keeps a 1-D vector even for a final batch of size 1, where a bare
        # squeeze() would produce a 0-d tensor that cannot be joined with the others.
        pred = outputs.logits.reshape(-1)

        pred_chunks.append(pred.detach().cpu().numpy())
        label_chunks.append(batch_label.detach().cpu().numpy())

        total_loss += outputs.loss.item()

    if num_batches == 0:
        raise ValueError("test() received a dataloader that yields no batches")

    pred_np = np.concatenate(pred_chunks)
    label_np = np.concatenate(label_chunks)

    total_valid_loss = total_loss / num_batches

    fone_pred = np.where(pred_np >=3, 1, 0)
    fone_label = np.where(label_np >=3, 1, 0)

    # f1_score expects (y_true, y_pred).
    fone= f1_score(fone_label, fone_pred) * 100
    p_score = pearsonr(pred_np, label_np)[0] * 100
    print('total_test_loss : ' , total_valid_loss, "total_f1_score : " , fone, "total_pearsonr:" , p_score)

In [None]:
epochs = 4
sweep_id = wandb.sweep(sweep_config, project = "AI09_v0")
wandb.agent(sweep_id, initializer, count = 1)

In [None]:
ckpt1 = '/content/drive/MyDrive/AI09/model.ckpt.0'
ckpt2 = '/content/drive/MyDrive/AI09/model.ckpt.1'
ckpt3 = '/content/drive/MyDrive/AI09/model.ckpt.2'
ckpt4 = '/content/drive/MyDrive/AI09/model.ckpt.3'

In [None]:
# Evaluate every checkpoint of the first run on the test set, starting from a freshly
# instantiated model whose weights are replaced by each checkpoint in turn.
model = RobertaForSequenceClassification.from_pretrained("klue/roberta-base", num_labels = 1)
all_checkpoints = [ckpt1, ckpt2, ckpt3, ckpt4]

for checkpoint in all_checkpoints:
    # map_location lets a checkpoint written on a GPU load on the device in use now.
    loaded_ckpt = torch.load(checkpoint, map_location=device)
    # Print epoch and stored loss so each test line below can be tied to its checkpoint
    # (a bare tuple expression inside a loop is evaluated and discarded).
    print(f"epoch : {loaded_ckpt['epoch']}, valid loss : {loaded_ckpt['loss']}")
    model.load_state_dict(loaded_ckpt["model_state_dict"])
    test(model, test_dataloader)

# Hyper Parameter Tuning
- wandb에 내장된 sweep 기능을 활용하여 하이퍼 파라미터 튜닝을 진행

Tuning 한 Parameter들

> epochs : 4   
> LR : 1e-5, 2e-5, 3e-5  
> eps : 1e-8  
> train_set_batch_size : 8, 16  
> weight_decay : 0, 0.01  
> warm_up_ratio : 0, 0.1

![image](https://velog.velcdn.com/images/khyait/post/8b8dbd46-12e3-4716-b018-a10a2c183c20/image.png)

- 튜닝 결과
> epochs : 4   
> LR : 2e-5  
> eps : 1e-8  
> train_set_batch_size : 8  
> weight_decay : 0 
> warm_up_ratio : 0.1

위와 같은 Parameter로 진행

In [None]:
# wandb sweep definition for the tuned configuration (learning rate 2e-5, warm-up ratio 0.1).
# Every hyper-parameter is a categorical choice with a single candidate value.
sweep_config = {
    "name": "AI09_v3",
    "method": "bayes",
    "metric": {"name": "total_valid_loss", "goal": "minimize"},
    "parameters": {
        param_name: {"distribution": "categorical", "values": [only_value]}
        for param_name, only_value in (
            ("epochs", 4),
            ("learning_rate", 2e-5),
            ("eps", 1e-8),
            ("train_batch_size", 8),
            ("valid_batch_size", 16),
            ("weight_decay", 0),
            ("warm_up_ratio", 0.1),
            ("max_length", 128),
            ("grad_norm", 1.0),
        )
    },
    "early_terminate": {"type": "hyperband", "min_iter": 2, "eta": 2},
}

In [None]:
def save_checkpoint(model, optimizer, scheduler, epoch, loss,
                    ckpt_dir='/content/drive/MyDrive/AI09', prefix='model_v3'):
    """Write the training state of one epoch to ``<ckpt_dir>/<prefix>.ckpt.<epoch>``.

    Args:
        model, optimizer, scheduler: objects whose ``state_dict()`` is stored.
        epoch: epoch number; stored in the checkpoint and used as the file suffix.
        loss: loss value stored alongside the state dicts.
        ckpt_dir: target directory; created if it does not exist yet.
        prefix: file name prefix. The defaults reproduce the original fixed path.
    """
    import os  # local import keeps this cell self-contained

    # torch.save() does not create missing directories, so make sure the target exists.
    os.makedirs(ckpt_dir, exist_ok=True)
    file_name = f'{ckpt_dir}/{prefix}.ckpt.{epoch}'

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss' : loss
        },
        file_name
    )

    print(f"Saving epoch {epoch} checkpoint at {file_name}")

In [None]:
def initializer(config=None):
    """Entry point of one wandb sweep run: build optimizer and scheduler, then train.

    The learning rate, Adam epsilon, weight decay, warm-up ratio and epoch count
    are read from the run's wandb config, so values chosen by the sweep are
    actually used. A key missing from the config falls back to the value that
    was previously hard-coded here (lr 2e-5, eps 1e-8, weight decay 0, warm-up
    ratio 0.1) or, for the epoch count, to the notebook-level `epochs`.

    The batch sizes and `max_length` of the sweep config are not applied here,
    because this function uses the module-level data loaders.

    NOTE(review): the module-level `model` is trained in place, so a run started
    in a kernel where `model` was already trained continues from those weights —
    confirm whether a fresh model per run is intended.
    """
    wandb.init(config=config)
    run_config = wandb.config

    learning_rate = run_config.get("learning_rate", 2e-5)
    adam_eps = run_config.get("eps", 1e-8)
    weight_decay = run_config.get("weight_decay", 0)
    warm_up_ratio = run_config.get("warm_up_ratio", 0.1)
    num_epochs = run_config.get("epochs")
    if num_epochs is None:
        num_epochs = epochs  # notebook-level fallback

    # Parameters whose name contains one of these markers are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = AdamW(
                      optimizer_grouped_parameters,
                      lr = learning_rate,
                      eps = adam_eps
                      )
    # The scheduler is stepped once per batch, so its horizon is epochs * batches.
    num_training_steps = num_epochs * len(train_dataloader)

    scheduler = get_linear_schedule_with_warmup(
                                                optimizer=optimizer,
                                                num_warmup_steps= int(num_training_steps * warm_up_ratio),
                                                num_training_steps = num_training_steps
                                                )

    train(model, optimizer, scheduler, train_dataloader, valid_dataloader, num_epochs)

In [None]:
epochs = 4
sweep_id = wandb.sweep(sweep_config, project = "AI09_v3")
wandb.agent(sweep_id, initializer, count = 1)

Create sweep with ID: n5zwrnca
Sweep URL: https://wandb.ai/kdb/AI09_v3/sweeps/n5zwrnca


[34m[1mwandb[0m: Agent Starting Run: phuahdr5 with config:
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	grad_norm: 1
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	max_length: 128
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	valid_batch_size: 16
[34m[1mwandb[0m: 	warm_up_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33mkdb[0m. Use [1m`wandb login --relogin`[0m to force relogin




****** Starting To Train Epoch #0 ******
Epoch: 0, Step : 10, LR : 2.814738996929376e-07, Avg Loss : 8.6906
Epoch: 0, Step : 20, LR : 5.373592630501535e-07, Avg Loss : 9.3178
Epoch: 0, Step : 30, LR : 7.932446264073696e-07, Avg Loss : 6.6221
Epoch: 0, Step : 40, LR : 1.0491299897645856e-06, Avg Loss : 7.9820
Epoch: 0, Step : 50, LR : 1.3050153531218015e-06, Avg Loss : 5.4882
Epoch: 0, Step : 60, LR : 1.5609007164790175e-06, Avg Loss : 6.1227
Epoch: 0, Step : 70, LR : 1.8167860798362336e-06, Avg Loss : 5.7768
Epoch: 0, Step : 80, LR : 2.0726714431934496e-06, Avg Loss : 4.3740
Epoch: 0, Step : 90, LR : 2.3285568065506653e-06, Avg Loss : 3.4860
Epoch: 0, Step : 100, LR : 2.5844421699078814e-06, Avg Loss : 2.7580
Epoch: 0, Step : 110, LR : 2.8403275332650976e-06, Avg Loss : 3.1223
Epoch: 0, Step : 120, LR : 3.0962128966223133e-06, Avg Loss : 3.0410
Epoch: 0, Step : 130, LR : 3.3520982599795294e-06, Avg Loss : 2.8393
Epoch: 0, Step : 140, LR : 3.6079836233367456e-06, Avg Loss : 2.3835
Epoch

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▃▆█
total_f1_score,▁▃██
total_pearsonr,▁▆██
total_train_loss,█▂▁▁
total_train_lr,█▆▃▁
total_valid_loss,█▁▁▂
train_loss,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_lr,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
valid_loss,▅▂▅█▃▄▅▇▅▅▂▁▃▅▁▃▃▅▃▃▂▂▃▄▁▂▄▅▂▃▃▂▃▅▁▃▄▅▃▃

0,1
epoch,4.0
total_f1_score,93.15389
total_pearsonr,95.27955
total_train_loss,0.10423
total_train_lr,0.0
total_valid_loss,0.29429
train_loss,0.10227
train_lr,0.0
valid_loss,0.29395


In [None]:
ckpt1 = '/content/drive/MyDrive/AI09/model_v3.ckpt.0'
ckpt2 = '/content/drive/MyDrive/AI09/model_v3.ckpt.1'
ckpt3 = '/content/drive/MyDrive/AI09/model_v3.ckpt.2'
ckpt4 = '/content/drive/MyDrive/AI09/model_v3.ckpt.3'
ckpt5 = '/content/drive/MyDrive/AI09/model_v3.ckpt.4'

In [None]:
# Evaluate every checkpoint of the tuned run on the test set, starting from a freshly
# instantiated model whose weights are replaced by each checkpoint in turn.
model = RobertaForSequenceClassification.from_pretrained("klue/roberta-base", num_labels = 1)
all_checkpoints = [ckpt1, ckpt2, ckpt3, ckpt4]

for checkpoint in all_checkpoints:
    # map_location lets a checkpoint written on a GPU load on the device in use now.
    loaded_ckpt = torch.load(checkpoint, map_location=device)
    # Print epoch and stored loss so each test line below can be tied to its checkpoint
    # (a bare tuple expression inside a loop is evaluated and discarded).
    print(f"epoch : {loaded_ckpt['epoch']}, valid loss : {loaded_ckpt['loss']}")
    model.load_state_dict(loaded_ckpt["model_state_dict"])
    test(model, test_dataloader)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'clas

total_test_loss :  0.45487885092367386 total_f1_score :  84.95934959349594 total_pearsonr: 90.83055423360408
total_test_loss :  0.46361641060870673 total_f1_score :  85.37074148296594 total_pearsonr: 91.37930542388045
total_test_loss :  0.44515289203261077 total_f1_score :  85.71428571428572 total_pearsonr: 91.37805093690585
total_test_loss :  0.3944186128929831 total_f1_score :  86.18556701030928 total_pearsonr: 91.88289655591177


# Data Augmentation 
- 불균형한 데이터셋을 보강하기 위해 Data Augmentation 실행  

진행순서
- 문장 생성
- 유사도 높은 문장을 모아 data pair를 구성
-  data pair의 유사도를 측정해 라벨링

In [None]:
# hanspell 을 통해 sentence1, sentence2 맞춤법 검사
!pip install git+https://github.com/ssut/py-hanspell.git

In [None]:
from hanspell import spell_checker

# Run the spell checker on every sentence1 and keep the 'checked' text of each result.
checked_lst = [
    spell_checker.check(df['sentence1'][row]).as_dict()['checked']
    for row in range(len(df))
]

In [None]:
temp = spell_checker.check(df['sentence2'][1308]).as_dict()

In [None]:
# Run the spell checker on every sentence2; keep the 'checked' text in checked_lst2
# and the 'errors' field of each result in temp_lst.
sentence2_results = [
    spell_checker.check(df['sentence2'][row]).as_dict()
    for row in range(len(df))
]
checked_lst2 = [result['checked'] for result in sentence2_results]
temp_lst = [result['errors'] for result in sentence2_results]

In [None]:
# mean 0.45 error fixed
np.mean(temp_lst)

In [None]:
df['spell_checked1'] = checked_lst
df['spell_checked2'] = checked_lst2
df.head()

![](https://velog.velcdn.com/images/khyait/post/34053b8f-a4b2-4fb6-a56f-cf05fc703f68/image.png)

In [None]:
# Save the training pairs for the augmentation step.
# preprocess_train() names the label columns with hyphens ('real-label', 'binary-label'),
# while this selection and the cells that read the CSV use underscores, so the names are
# normalized first. rename() ignores keys that are absent, which makes this a no-op when
# the columns already carry the underscore names.
df.rename(
    columns={'real-label': 'real_label', 'binary-label': 'binary_label'}
)[['sentence1','sentence2','real_label','binary_label']].to_csv('klue_sts_train_aug.csv', index=False)

In [None]:
train_aug = pd.read_csv('klue_sts_train_aug.csv')

In [None]:
train, val = train_test_split((train_aug), test_size=0.1, random_state = 42)

In [None]:
# Re-index the augmented train / validation splits to 0..n-1 without keeping the old index.
train_data = train.reset_index(drop=True)
valid_data = val.reset_index(drop=True)
# NOTE(review): `test_df` is not defined in any cell visible in this notebook — TODO confirm
# where it is created. This line is kept in its original form for that reason.
test_data = test_df.reset_index().drop(['index'], axis = 1)

In [None]:
sns.distplot(train_data['real_label'])
sns.distplot(valid_data['real_label'])

![](https://velog.velcdn.com/images/khyait/post/fe307b0c-c0da-4004-8743-72afa5d6347e/image.png)

In [None]:
sum_train = pd.concat([train_data, valid_data])

In [None]:
from sentence_transformers.readers import InputExample


def _make_sts_examples(frame):
  """Convert an STS frame into sentence-transformers InputExample objects.

  `frame` must provide 'sentence1', 'sentence2' and 'real_label' columns.
  Each row yields two examples — (sentence1, sentence2) and the swapped
  (sentence2, sentence1) — because similarity is symmetric. The label is
  real_label divided by 5.0.

  Returns the examples as a list, two consecutive entries per row.
  """
  examples = []
  for sentence1, sentence2, score in zip(frame['sentence1'], frame['sentence2'], frame['real_label']):
    scaled = float(score) / 5.0  # rescale the label for similarity scoring
    examples.append(InputExample(texts=[sentence1, sentence2], label=scaled))
    examples.append(InputExample(texts=[sentence2, sentence1], label=scaled))
  return examples


# sentence_transformers expects its inputs as InputExample(texts, label).
gold_samples = _make_sts_examples(sum_train)
test_samples = _make_sts_examples(test_data)

In [None]:
# Container for the mined (sentence, sentence) silver pairs; filled later.
silver_data = []

# Every distinct sentence that occurs on either side of a gold pair.
unique_sentences = {text for sample in gold_samples for text in sample.texts}

print(len(unique_sentences))

In [None]:
# Freeze the set into a list so each sentence gets a stable integer id.
# Sorting removes the dependence on set iteration order (which varies between
# kernel sessions because of string-hash randomisation), so the ids — and any
# row order derived from them — are reproducible.
unique_sentences = sorted(unique_sentences, key=str)
sent2idx = {sentence: idx for idx, sentence in enumerate(unique_sentences)}
# Index pairs that already exist in the gold data; used to avoid re-mining them.
duplicates = {(sent2idx[data.texts[0]], sent2idx[data.texts[1]]) for data in gold_samples}

In [None]:
# Load the pretrained sentence-embedding model used for semantic search.
semantic_search_model = SentenceTransformer("Huffon/sentence-klue-roberta-base")
# Embed every unique sentence (batches of 128); the result is returned as a tensor.
embeddings = semantic_search_model.encode(unique_sentences, batch_size = 128, convert_to_tensor=True)

In [None]:
top_k = 3
# For every sentence, pair it with its top_k most cosine-similar sentences and
# keep those pairs as additional (silver) data.
progress = tqdm.tqdm(unit="docs", total=len(sent2idx))
for idx in range(len(unique_sentences)):
    sentence_embedding = embeddings[idx]
    # Compare this sentence against every sentence (including itself).
    cos_scores = util.cos_sim(sentence_embedding, embeddings)[0].cpu()
    progress.update(1)

    # Ask for top_k + 1 hits because the sentence itself is among the best
    # matches; clamp k so a corpus smaller than top_k + 1 does not make topk raise.
    top_results = torch.topk(cos_scores, k=min(top_k + 1, len(cos_scores)))

    # Convert the hit indices to plain ints. As 0-d tensors they hash by object
    # identity, so `(iid, idx) not in duplicates` could never match the int
    # tuples stored in `duplicates` and no pair was ever filtered out.
    for iid in top_results.indices.tolist():
        if iid != idx and (iid, idx) not in duplicates:
            silver_data.append((unique_sentences[idx], unique_sentences[iid]))
            duplicates.add((idx, iid))

progress.reset()
progress.close()

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

# Cross-encoder training configuration.
train_batch_size = 16
num_epochs = 4
# NOTE(review): max_seq_length is defined but not passed to anything in this
# cell — confirm whether it was meant to be given to CrossEncoder.
max_seq_length = 128

# klue/roberta-base with a single output (num_labels=1).
cross_encoder = CrossEncoder("klue/roberta-base", num_labels=1)

# Training batches are drawn, shuffled, from the gold examples.
train_dataloader = DataLoader(gold_samples, batch_size=train_batch_size, shuffle=True)
# Performance is tracked on the test examples.
evaluator = CECorrelationEvaluator.from_input_examples(test_samples, name='sts-test')

# Warm up over the first 10% of all training steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
print(warmup_steps)

In [None]:
# Fine-tune the cross-encoder on the gold pairs for num_epochs epochs,
# using the evaluator and warm-up step count prepared beforehand.
cross_encoder.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps)

In [None]:
# Score every mined silver pair with the fine-tuned cross-encoder.
silver_scores = cross_encoder.predict(silver_data)
# Sanity check: every predicted score must lie in [0, 1].
assert all(0.0 <= score <= 1.0 for score in silver_scores)

In [None]:
# Split the list of (sentence, sentence) tuples into two parallel tuples.
silver_sentence1, silver_sentence2 = zip(*silver_data)

In [None]:
# Assemble the silver set as a DataFrame; predicted scores are multiplied by
# 5.0 to put them on the real_label scale.
# NOTE(review): this rebinds `silver_data` from the list of pairs to a
# DataFrame, so earlier cells that expect the list see a different type if
# they are re-run after this one.
silver_data = pd.DataFrame({
    "sentence1" : silver_sentence1,
    "sentence2" : silver_sentence2,
    "real_label" : silver_scores * 5.0}
)

In [None]:
# Display the labeled silver pairs.
silver_data

![](https://velog.velcdn.com/images/khyait/post/0a750598-714c-4d38-a987-3b8bb605d88c/image.png)

In [None]:
# Distribution of the predicted silver labels.
sns.displot(silver_data['real_label'])

![](https://velog.velcdn.com/images/khyait/post/912bee07-f7c8-48a5-adae-603cbb847e57/image.png)

In [None]:
def make_balance(x):
  """
  Assign a score to one of the sections A, B, C or Z along the x-axis of the
  label-distribution plot.

  The cut points were chosen by hand to isolate the score ranges that most
  need undersampling:
    A : 3   <  x < 3.5
    B : 3.5 <= x < 3.8
    C : 3.8 <= x < 4.6
    Z : everything else (x <= 3 or x >= 4.6), judged not to need undersampling
  """
  if 3 < x < 3.5:
    return 'A'
  if 3.5 <= x < 3.8:
    return 'B'
  if 3.8 <= x < 4.6:
    return 'C'
  return 'Z'

In [None]:
# Tag every silver row with its undersampling section ('A', 'B', 'C' or 'Z').
silver_data['drop_label'] = silver_data['real_label'].apply(make_balance)

In [None]:
# Shuffle the silver rows before undersampling. The seed makes the shuffle —
# and therefore which rows the head-based undersampling removes — reproducible.
silver_shuffled = silver_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Undersample the over-represented sections: for each section, discard up to
# the given number of rows (the first ones in shuffled order).
# `drop_label` holds the section letters produced by make_balance, so it must
# be compared against 'A'/'B'/'C'; comparing against 3.5 / 3.8 / 4.75 never
# matched and nothing was dropped.
# NOTE(review): mapping 3.5 -> 'A', 3.8 -> 'B', 4.75 -> 'C' assumes the old
# numeric tags were the upper bounds of the three sections — confirm.
undersample_plan = [('A', 2500), ('B', 40000), ('C', 30000)]
for section, max_drop in undersample_plan:
    indexNames = silver_shuffled[silver_shuffled['drop_label'] == section][:max_drop].index
    silver_shuffled = silver_shuffled.drop(index=indexNames)

In [None]:
# Label distribution after undersampling. sns.distplot is deprecated;
# histplot(kde=True, stat='density', cut=3) is the documented equivalent.
sns.histplot(silver_shuffled['real_label'], kde=True, stat='density', kde_kws=dict(cut=3));

![](https://velog.velcdn.com/images/khyait/post/bfa0363e-47e2-4a23-80e0-2e8093b99faa/image.png)


In [None]:
# Draw a fixed-size, seeded sample of 15,000 silver rows and remove the helper
# drop_label column created for undersampling.
sampled_df = silver_shuffled.sample(n=15000, random_state=17)
sampled_df = sampled_df.drop('drop_label', axis=1)

# Final check of the label distribution. sns.distplot is deprecated;
# histplot(kde=True, stat='density', cut=3) is the documented equivalent.
sns.histplot(sampled_df['real_label'], kde=True, stat='density', kde_kws=dict(cut=3));

![](https://velog.velcdn.com/images/khyait/post/3cc8081f-eb88-4f7d-9dba-8468260b53c0/image.png)

In [None]:
# Write the augmented train / validation frames to CSV (row index omitted).
# NOTE(review): I could not find where `aug_train_data` / `aug_valid_data` are
# assigned — presumably they combine `sampled_df` with the gold split; confirm
# they are defined before this cell on a fresh kernel.
aug_train_data.to_csv('aug_klue_sts_train.csv', index=False)
aug_valid_data.to_csv('aug_klue_sts_vaild.csv', index=False)

In [None]:
# 증강 데이터 다운
train = pd.read_csv('/content/drive/MyDrive/NLP/aug_klue_sts_train.csv')
val =  pd.read_csv('/content/drive/MyDrive/NLP/aug_klue_sts_vaild.csv')
test = pd.read_json('/content/drive/MyDrive/NLP/klue-sts-v1.1/klue-sts-v1.1_dev.json')

In [None]:
# Run both augmented splits through preprocess_train().
train = preprocess_train(train)
val = preprocess_train(val)

In [None]:
# Switch the label columns to the hyphenated names.
label_columns = {'real_label': 'real-label', 'binary_label': 'binary-label'}
train = train.rename(columns=label_columns)
val = val.rename(columns=label_columns)

# Recompute the validation binary label: 0 when real-label < 3, otherwise 1.
# NOTE(review): only `val` is recomputed here, `train` keeps whatever
# binary-label it was loaded with — confirm this asymmetry is intended.
val['binary-label'] = val['real-label'].map(lambda score: 0 if score < 3 else 1)

# Keep the first occurrence of each (sentence1, sentence2) pair and renumber rows.
pair_columns = ['sentence1', 'sentence2']
train = train.drop_duplicates(subset=pair_columns, keep='first', ignore_index=True)
val = val.drop_duplicates(subset=pair_columns, keep='first', ignore_index=True)

In [None]:
# Reset every split to a fresh 0..n-1 index. drop=True discards the old index
# directly instead of materialising it as an 'index' column and dropping it
# again (which also fails when the old index carries a name).
train_data = train.reset_index(drop=True)
valid_data = val.reset_index(drop=True)
test_data = test.reset_index(drop=True)

In [None]:
# Wrap each split in CustomDataset for use with the DataLoaders.
train_dataset = CustomDataset(train_data)
valid_dataset = CustomDataset(valid_data)
test_dataset = CustomDataset(test_data)

In [None]:
def save_checkpoint(model, optimizer, scheduler, epoch, loss,
                    ckpt_prefix='/content/drive/MyDrive/AI09/model_v4.ckpt'):
    """Save the training state of one epoch to ``{ckpt_prefix}.{epoch}``.

    Args:
        model, optimizer, scheduler: objects exposing ``state_dict()``; their
            state dicts are stored under 'model_state_dict',
            'optimizer_state_dict' and 'scheduler_state_dict'.
        epoch: epoch identifier; stored under 'epoch' and used as the
            file-name suffix.
        loss: value stored under 'loss'.
        ckpt_prefix: path prefix of the checkpoint file. Defaults to the
            Drive location this notebook has always used, so existing
            five-argument calls behave exactly as before.
    """
    file_name = f'{ckpt_prefix}.{epoch}'

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss' : loss
        },
        file_name
    )

    print(f"Saving epoch {epoch} checkpoint at {file_name}")

In [None]:
# All three loaders collate with the same max_length of 128.
collate_128 = partial(custom_collate_fn, max_length=128)

# Training: batches of 8, drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    sampler=RandomSampler(train_dataset),
    collate_fn=collate_128,
)
# Validation and test: batches of 16, in dataset order.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=16,
    sampler=SequentialSampler(valid_dataset),
    collate_fn=collate_128,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=16,
    sampler=SequentialSampler(test_dataset),
    collate_fn=collate_128,
)

In [None]:
# Register the sweep described by sweep_config under the "AI09_v1" project and
# start an agent that calls `initializer` for a single run (count = 1).
epochs = 4
sweep_id = wandb.sweep(sweep_config, project = "AI09_v1")
wandb.agent(sweep_id, initializer, count = 1)

Create sweep with ID: zm92hszq
Sweep URL: https://wandb.ai/kdb/AI09_v1/sweeps/zm92hszq


[34m[1mwandb[0m: Agent Starting Run: nbb3o43b with config:
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	grad_norm: 1
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	max_length: 128
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	valid_batch_size: 16
[34m[1mwandb[0m: 	warm_up_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33mkdb[0m. Use [1m`wandb login --relogin`[0m to force relogin




****** Starting To Train Epoch #0 ******
Epoch: 0, Step : 10, LR : 9.987333026255182e-06, Avg Loss : 5.0947
Epoch: 0, Step : 20, LR : 9.975817595578076e-06, Avg Loss : 3.5380
Epoch: 0, Step : 30, LR : 9.964302164900968e-06, Avg Loss : 3.0817
Epoch: 0, Step : 40, LR : 9.95278673422386e-06, Avg Loss : 2.8018
Epoch: 0, Step : 50, LR : 9.941271303546754e-06, Avg Loss : 2.6660
Epoch: 0, Step : 60, LR : 9.929755872869646e-06, Avg Loss : 2.6548
Epoch: 0, Step : 70, LR : 9.91824044219254e-06, Avg Loss : 2.3422
Epoch: 0, Step : 80, LR : 9.906725011515431e-06, Avg Loss : 2.6845
Epoch: 0, Step : 90, LR : 9.895209580838325e-06, Avg Loss : 2.6897
Epoch: 0, Step : 100, LR : 9.883694150161217e-06, Avg Loss : 2.8178
Epoch: 0, Step : 110, LR : 9.872178719484109e-06, Avg Loss : 2.1376
Epoch: 0, Step : 120, LR : 9.860663288807002e-06, Avg Loss : 1.1898
Epoch: 0, Step : 130, LR : 9.849147858129894e-06, Avg Loss : 1.0888
Epoch: 0, Step : 140, LR : 9.837632427452788e-06, Avg Loss : 0.9340
Epoch: 0, Step : 1

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▃▆█
total_f1_score,▁▆▇█
total_pearsonr,▁▅▇█
total_train_loss,█▃▂▁
total_train_lr,█▆▃▁
total_valid_loss,█▅▂▁
train_loss,█▃▃▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_lr,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
valid_loss,▅▅▇▅▅▆▆▇▅█▆▄▄▄▄▄▄▃▃▅▂▁▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▂▁▁

0,1
epoch,4.0
total_f1_score,95.03931
total_pearsonr,97.96205
total_train_loss,0.1251
total_train_lr,0.0
total_valid_loss,0.08665
train_loss,0.12985
train_lr,0.0
valid_loss,0.08885


In [None]:
# Checkpoint files with suffixes .0 to .3, one per saved epoch.
ckpt1, ckpt2, ckpt3, ckpt4 = (
    f'/content/drive/MyDrive/AI09/model_v4.ckpt.{epoch}' for epoch in range(4)
)

In [None]:
def test(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print loss, F1 and Pearson r.

    Each batch must unpack into ``(batch_input, batch_label)`` where
    ``batch_input`` can be splatted into the model call. The model is moved to
    the global ``device`` and put in eval mode.

    Printed metrics:
      * total_test_loss : mean of the per-batch ``outputs.loss``
      * total_f1_score  : F1 (x100) after binarising predictions and labels
                          at >= 3
      * total_pearsonr  : Pearson correlation (x100) between raw predictions
                          and labels

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.to(device)
    model.eval()

    total_loss = 0.0
    num_batches = 0
    pred_chunks = []
    label_chunks = []

    for batch in dataloader:
        batch_input, batch_label = tuple(item.to(device) for item in batch)

        with torch.no_grad():
            outputs = model(**batch_input, labels = batch_label)

        # reshape(-1) rather than squeeze(): squeeze() turns a final batch of
        # size 1 into a 0-d array, which cannot be concatenated with the rest.
        pred_chunks.append(outputs.logits.reshape(-1).detach().cpu().numpy())
        label_chunks.append(batch_label.reshape(-1).detach().cpu().numpy())

        total_loss += outputs.loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("test(): dataloader yielded no batches")

    # Concatenate once at the end instead of growing arrays batch by batch.
    pred_np = np.concatenate(pred_chunks, axis=0)
    label_np = np.concatenate(label_chunks, axis=0)

    total_valid_loss = total_loss / num_batches

    # Binarise at the same threshold (>= 3) for the F1 score.
    fone_pred = np.where(pred_np >=3, 1, 0)
    fone_label = np.where(label_np >=3, 1, 0)

    # f1_score takes (y_true, y_pred).
    fone = f1_score(fone_label, fone_pred) * 100
    p_score = pearsonr(pred_np, label_np)[0] * 100
    print('total_test_loss : ' , total_valid_loss, "total_f1_score : " , fone, "total_pearsonr:" , p_score)

In [None]:
# Instantiate klue/roberta-base with a single-output head (num_labels = 1)
# and its matching tokenizer.
model = RobertaForSequenceClassification.from_pretrained("klue/roberta-base", num_labels = 1)
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier

In [None]:
all_checkpoints = [ckpt1, ckpt2, ckpt3, ckpt4]

# Evaluate every saved checkpoint on the test set.
for checkpoint in all_checkpoints:
    loaded_ckpt = torch.load(checkpoint, map_location=device)
    # Show which checkpoint the following metrics belong to. This used to be a
    # bare `loaded_ckpt['epoch'], loaded_ckpt['loss']` expression, which
    # displays nothing when it sits inside a loop.
    print(f"checkpoint epoch : {loaded_ckpt['epoch']} loss : {loaded_ckpt['loss']}")
    model.load_state_dict(loaded_ckpt["model_state_dict"])
    test(model, test_dataloader)

total_test_loss :  0.5456787615794046 total_f1_score :  85.94377510040161 total_pearsonr: 91.0554006124478
total_test_loss :  0.3652469205585013 total_f1_score :  84.07643312101911 total_pearsonr: 91.61072143340733
total_test_loss :  0.36333515667818367 total_f1_score :  85.77319587628865 total_pearsonr: 92.3367245640512
total_test_loss :  0.36961695699515 total_f1_score :  86.00823045267488 total_pearsonr: 92.35652059000972


# 최종모델 선정

In [None]:
# Restore the weights stored in ckpt4, the checkpoint chosen as the final model.
loaded_ckpt = torch.load(ckpt4, map_location=device)
model.load_state_dict(loaded_ckpt["model_state_dict"])

<All keys matched successfully>

In [None]:
# Print test-set loss, F1 and Pearson r for the selected checkpoint.
test(model, test_dataloader)

total_test_loss :  0.36961695699515 total_f1_score :  86.00823045267488 total_pearsonr: 92.35652059000972
