# **Settings**

In [29]:
import os
import warnings
warnings.filterwarnings('ignore')                       # warning 출력 false

import numpy as np 
import pandas as pd

# **3. Bert 모델 가져오기**

## **1) Settings**

In [31]:
import torch 
from sklearn.preprocessing import LabelEncoder
from ratsnlp.nlpbook.classification import ClassificationExample

# data setting
class CustomCorpus:
    def __init__(self):
        pass
                
    def get_examples(self,data_root_path,mode):
        self.data = pd.read_csv(f'./data2/{mode}.csv')

        examples = []
        for temp in range(self.data.shape[0]):
            text_a = self.data.iloc[temp]['text']
            text_b = None
            label = self.data.iloc[temp]['label']
            examples.append(ClassificationExample(text_a=text_a,text_b=text_b,label=label))

        return examples

    def get_labels(self):
        return [0, 1, 2, 3, 4, 5, 6]
    
    def labels_info(self):
        return self.encoder.classes_

    @property
    def num_labels(self):
        return len(self.get_labels())


In [32]:
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

# model setting
model_path = './kcbert2'
if not os.path.isdir(model_path):
    os.mkdir(model_path)

# arguments
args = ClassificationTrainArguments(
    pretrained_model_name = 'beomi/kcbert-base',
    downstream_corpus_name = 'data2',
    downstream_corpus_root_dir = './',
    downstream_model_dir = model_path,
    batch_size=32 if torch.cuda.is_available() else 4,
    learning_rate = 5e-5,
    max_seq_length = 128,
    seed = 100,
    epochs = 5
)

from ratsnlp import nlpbook
nlpbook.set_seed(args)

set seed: 100


## **2) 토크나이저 불러오기**

In [33]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case = False
)

## **3) 데이터셋 만들기**

In [34]:
# 학습 데이터셋 불러오기
from ratsnlp.nlpbook.classification import NsmcCorpus, ClassificationDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

corpus = CustomCorpus()
train_dataset = ClassificationDataset(
    args = args,
    corpus = corpus,
    tokenizer = tokenizer,
    mode = 'train'
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    sampler=RandomSampler(train_dataset,replacement=False),
    collate_fn = nlpbook.data_collator,
    drop_last = False
)

val_dataset = ClassificationDataset(
    args = args,
    corpus = corpus,
    tokenizer = tokenizer,
    mode = 'val'
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    sampler=SequentialSampler(val_dataset),
    collate_fn = nlpbook.data_collator,
    drop_last = False
)

In [6]:
train_dataset[0]

ClassificationFeatures(input_ids=[2, 25802, 9319, 8099, 347, 9472, 7997, 11247, 26349, 11888, 17, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [7]:
val_dataset[0]

ClassificationFeatures(input_ids=[2, 25802, 9319, 8099, 347, 9472, 7997, 11247, 26349, 11888, 17, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# **4. 모델 학습**

## **1) 모델 생성 및 초기화**

In [35]:
from transformers import BertConfig, BertForSequenceClassification

pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels = corpus.num_labels
)

model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config = pretrained_model_config
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

## **2) TASK 설정**

In [36]:
from ratsnlp.nlpbook.classification import ClassificationTask
task = ClassificationTask(model, args)

## **3) 학습**

In [37]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [38]:
import matplotlib
%matplotlib inline

! pip install tensorboardX



In [41]:
# 인스턴스 생성
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

In [None]:
trainer = nlpbook.get_trainer(args)
trainer.fit(
    task,
    train_dataloaders = train_dataloader,
    val_dataloaders = val_dataloader
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.696   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# 쓰기 종료
writer.close()

In [None]:
torch.save(model, './model5.pth')

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir='C:/Users/user/metaverse/project7/runs' --host=0.0.0.0 --port 6010

In [60]:
val_data = pd.read_csv('./data2/val_data.csv')

with torch.no_grad():
    total_corr = 0
    cnt = 0
    print('Start!', end=' ')
    for inputs in val_dataloader:
        labels = inputs.pop('labels',None)
        
        outputs = model(**{k: torch.tensor(v) for k,v in inputs.items()})
        prob = outputs.logits.softmax(dim=1)
        _, preds = torch.max(prob, 1)
        
        corr = preds.eq(labels).sum().item()
        total_corr += corr
        
        cnt += 1
        if cnt % 100 == 0:
            print(f'> batch {cnt}', end=' ')
    print('>>> End!')

print(f'Accuracy = {total_corr / val_data.shape[0]}')

Start! > batch 100 > batch 200 > batch 300 > batch 400 > batch 500 > batch 600 > batch 700 > batch 800 > batch 900 > batch 1000 > batch 1100 > batch 1200 > batch 1300 > batch 1400 > batch 1500 > batch 1600 > batch 1700 >>> End!
Accuracy = 0.9928708576304935


In [59]:
print(f'Accuracy = {total_corr / val_data.shape[0]}') # good

Accuracy = 0.9934753235670601


In [61]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda
