#### 패키지 설치

In [36]:
!pip install ratsnlp -q

In [37]:
from google.colab import drive
drive.mount('/gdrive', force_remount = True)
# 구글 드라이브 설정

Mounted at /gdrive


#### 모델 환경설정
- 하이퍼파라미터와 저장 위치 등 설정 정보 선언

In [38]:
import torch
from ratsnlp.nlpbook.classification import ClassificationTrainArguments

args = ClassificationTrainArguments(
    pretrained_model_name= "beomi/kcbert-base",# 프리트레인 마친 언어 모델의 이름
    downstream_corpus_name="nsmc", # 다운스트림 데이터의 이름
    downstream_model_dir="/gdrive/My Drive/nlpbook/checkpoint-doccls",# 파인튜닝된 모델의 체크포인트가 저장될 위치.
    batch_size = 32 if torch.cuda.is_available() else 4,
    learning_rate = 5e-5,
    max_seq_length = 128,
    epochs = 3,
    tpu_cores = 0 if torch.cuda.is_available() else 8,
    seed = 7
)

#### 랜덤시드 고정
- 학습 재현을 위해 랜덤 시드를 고정합니다.


In [39]:
from ratsnlp import nlpbook

nlpbook.set_seed(args)

set seed: 7


#### 로거 설정 
- 메세지 출력 등을 위한 logger를 설정합니다.

In [40]:
nlpbook.set_logger(args) 

INFO:ratsnlp:Training/evaluation parameters ClassificationTrainArguments(pretrained_model_name='beomi/kcbert-base', downstream_task_name='document-classification', downstream_corpus_name='nsmc', downstream_corpus_root_dir='/content/Korpora', downstream_model_dir='/gdrive/My Drive/nlpbook/checkpoint-doccls', max_seq_length=128, save_top_k=1, monitor='min val_loss', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=3, batch_size=4, cpu_workers=2, fp16=False, tpu_cores=8)


#### 말뭉치 내려받기

In [41]:
from Korpora import Korpora

Korpora.fetch(
    corpus_name = args.downstream_corpus_name,
    root_dir = args.downstream_corpus_root_dir,
    force_download=True,
)

[nsmc] download ratings_train.txt: 14.6MB [00:00, 63.2MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 33.2MB/s]                           


#### 토크나이저 준비하기

In [42]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case = False
)

#### 데이터 전처리하기

In [43]:
from ratsnlp.nlpbook.classification import NsmcCorpus, ClassificationDataset

corpus = NsmcCorpus()
train_dataset = ClassificationDataset(
    args = args,
    corpus = corpus,
    tokenizer = tokenizer,
    mode = "train"
)

INFO:ratsnlp:Loading features from cached file /content/Korpora/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification [took 23.259 s]


- ClassificationDataset : NsmcCorpus와 토크나이저를 품고 있습니다.
   - NsmcCorpus가 넘겨준 문장과 레이블 각각 토크나이저를 활용해 모델 학습할 수 있는 형태로 가공
- NsmcCorpus 는 csv파일 형식의 NSCM의 데이터를 문장과 레이블로 읽어 들입니다.

- Classification Features 자료형
   - input_ids : 인덱스로 변환된 토큰 시퀀스
   - attention_mask : 해당토큰이 패딩토큰(0)인지 아닌지(1) 나타냄
   - token_type_ids : 세그먼트 정보
   - label : 정수로 바뀐 레이블 정보

#### 학습데이터 로더

In [44]:
from torch.utils.data import DataLoader, RandomSampler

train_dataloader = DataLoader(
    train_dataset,
    batch_size = args.batch_size,
    sampler = RandomSampler(train_dataset, replacement = False),
    collate_fn = nlpbook.data_collator,
    drop_last = False,
    num_workers = args.cpu_workers
)

- sampler : 샘플링 방식을 정의합니다.
- collate_fn : 배치 사이즈 만큼 비복원 랜덤 추출한 인스턴스들을 배치로 만드는 역할


In [45]:
train_dataset[0]

ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#### 평가용 데이터 로더 구축

In [46]:
from torch.utils.data import SequentialSampler

val_dataset = ClassificationDataset(
    args = args,
    corpus = corpus,
    tokenizer = tokenizer,
    mode = "test"
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    sampler=SequentialSampler(val_dataset),
    collate_fn=nlpbook.data_collator,
    drop_last=False,
    num_workers=args.cpu_workers,
)


INFO:ratsnlp:Loading features from cached file /content/Korpora/nsmc/cached_test_BertTokenizer_128_nsmc_document-classification [took 8.239 s]


#### 모델 초기화

In [47]:
from transformers import BertConfig, BertForSequenceClassification

pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels = corpus.num_labels
)

model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config = pretrained_model_config
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

#### task 정의


In [48]:
from ratsnlp.nlpbook.classification import ClassificationTask
task = ClassificationTask(model, args)

- 옵티마이저 : 아담 Adam
- 러닝 레이트 스케줄러 : ExponentialLR
   - 현재 에포크의 러닝 레이트 : 이전 에포크의 러닝 레이트 X gamma(0.9)

#### 학습

In [49]:
trainer = nlpbook.get_trainer(args)

MisconfigurationException: ignored

In [None]:
trainer.fit(
    task,
    train_dataloaders = train_dataloader,
    val_dataloaders = val_dataloader)

## 실전 투입


#### 각종 설정
- 모델 하이퍼파라메터(hyperparameter)와 저장 위치 등 설정 정보를 선언합니다.

In [None]:
from ratsnlp.nlpbook.classification import ClassificationDeployArguments

args = ClassificationDeployArguments(
    pretrained_model_name = "beomi/kcbert-base",
    downstream_model_dir="/gdrive/MyDrive/nlpbook/checkpoint-doccls",
    max_seq_length = 128
)

#### 토크나이저 로드

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name,
    do_lower_case = False
)

#### 체크포인트 로드

In [None]:
import torch

fine_tuned_model_ckpt = torch.load(
    args.downstream_model_checkpoint_fpath,
    map_location = torch.device("cpu")
)

pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=fine_tuned_model_ckpt['state_dict']['model.classifier.bias'].shape.numel(),

)
#Bert 설정 로드

from transformers import BertConfig, BertForSequenceClassification
model = BertForSequenceClassification(pretrained_model_config)
#Bert 모델 초기화

model.load_state_dict({k.replace("model.", ""): v for k, v in fine_tuned_model_ckpt['state_dict'].items()})
#체크포인트 주입하기

model.eval()
#평가모드로 전환

#### 인퍼런스 함수 선언

In [None]:
def inference_fn(sentence):
    inputs = tokenizer(
        [sentence],
        max_length=args.max_seq_length,
        padding="max_length",
        truncation=True,
    )
    with torch.no_grad():
        outputs = model(**{k: torch.tensor(v) for k, v in inputs.items()})
        prob = outputs.logits.softmax(dim=1)
        positive_prob = round(prob[0][1].item(), 4)
        negative_prob = round(prob[0][0].item(), 4)
        pred = "긍정 (positive)" if torch.argmax(prob) == 1 else "부정 (negative)"
    return {
        'sentence': sentence,
        'prediction': pred,
        'positive_data': f"긍정 {positive_prob}",
        'negative_data': f"부정 {negative_prob}",
        'positive_width': f"{positive_prob * 100}%",
        'negative_width': f"{negative_prob * 100}%",
    }

#### 웹 서비스 시작하기

In [None]:
from ratsnlp.nlpbook.classification import get_web_service_app
app = get_web_service_app(inference_fn) # inference_fn : 플라스크라는 파이썬 라이브러리의 도움을 받아 웹 서비스 o
app.run()