## 학습 마친 모델 실전 투입

- 구글 드라이브 연동

In [1]:
from google.colab import drive

# Mount Google Drive at /gdrive so the fine-tuned checkpoint stored there can
# be read; force_remount=True requests a fresh mount even if one already exists.
drive.mount(mountpoint='/gdrive', force_remount=True)

Mounted at /gdrive


- 의존성 패키지 설치

In [2]:
!pip install ratsnlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ratsnlp
  Downloading ratsnlp-1.0.52-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flask-cors>=3.0.10
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting pytorch-lightning==1.6.1
  Downloading pytorch_lightning-1.6.1-py3-none-any.whl (582 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m582.5/582.5 KB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting transformers==4.10.0
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Korpora>=0.2.0
  Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 KB[0m [31m7.8 MB/s

### 인퍼런스 설정

In [3]:
from ratsnlp.nlpbook.qa import QADeployArguments

# Deployment settings for the question-answering model.
args = QADeployArguments(
    # Which model to serve: the pretrained backbone (its name is used later to
    # fetch the matching config and tokenizer) and the Drive folder holding the
    # fine-tuned checkpoint (the recorded output shows the resolved .ckpt path).
    pretrained_model_name="beomi/kcbert-base",
    downstream_model_dir="/gdrive/MyDrive/nlpbook/checkpoint-qa1",
    # Token budgets: the question alone, and question + context together.
    max_query_length=32,
    max_seq_length=128,
)

downstream_model_checkpoint_fpath: /gdrive/MyDrive/nlpbook/checkpoint-qa1/epoch=0-val_loss=0.46.ckpt


### 모델 로딩

In [5]:
import torch
from transformers import BertConfig, BertForQuestionAnswering

# Load the fine-tuned checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
fine_tuned_model_ckpt = torch.load(
    args.downstream_model_checkpoint_fpath,
    map_location=torch.device("cpu"),
)

# Fetch the BERT configuration that belongs to the pretrained model.
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
)

# Build the QA model from the configuration only; the fine-tuned weights are
# injected in the next step.
model = BertForQuestionAnswering(pretrained_model_config)

# The checkpoint keys are expected to start with "model." (presumably because
# the training wrapper kept the network in a `model` attribute -- confirm
# against the training code). Strip ONLY that leading prefix: a plain
# str.replace would also delete any later occurrence of "model." in a key.
ckpt_key_prefix = "model."
model.load_state_dict({
    (k[len(ckpt_key_prefix):] if k.startswith(ckpt_key_prefix) else k): v
    for k, v in fine_tuned_model_ckpt['state_dict'].items()
})

# Switch to evaluation mode for inference. As the last expression of the cell
# this also displays the model structure.
model.eval()

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [6]:
from transformers import BertTokenizer

# Tokenizer for the same pretrained model as the network above; input text is
# not lower-cased (do_lower_case=False).
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name, do_lower_case=False)

Downloading:   0%|          | 0.00/250k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

### 인퍼런스 함수 선언

- 질문(question)과 지문(context)에 각각 토큰화, 인덱싱을 수행한 뒤 input_ids, attention_mask, token_type_ids를 생성
- 이들 입력값을 파이토치 텐서(tensor) 자료형으로 변환한 뒤 모델에 입력

- 모델 출력 값(outputs.logits)은 소프트맥스 함수 적용 이전의 로짓(logit) 형태
  * 소프트맥스는 단조 증가 함수라 로짓에 적용해도 최댓값의 위치(argmax)는 바뀌지 않으므로 적용 생략

- 마지막으로 모델 출력을 약간 후처리하여 정답 시작 로짓(start_logits)의 최댓값 위치부터 정답 끝 로짓(end_logits)의 최댓값 위치까지의 토큰들을 이어붙여 pred_text를 만듦


In [7]:
def inference_fn(question, context):
    """Extract the answer to ``question`` from ``context`` with the QA model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``args``.

    Args:
        question: Question text.
        context: Passage in which the answer is searched.

    Returns:
        dict with the keys ``'question'`` and ``'context'`` (the inputs,
        unchanged) and ``'answer'`` (the decoded answer span, or ``""`` when
        either input is empty/falsy).
    """
    # Nothing to predict unless both a question and a context are supplied.
    if not (question and context):
        return {
            'question': question,
            'context': context,
            'answer': "",
        }

    # Tokenize and index the question on its own, cutting it down to
    # max_query_length if it is longer.
    query_ids = tokenizer.encode(
        question,
        add_special_tokens=False,
        truncation=True,
        max_length=args.max_query_length,
    )

    # Encode the prepared question together with the context. The pair is
    # padded to max_seq_length, and when it is too long only the context is
    # truncated (truncation="only_second").
    features = tokenizer.encode_plus(
        text=query_ids,
        text_pair=context,
        truncation="only_second",
        padding="max_length",
        max_length=args.max_seq_length,
        return_token_type_ids=True,
    )

    # Wrap every feature in a list to form a batch of one, then tensorize.
    model_inputs = {name: torch.tensor([values]) for name, values in features.items()}

    with torch.no_grad():
        outputs = model(**model_inputs)
        # Token positions with the highest start / end logits.
        start_idx = outputs.start_logits.argmax(dim=-1).item()
        end_idx = outputs.end_logits.argmax(dim=-1).item()

    # Join the tokens from the predicted start through the predicted end
    # (inclusive) into the answer text.
    answer_ids = features['input_ids'][start_idx:end_idx + 1]
    pred_text = tokenizer.decode(answer_ids)

    return {
        'question': question,
        'context': context,
        'answer': pred_text,
    }

In [None]:
# Manual check of inference_fn. Both inputs are empty placeholders here;
# replace them with a real question and passage to get a predicted answer.
question, context = "", ''
inference_fn(question, context)

### 웹서비스

In [8]:
import os
from getpass import getpass
from pathlib import Path

# Write the ngrok configuration without embedding the credential in the
# notebook: take the token from the NGROK_AUTHTOKEN environment variable, or
# prompt for it (the typed value is not echoed).
# NOTE(review): any token that was ever hard-coded in a shared copy of this
# notebook should be treated as leaked and rotated in the ngrok dashboard.
ngrok_config_dir = Path("/root/.ngrok2")
# exist_ok=True keeps the cell re-runnable; a bare `mkdir` fails on the second
# run and the config file would then never be rewritten.
ngrok_config_dir.mkdir(parents=True, exist_ok=True)
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
(ngrok_config_dir / "ngrok.yml").write_text(f"authtoken: {ngrok_authtoken}\n")

In [9]:
!ls /root -al

total 64
drwx------ 1 root root 4096 Jan 12 03:23 .
drwxr-xr-x 1 root root 4096 Jan 12 03:21 ..
-r-xr-xr-x 1 root root 1169 Jan  1  2000 .bashrc
drwxr-xr-x 1 root root 4096 Jan 12 03:23 .cache
drwx------ 1 root root 4096 Jan 12 03:21 .config
drwxr-xr-x 5 root root 4096 Jan 10 14:45 .ipython
drwx------ 2 root root 4096 Jan 10 14:45 .jupyter
drwxr-xr-x 2 root root 4096 Jan 10 14:43 .keras
drwxr-xr-x 1 root root 4096 Jan 10 14:45 .local
drwxr-xr-x 2 root root 4096 Jan 12 03:23 .ngrok2
drwxr-xr-x 4 root root 4096 Jan 10 14:45 .npm
-rw-r--r-- 1 root root  148 Aug 17  2015 .profile
-r-xr-xr-x 1 root root  254 Jan  1  2000 .tmux.conf
-rw-r--r-- 1 root root  165 Jan 10 14:45 .wget-hsts


In [10]:
# Wrap inference_fn in the web application provided by ratsnlp.
from ratsnlp.nlpbook.qa import get_web_service_app
app = get_web_service_app(inference_fn)
# Start serving. Per the recorded output this runs a Flask server on
# http://127.0.0.1:5000/ and also reports a public ngrok URL; the cell keeps
# running until it is interrupted.
app.run()

 * Serving Flask app "ratsnlp.nlpbook.qa.deploy" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://06b4-34-125-129-101.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [12/Jan/2023 03:23:37] "[37mGET / HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Jan/2023 03:23:38] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
