In [1]:
# !touch bert_dataset.py
# !touch bert_trainer.py
# !touch trainer.py
# !touch finetune_plm_native.py
# !touch classify_plm.py

## bert_dataset

In [2]:
import torch
from torch.utils.data import Dataset

class TextClassificationCollator():
    """Turn a list of raw ``{'text', 'label'}`` samples into one tokenized mini-batch.

    Intended to be used as the ``collate_fn`` of a ``DataLoader``: the loader
    gathers the per-sample dictionaries returned by the dataset and hands them
    to ``__call__`` as a list.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, return_tensors="pt", max_length=...)``; its result
            must expose ``'input_ids'`` and ``'attention_mask'``.
        max_length: Forwarded to the tokenizer as the truncation limit.
        with_text: When true, the raw texts are also included in the batch.
    """

    def __init__(self, tokenizer, max_length, with_text=True):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.with_text = with_text

    def __call__(self, samples):
        """Tokenize ``samples`` and return the mini-batch dictionary.

        Args:
            samples: List of dictionaries, each with a ``'text'`` and a
                ``'label'`` entry.

        Returns:
            Dictionary with ``'input_ids'``, ``'attention_mask'``, ``'labels'``
            (a ``torch.long`` tensor) and, if ``with_text`` is set, ``'text'``.
        """
        texts, labels = [], []
        for sample in samples:
            texts.append(sample['text'])
            labels.append(sample['label'])

        # Padding is decided here, per mini-batch (padding=True), rather than
        # per sample in the dataset; max_length is only passed for truncation.
        encoding = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )

        batch = {
            'input_ids': encoding['input_ids'],
            # Lets the model ignore the padded positions.
            'attention_mask': encoding['attention_mask'],
            # Python list of labels -> torch.long tensor.
            'labels': torch.tensor(labels, dtype=torch.long),
        }
        if self.with_text:
            batch['text'] = texts

        return batch


class TextClassificationDataset(Dataset):
    """Dataset pairing every text of a corpus with its label.

    Samples are returned untokenized; texts differ in length, so padding is
    left to the collate function, which sees the whole mini-batch.

    Args:
        texts: Indexable collection holding the whole corpus.
        labels: Indexable collection holding one label per text.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return sample ``item`` as ``{'text': str, 'label': label}``.

        The text is passed through ``str()``; the label is returned as stored.
        """
        return dict(
            text=str(self.texts[item]),
            label=self.labels[item],
        )

## Typical Model Training Procedure
- **Epoch 시작**
  - Training
     - **Iteration 시작** : 미니배치마다 돈다.
     - Feed-forward : 미니배치를 모델에 통과
     - Loss 계산 : 모델을 통과시켜 예측값 $\hat{y}$을 얻는다. 그 결과에서 정답 y의 확률값(likelihood)을 구하여 loss를 계산
     - Back-propagation : loss를 파라미터에 대해 미분하는 역전파를 수행 / 각 layer의 weight parameter의 .grad에 gradient가 채워짐
     - Gradient Descent 수행 : 그것을 활용해서 optimizer에서 optimizer.step식으로 함수를 호출하게 되면 optimizer가 자기가 담당하고 있는 파라미터들을 gradient를 보고 gradient Descent를 1step 수행해주게 된다.
     - 현재 상태 출력 : 현재 loss, gradient 크기, accuracy등을 출력
     - **Iteration 종료** : 한 iteration을 종료하고 다음 iteration을 한다. 모든 미니배치가 끝나면 1epoch가 끝나게 된다.
  - Validation
    - **Iteration 시작** : 1epoch가 끝나면 시작
    - Feed-forward : 모델에 통과
    - Loss 계산 : loss만 계산하며, 역전파와 gradient descent는 수행할 필요가 없음
    - 현재 상태 출력 : 
    - **Iteration 종료**
  - 현재 epoch에서 validation loss가 이전보다 낮아졌는지 오버피팅을 체크
  - 모델이 저장되어야 함
- **Epoch 종료**
- 사용자가 지정한 epoch만큼 반복

## Trainer

In [4]:
!pip install ignite
!pip install pytorch-ignite


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ignite
  Downloading ignite-1.1.0-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: ignite
Successfully installed ignite-1.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-ignite
  Downloading pytorch_ignite-0.4.11-py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.5/266.5 KB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-ignite
Successfully installed pytorch-ignite-0.4.11


In [5]:
from copy import deepcopy

import numpy as np

import torch

from ignite.engine import Engine
from ignite.engine import Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar

import sys
sys.path.append('/content/drive/MyDrive/인공지능/텍스트분류')
from simple_ntc.utils import get_grad_norm, get_parameter_norm

VERBOSE_SILENT = 0
VERBOSE_EPOCH_WISE = 1
VERBOSE_BATCH_WISE = 2


class MyEngine(Engine):
    """Ignite ``Engine`` that also carries the model, loss, optimizer and config.

    The step functions (``train`` / ``validate``) are static methods that
    receive the engine as their first argument, so everything they need is
    stored on the engine instance.

    Attributes set in ``__init__``:
        model, crit, optimizer, config: The objects passed in.
        best_loss: Lowest loss recorded by ``check_best`` (starts at ``np.inf``).
        best_model: Copy of the model ``state_dict`` at ``best_loss``
            (``None`` until ``check_best`` records one).
        device: Device of the model's first parameter.
    """

    def __init__(self, func, model, crit, optimizer, config):
        # Ignite's Engine does not hold these objects itself, so keep them as
        # attributes to make them reachable from the step functions.
        self.model = model
        self.crit = crit
        self.optimizer = optimizer
        self.config = config

        super().__init__(func) # Ignite Engine only needs the function to run.

        self.best_loss = np.inf
        self.best_model = None

        self.device = next(model.parameters()).device

    @staticmethod
    def train(engine, mini_batch):
        """Run one training iteration on ``mini_batch``.

        Args:
            engine: The ``MyEngine`` running this step.
            mini_batch: Object exposing ``.text`` and ``.label`` tensors.

        Returns:
            Dict of floats: ``'loss'``, ``'accuracy'``, ``'|param|'``,
            ``'|g_param|'``.
        """
        # Gradients must be reset before every gradient-descent step.
        engine.model.train()
        engine.optimizer.zero_grad()

        x, y = mini_batch.text, mini_batch.label
        x, y = x.to(engine.device), y.to(engine.device)

        # Keep at most config.max_length positions along dim 1.
        x = x[:, :engine.config.max_length]

        # Take feed-forward
        y_hat = engine.model(x)

        loss = engine.crit(y_hat, y)
        loss.backward()

        # Accuracy is only meaningful when 'y' holds integer class indices.
        # Checking the dtype (instead of the legacy torch.LongTensor /
        # torch.cuda.LongTensor classes) keeps this working on any device.
        if y.dtype == torch.long:
            accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
        else:
            accuracy = 0

        # Norms are measured after backward() and before the optimizer step.
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        # Take a step of gradient descent.
        engine.optimizer.step()

        return {
            'loss': float(loss),
            'accuracy': float(accuracy),
            '|param|': p_norm,
            '|g_param|': g_norm,
        }

    @staticmethod
    def validate(engine, mini_batch):
        """Run one evaluation iteration on ``mini_batch`` without gradients.

        Args:
            engine: The ``MyEngine`` running this step.
            mini_batch: Object exposing ``.text`` and ``.label`` tensors.

        Returns:
            Dict of floats: ``'loss'`` and ``'accuracy'``.
        """
        engine.model.eval()

        with torch.no_grad():
            x, y = mini_batch.text, mini_batch.label
            x, y = x.to(engine.device), y.to(engine.device)

            x = x[:, :engine.config.max_length]

            y_hat = engine.model(x)

            loss = engine.crit(y_hat, y)

            # Same device-independent integer-label check as in train().
            if y.dtype == torch.long:
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
            else:
                accuracy = 0

        return {
            'loss': float(loss),
            'accuracy': float(accuracy),
        }

    @staticmethod
    def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
        """Attach running-average metrics, progress bars and epoch logs.

        Args:
            train_engine: Engine whose step returns the training metrics.
            validation_engine: Engine whose step returns ``'loss'`` and
                ``'accuracy'``.
            verbose: ``VERBOSE_BATCH_WISE`` or higher adds progress bars;
                ``VERBOSE_EPOCH_WISE`` or higher adds per-epoch prints.
        """
        # Attaching is repeated for several metrics, so wrap it in a helper.
        def attach_running_average(engine, metric_name):
            # Registers a RunningAverage on `engine` under `metric_name`,
            # reading that key from the step function's output dict.
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )

        training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        # If the verbosity is set, progress bar would be shown for mini-batch iterations.
        # Without ignite, you can use tqdm to implement progress bar.
        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(train_engine, training_metric_names)

        # If the verbosity is set, statistics would be shown after each epoch:
        # the handler below is registered on the train engine's EPOCH_COMPLETED
        # event and prints the metric values held at that point.
        if verbose >= VERBOSE_EPOCH_WISE:
            @train_engine.on(Events.EPOCH_COMPLETED)
            def print_train_logs(engine):
                print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'.format(
                    engine.state.epoch,
                    engine.state.metrics['|param|'],
                    engine.state.metrics['|g_param|'],
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                ))

        # Do the same things for the validation engine.
        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)

        if verbose >= VERBOSE_BATCH_WISE:
            pbar = ProgressBar(bar_format=None, ncols=120)
            pbar.attach(validation_engine, validation_metric_names)

        if verbose >= VERBOSE_EPOCH_WISE:
            # NOTE(review): best_loss printed here is whatever the engine holds
            # when this handler runs; if check_best is registered after attach()
            # it presumably reflects the previous epochs only - confirm.
            @validation_engine.on(Events.EPOCH_COMPLETED)
            def print_valid_logs(engine):
                print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format(
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                    engine.best_loss,
                ))

    @staticmethod
    def check_best(engine):
        """Record the current weights if the epoch's loss is the lowest so far.

        Reads ``engine.state.metrics['loss']``; on a new best (ties included)
        updates ``engine.best_loss`` and stores a deep copy of the model's
        ``state_dict`` in ``engine.best_model``.
        """
        loss = float(engine.state.metrics['loss'])
        if loss <= engine.best_loss: # If current epoch returns lower validation loss,
            engine.best_loss = loss  # Update lowest validation loss.
            engine.best_model = deepcopy(engine.model.state_dict()) # Update best model weights.

    @staticmethod
    def save_model(engine, train_engine, config, **kwargs):
        """Save ``engine.best_model`` together with ``config`` to ``config.model_fn``.

        Args:
            engine: Engine holding ``best_model``.
            train_engine: Not used by this method.
            config: Stored under ``'config'``; its ``model_fn`` is the output path.
            **kwargs: Extra entries merged into the saved dictionary.
        """
        torch.save(
            {
                'model': engine.best_model,
                'config': config,
                **kwargs
            }, config.model_fn
        )


class Trainer():
    """Wires a training and a validation ``MyEngine`` together and runs them.

    Args:
        config: Object providing at least ``verbose`` and ``n_epochs`` here; it
            is also handed to the engines.
    """

    def __init__(self, config):
        self.config = config

    def train(
        self,
        model, crit, optimizer,
        train_loader, valid_loader,
    ):
        """Train ``model`` for ``config.n_epochs`` epochs, validating after each.

        Args:
            model: Model to train (updated in place).
            crit: Loss callable used as ``crit(y_hat, y)`` by the engines.
            optimizer: Optimizer stepped by the training engine.
            train_loader: Iterable of training mini-batches.
            valid_loader: Iterable of validation mini-batches.

        Returns:
            ``model`` with the best recorded validation weights loaded. If no
            best weights were recorded, the model is returned as it is after
            training and a warning is emitted.
        """
        import warnings

        train_engine = MyEngine(
            MyEngine.train,
            model, crit, optimizer, self.config
        )
        validation_engine = MyEngine(
            MyEngine.validate,
            model, crit, optimizer, self.config
        )

        # Register metrics / progress bars / epoch logs on both engines.
        MyEngine.attach(
            train_engine,
            validation_engine,
            verbose=self.config.verbose
        )

        # Handler: one validation pass, triggered by the train engine.
        def run_validation(engine, validation_engine, valid_loader):
            validation_engine.run(valid_loader, max_epochs=1)

        train_engine.add_event_handler(
            Events.EPOCH_COMPLETED, # event
            run_validation, # function
            validation_engine, valid_loader, # arguments
        )
        # After each validation pass, record the weights if the loss improved.
        validation_engine.add_event_handler(
            Events.EPOCH_COMPLETED, # event
            MyEngine.check_best, # function
        )

        train_engine.run(
            train_loader,
            max_epochs=self.config.n_epochs,
        )

        # best_model stays None when check_best never recorded anything (no
        # epoch ran, or the validation loss was NaN every time, so the
        # `loss <= best_loss` comparison never held). load_state_dict(None)
        # would crash and discard the trained model, so skip it in that case.
        if validation_engine.best_model is not None:
            model.load_state_dict(validation_engine.best_model)
        else:
            warnings.warn(
                'No best model was recorded during validation; '
                'returning the model with its current weights.'
            )

        return model

## bert_trainer

In [6]:
import torch
import torch.nn.utils as torch_utils

from ignite.engine import Events

import sys
sys.path.append('/content/drive/MyDrive/인공지능/텍스트분류')
from simple_ntc.utils import get_grad_norm, get_parameter_norm

VERBOSE_SILENT = 0
VERBOSE_EPOCH_WISE = 1
VERBOSE_BATCH_WISE = 2

from simple_ntc.trainer import Trainer, MyEngine


class EngineForBert(MyEngine):
    """``MyEngine`` variant for dictionary mini-batches with an attention mask.

    Mini-batches are dictionaries with ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``. The model is called as ``model(x, attention_mask=mask)``
    and its ``.logits`` attribute is used as the prediction. A learning-rate
    scheduler is stepped after every optimizer step.
    """

    def __init__(self, func, model, crit, optimizer, scheduler, config):
        # Must be set before super().__init__ only by convention of the parent
        # class, which stores its own objects the same way.
        self.scheduler = scheduler

        super().__init__(func, model, crit, optimizer, config)

    @staticmethod
    def train(engine, mini_batch):
        """Run one training iteration on ``mini_batch``.

        Args:
            engine: The ``EngineForBert`` running this step.
            mini_batch: Dict with ``'input_ids'``, ``'attention_mask'`` and
                ``'labels'`` tensors.

        Returns:
            Dict of floats: ``'loss'``, ``'accuracy'``, ``'|param|'``,
            ``'|g_param|'``.
        """
        # Gradients must be reset before every gradient-descent step.
        engine.model.train()
        engine.optimizer.zero_grad()

        x, y = mini_batch['input_ids'], mini_batch['labels']
        x, y = x.to(engine.device), y.to(engine.device) # move to the model's device
        mask = mini_batch['attention_mask']
        mask = mask.to(engine.device)

        # Truncate along dim 1. The mask must be cut exactly like the inputs,
        # otherwise the two shapes disagree whenever a batch is longer than
        # config.max_length.
        x = x[:, :engine.config.max_length]
        mask = mask[:, :engine.config.max_length]

        # Take feed-forward; .logits are the scores before softmax.
        y_hat = engine.model(x, attention_mask=mask).logits
        # y_hat: (n, |c|) per the original author's note - TODO confirm

        loss = engine.crit(y_hat, y)
        loss.backward()

        # Accuracy is only meaningful when 'y' holds integer class indices.
        # Checking the dtype (instead of the legacy torch.LongTensor /
        # torch.cuda.LongTensor classes) keeps this working on any device.
        if y.dtype == torch.long:
            accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
        else:
            accuracy = 0

        # Norms are measured after backward() and before the optimizer step.
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        # Take a step of gradient descent, then advance the LR schedule.
        engine.optimizer.step()
        engine.scheduler.step()

        return {
            'loss': float(loss),
            'accuracy': float(accuracy),
            '|param|': p_norm,
            '|g_param|': g_norm,
        }

    @staticmethod
    def validate(engine, mini_batch):
        """Run one evaluation iteration on ``mini_batch`` without gradients.

        Args:
            engine: The ``EngineForBert`` running this step.
            mini_batch: Dict with ``'input_ids'``, ``'attention_mask'`` and
                ``'labels'`` tensors.

        Returns:
            Dict of floats: ``'loss'`` and ``'accuracy'``.
        """
        engine.model.eval()

        with torch.no_grad(): # no gradients are needed for evaluation
            x, y = mini_batch['input_ids'], mini_batch['labels']
            x, y = x.to(engine.device), y.to(engine.device)
            mask = mini_batch['attention_mask']
            mask = mask.to(engine.device)

            # Keep inputs and mask the same length (see train()).
            x = x[:, :engine.config.max_length]
            mask = mask[:, :engine.config.max_length]

            # Take feed-forward
            y_hat = engine.model(x, attention_mask=mask).logits

            loss = engine.crit(y_hat, y)

            if y.dtype == torch.long:
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
            else:
                accuracy = 0

        return {
            'loss': float(loss),
            'accuracy': float(accuracy),
        }


class BertTrainer(Trainer):
    """Trainer that runs ``EngineForBert`` engines with an LR scheduler.

    Args:
        config: Hyper-parameter object; ``verbose`` and ``n_epochs`` are read
            here, and it is also handed to the engines.
    """

    def __init__(self, config):
        self.config = config

    def train(
        self,
        model, crit, optimizer, scheduler,
        train_loader, valid_loader,
    ):
        """Train ``model`` for ``config.n_epochs`` epochs, validating after each.

        Args:
            model: Model to train (updated in place).
            crit: Loss callable used as ``crit(y_hat, y)`` by the engines.
            optimizer: Optimizer stepped by the training engine.
            scheduler: LR scheduler stepped by the training engine.
            train_loader: Iterable of training mini-batches.
            valid_loader: Iterable of validation mini-batches.

        Returns:
            ``model`` with the best recorded validation weights loaded. If no
            best weights were recorded, the model is returned as it is after
            training and a warning is emitted.
        """
        import warnings

        train_engine = EngineForBert(
            EngineForBert.train,
            model, crit, optimizer, scheduler, self.config
        )
        validation_engine = EngineForBert(
            EngineForBert.validate,
            model, crit, optimizer, scheduler, self.config
        )

        # Register status reporting (metrics, progress bars, epoch logs) for
        # both engines; attach() is inherited by EngineForBert.
        EngineForBert.attach(
            train_engine,
            validation_engine,
            verbose=self.config.verbose
        )

        # Handler that runs one validation pass; registered on the train engine
        # so validation follows every training epoch.
        def run_validation(engine, validation_engine, valid_loader):
            validation_engine.run(valid_loader, max_epochs=1)

        train_engine.add_event_handler(
            Events.EPOCH_COMPLETED, # event
            run_validation, # function
            validation_engine, valid_loader, # arguments
        )
        # After each validation pass, check for a new best loss and keep a
        # copy of the weights if there is one.
        validation_engine.add_event_handler(
            Events.EPOCH_COMPLETED, # event
            EngineForBert.check_best, # function
        )

        # Run training over train_loader for the configured number of epochs.
        train_engine.run(
            train_loader,
            max_epochs=self.config.n_epochs,
        )

        # best_model stays None when check_best never recorded anything (no
        # epoch ran, or the validation loss was NaN every time).
        # load_state_dict(None) would crash and discard the trained model, so
        # skip it in that case.
        if validation_engine.best_model is not None:
            model.load_state_dict(validation_engine.best_model)
        else:
            warnings.warn(
                'No best model was recorded during validation; '
                'returning the model with its current weights.'
            )

        return model

## finetune_plm_native.py

In [7]:
# transformers: tokenizer / sequence-classification models and the warmup
# scheduler imported by the fine-tuning script listed below.
# torch_optimizer: imported there as `custom_optim` for the RAdam option.
!pip install transformers
!pip install torch_optimizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m100.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2
Looking in indexes: https://pypi.org/simple, https://u

In [8]:
# import argparse
# import random

# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader

# from transformers import BertTokenizerFast
# from transformers import BertForSequenceClassification, AlbertForSequenceClassification
# from transformers import AdamW
# from transformers import get_linear_schedule_with_warmup

# import torch_optimizer as custom_optim

# import sys 
# sys.path.append('/content/drive/MyDrive/인공지능/텍스트분류')
# from simple_ntc.bert_trainer import BertTrainer as Trainer
# from simple_ntc.bert_dataset import TextClassificationDataset, TextClassificationCollator
# from simple_ntc.utils import read_text

# def define_argparser():
#     p = argparse.ArgumentParser()

#     p.add_argument('--model_fn', required=True)
#     p.add_argument('--train_fn', required=True)
#     # Recommended model list:
#     # - kykim/bert-kor-base
#     # - kykim/albert-kor-base
#     # - beomi/kcbert-base
#     # - beomi/kcbert-large
#     p.add_argument('--pretrained_model_name', type=str, default='beomi/kcbert-base')
#     p.add_argument('--use_albert', action='store_true')
    
#     p.add_argument('--gpu_id', type=int, default=-1)
#     p.add_argument('--verbose', type=int, default=2)

#     p.add_argument('--batch_size', type=int, default=32)
#     p.add_argument('--n_epochs', type=int, default=5)

#     p.add_argument('--lr', type=float, default=5e-5) # warmup이 끝났을때 lr이다.
#     p.add_argument('--warmup_ratio', type=float, default=.2) # 트랜스포머가 학습이 까다로움/그냥 adam쓰면 성능이 잘 안나옴 
#     p.add_argument('--adam_epsilon', type=float, default=1e-8)
#     # If you want to use RAdam, I recommend to use LR=1e-4.
#     # Also, you can set warmup_ratio=0.
#     p.add_argument('--use_radam', action='store_true') # warmup안하고 하는 방법 연구 이것을 쓸대의 인자는 바로 위에 2개임임
#     p.add_argument('--valid_ratio', type=float, default=.2)

#     p.add_argument('--max_length', type=int, default=100)

#     config = p.parse_args()

#     return config


# def get_loaders(fn, tokenizer, valid_ratio=.2):
#     # Get list of labels and list of texts.
#     labels, texts = read_text(fn)

#     # Generate label to index map.
#     unique_labels = list(set(labels)) # 유니크한 레이블로 만든다.
#     label_to_index = {}
#     index_to_label = {}
#     for i, label in enumerate(unique_labels): # 유니크레이블을 돌면서 매핑
#         label_to_index[label] = i
#         index_to_label[i] = label

#     # Convert label text to integer value.
#     # 텍스트를 index로 변환해 나온 결과를 적용하면 interger의 리스트가 된다.
#     labels = list(map(label_to_index.get, labels))

#     # Shuffle before split into train and validation set.
#     # shuffle을 해서 train과 vali를 나눈다.
#     shuffled = list(zip(texts, labels)) # zip해논 상태에서 shuffled 해야한다.
#     random.shuffle(shuffled)
#     texts = [e[0] for e in shuffled]
#     labels = [e[1] for e in shuffled]
#     idx = int(len(texts) * (1 - valid_ratio))

#     # Get dataloaders using given tokenizer as collate_fn.
#     # 데이터로더가 나온다. train이니가 shuffle해야한다. val은 안한다.
#     train_loader = DataLoader(
#         TextClassificationDataset(texts[:idx], labels[:idx]),
#         batch_size=config.batch_size,
#         shuffle=True,
#         collate_fn=TextClassificationCollator(tokenizer, config.max_length),
#     )
#     valid_loader = DataLoader(
#         TextClassificationDataset(texts[idx:], labels[idx:]),
#         batch_size=config.batch_size,
#         collate_fn=TextClassificationCollator(tokenizer, config.max_length),
#     )

#     return train_loader, valid_loader, index_to_label


# def get_optimizer(model, config):
#     if config.use_radam:
#         optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr)
#     else:
#         # Prepare optimizer and schedule (linear warmup and decay)
#         no_decay = ['bias', 'LayerNorm.weight']
#         optimizer_grouped_parameters = [
#             {
#                 'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#                 'weight_decay': 0.01
#             },
#             {
#                 'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
#                 'weight_decay': 0.0
#             }
#         ]

#         optimizer = optim.AdamW( # 웬만해서는 default값 사용
#             optimizer_grouped_parameters,
#             lr=config.lr,
#             eps=config.adam_epsilon
#         )

#     return optimizer


# def main(config):
#     # Get pretrained tokenizer.
#     tokenizer = BertTokenizerFast.from_pretrained(config.pretrained_model_name)
#     # Get dataloaders using tokenizer from untokenized corpus.
#     train_loader, valid_loader, index_to_label = get_loaders( # idnex_to_label은 추론할때 필요한 정보보
#         config.train_fn,
#         tokenizer,
#         valid_ratio=config.valid_ratio
#     )
#     # 몇 개인지 확인인
#     print(
#         '|train| =', len(train_loader) * config.batch_size,
#         '|valid| =', len(valid_loader) * config.batch_size,
#     )

#     # warmup
#     # adam은 고정 lr이다. 이렇게 하면 transformer가 학습이 잘안됨
#     # 그래서 warmup을 한다. adam이 처음부터 잘동작한다.
#     # 그러나 처음 들어오는 샘플들이 noise할 수 있다. 그걸로 모멘텀을 잘 못 배워서 날라가버리는 현상발생
#     # 초반에 네트워크가 안정되기 전까지 많이 배우지 말고 warmup을 해라는 것이다.    
#     n_total_iterations = len(train_loader) * config.n_epochs # 미니배치수 X epoch수로 iteration을 지정
#     n_warmup_steps = int(n_total_iterations * config.warmup_ratio) # 400의 20%면 80까지는 warmup한다.
#     print(
#         '#total_iters =', n_total_iterations,
#         '#warmup_iters =', n_warmup_steps,
#     )

#     # 모델 선언
#     # Get pretrained model with specified softmax layer.
#     model_loader = AlbertForSequenceClassification if config.use_albert else BertForSequenceClassification
#     model = model_loader.from_pretrained(
#         config.pretrained_model_name, # 사전 학습된 weight가 로딩이 됨,
#         num_labels=len(index_to_label) # 다만 맨위에 있는 linear layer는 random 초기화되어 있다.
#     )
#     optimizer = get_optimizer(model, config)

#     # By default, model returns a hidden representation before softmax func.
#     # Thus, we need to use CrossEntropyLoss, which combines LogSoftmax and NLLLoss.
#     # 소프트맥스 직전의 hidden_referengentation...값을 loss에 집어넣으면 된다.
#     # 그것을 logits이라고 한다. 
#     # 그리고 linear 스케줄 warmup
#     crit = nn.CrossEntropyLoss()    
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer,
#         n_warmup_steps,
#         n_total_iterations
#     )

#     # gpu로 옮김김
#     if config.gpu_id >= 0:
#         model.cuda(config.gpu_id)
#         crit.cuda(config.gpu_id)

#     # Start train.
#     trainer = Trainer(config)
#     model = trainer.train(
#         model,
#         crit,
#         optimizer,
#         scheduler,
#         train_loader,
#         valid_loader,
#     )

#     torch.save({
#         'rnn': None,
#         'cnn': None,
#         'bert': model.state_dict(),
#         'config': config, # 나중에 불러올 때 어떤 hp인지 알아야함함
#         'vocab': None,
#         'classes': index_to_label,
#         'tokenizer': tokenizer,
#     }, config.model_fn)

# # 실행을 하면 여기로 간다.
# # hyper parameter를 여기에 입력받게 된다.
# if __name__ == '__main__':
#     config = define_argparser()
#     main(config)

## classify_plm.py

In [9]:
# torchtext is imported by the classify_plm.py listing below (`from torchtext import data`).
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
# import sys
# import argparse

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torchtext import data

# from transformers import BertTokenizerFast
# from transformers import BertForSequenceClassification, AlbertForSequenceClassification


# def define_argparser():
#     '''
#     Define argument parser to take inference using pre-trained model.
#     '''
#     p = argparse.ArgumentParser()

#     p.add_argument('--model_fn', required=True)
#     p.add_argument('--gpu_id', type=int, default=-1)
#     p.add_argument('--batch_size', type=int, default=256)
#     p.add_argument('--top_k', type=int, default=1)

#     config = p.parse_args()

#     return config


# def read_text():
#     '''
#     Read text from standard input for inference.
#     '''
#     lines = []

#     for line in sys.stdin:
#         if line.strip() != '':
#             lines += [line.strip()]

#     return lines


# def main(config):
#     saved_data = torch.load( # 저장된 모델을 불러옴
#         config.model_fn,
#         map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id # 원하는 디바이스에 로딩되도록록
#     )

#     train_config = saved_data['config']
#     bert_best = saved_data['bert']
#     index_to_label = saved_data['classes']

#     lines = read_text()

#     with torch.no_grad():
#         # Declare model and load pre-trained weights.
#         tokenizer = BertTokenizerFast.from_pretrained(train_config.pretrained_model_name)
#         model_loader = AlbertForSequenceClassification if train_config.use_albert else BertForSequenceClassification
#         model = model_loader.from_pretrained(
#             train_config.pretrained_model_name,
#             num_labels=len(index_to_label)
#         )
#         model.load_state_dict(bert_best) # fine-tuning한 파라미터를 로드한다.

#         if config.gpu_id >= 0:
#             model.cuda(config.gpu_id)
#         device = next(model.parameters()).device # 모델의 첫번째 파라미터의 디바이스를 보면 어느 디바이스에 올랐는지 알 수 있음

#         # Don't forget turn-on evaluation mode.
#         model.eval()

#         y_hats = []
#         for idx in range(0, len(lines), config.batch_size): # 전체에 대해 batch_size만큼 점프하면서 인덱스를 맏아온다.
#             mini_batch = tokenizer(
#                 lines[idx:idx + config.batch_size],#lines에서 indx부터 그 다음 batch_size까지 받아옴
#                 padding=True,
#                 truncation=True,
#                 return_tensors="pt",
#             )

#             x = mini_batch['input_ids']
#             x = x.to(device)
#             mask = mini_batch['attention_mask']
#             mask = mask.to(device)

#             # Take feed-forward
#             # model(x, attention_mask=mask) : (n,1,|c|) or (n,|c|)
#             #  F.softmax 확률값 구하기 위함
#             # dim = -1를 해야지 |c|에 대해서 softmax를 구한다.
#             # 같은 크기지만 각 미니배치별 샘플별 클래스가 들어잇는 확률을 구하게 된다.
#             y_hat = F.softmax(model(x, attention_mask=mask).logits, dim=-1) 

#             # y_hats에 쌓는다.
#             y_hats += [y_hat]
#         # Concatenate the mini-batch wise result
#         # (n,|c|) X mini_batch 갯수
#         # 이것을 다 합쳐야 된다.
#         y_hats = torch.cat(y_hats, dim=0)
#         # |y_hats| = (len(lines), n_classes)

#         probs, indice = y_hats.cpu().topk(config.top_k)
#         # |indice| = (len(lines), top_k)

#         # 화면에 출력
#         for i in range(len(lines)):
#             sys.stdout.write('%s\t%s\n' % (
#                 ' '.join([index_to_label[int(indice[i][j])] for j in range(config.top_k)]), 
#                 lines[i]
#             ))


# if __name__ == '__main__':
#     config = define_argparser()
#     main(config)

## 학습 실행

In [11]:
# Switch to the project directory so the relative paths used by the training
# command (script, ./models, ./data) resolve.
%cd /content/drive/MyDrive/인공지능/텍스트분류

/content/drive/MyDrive/인공지능/텍스트분류


In [14]:
# Fine-tune beomi/kcbert-base on the review training TSV: GPU 0, batch size 80,
# 2 epochs. --model_fn is the path the script saves its checkpoint to (per the
# finetune_plm_native.py listing earlier in this notebook).
!python finetune_plm_native.py --model_fn ./models/review.native.kcbert.pth --train_fn ./data/review.sorted.uniq.refined.shuf.train.tsv --gpu_id 0 --batch_size 80 --n_epochs 2 --pretrained_model_name 'beomi/kcbert-base'

2023-03-22 05:49:37.814965: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 05:49:37.985528: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-22 05:49:38.838071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-22 05:49:38.838183: W tensorflow/compiler/xla/stream_executor