In [12]:
import argparse

import pandas as pd

from tqdm.auto import tqdm

import transformers
import torch
import torchmetrics
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, Callback

from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization import (plot_intermediate_values, 
                                  plot_optimization_history,
                                  plot_param_importances)

import optuna

In [2]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets=[]):
        self.inputs = inputs
        self.targets = targets

    # 학습 및 추론 과정에서 데이터를 1개씩 꺼내오는 곳
    def __getitem__(self, idx):
        # 정답이 있다면 else문을, 없다면 if문을 수행합니다
        if len(self.targets) == 0:
            return torch.tensor(self.inputs[idx])
        else:
            return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

    # 입력하는 개수만큼 데이터를 사용합니다
    def __len__(self):
        return len(self.inputs)

In [3]:
class Dataloader(pl.LightningDataModule):
    def __init__(self, model_name, batch_size, shuffle, weight_decay, train_path, dev_path, test_path, predict_path):
        super().__init__()
        self.model_name = model_name
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.weight_decay = weight_decay

        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.predict_path = predict_path

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        self.predict_dataset = None

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, max_length=160)
        self.target_columns = ['label']
        self.delete_columns = ['id']
        self.text_columns = ['sentence_1', 'sentence_2']

    def tokenizing(self, dataframe):
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='tokenizing', total=len(dataframe)):
            # 두 입력 문장을 [SEP] 토큰으로 이어붙여서 전처리합니다.
            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
            outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True)
            data.append(outputs['input_ids'])
        return data

    def preprocessing(self, data):
        # 안쓰는 컬럼을 삭제합니다.
        data = data.drop(columns=self.delete_columns)

        # 타겟 데이터가 없으면 빈 배열을 리턴합니다.
        try:
            targets = data[self.target_columns].values.tolist()
        except:
            targets = []
        # 텍스트 데이터를 전처리합니다.
        inputs = self.tokenizing(data)

        return inputs, targets

    def setup(self, stage='fit'):
        if stage == 'fit':
            # 학습 데이터와 검증 데이터셋을 호출합니다
            train_data = pd.read_csv(self.train_path)
            val_data = pd.read_csv(self.dev_path)

            # 학습데이터 준비
            train_inputs, train_targets = self.preprocessing(train_data)

            # 검증데이터 준비
            val_inputs, val_targets = self.preprocessing(val_data)

            # train 데이터만 shuffle을 적용해줍니다, 필요하다면 val, test 데이터에도 shuffle을 적용할 수 있습니다
            self.train_dataset = Dataset(train_inputs, train_targets)
            self.val_dataset = Dataset(val_inputs, val_targets)
        else:
            # 평가데이터 준비
            test_data = pd.read_csv(self.test_path)
            test_inputs, test_targets = self.preprocessing(test_data)
            self.test_dataset = Dataset(test_inputs, test_targets)

            predict_data = pd.read_csv(self.predict_path)
            predict_inputs, predict_targets = self.preprocessing(predict_data)
            self.predict_dataset = Dataset(predict_inputs, [])

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=self.shuffle)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset, batch_size=self.batch_size)

In [4]:
class Model(pl.LightningModule):
    def __init__(self, model_name, lr):
        super().__init__()
        self.save_hyperparameters()

        self.model_name = model_name
        self.lr = lr

        # 사용할 모델을 호출합니다.
        self.plm = transformers.AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name, num_labels=1)
        # Loss 계산을 위해 사용될 L1Loss를 호출합니다.
        self.loss_func = torch.nn.L1Loss()

    def forward(self, x):
        x = self.plm(x)['logits']

        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_func(logits, y.float())
        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_func(logits, y.float())
        self.log("val_loss", loss)

        self.log("val_pearson", torchmetrics.functional.pearson_corrcoef(logits.squeeze(), y.squeeze()))

        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)

        self.log("test_pearson", torchmetrics.functional.pearson_corrcoef(logits.squeeze(), y.squeeze()))

    def predict_step(self, batch, batch_idx):
        x = batch
        logits = self(x)

        return logits.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer

In [39]:
def objective(trial):
    # 하이퍼 파라미터 값 탐색 대상 설정
    batch_size = trial.suggest_categorical("batch_size", [16, 32])
    max_epoch = trial.suggest_int("max_epoch", 1, 1)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-5)
    weight_decay = trial.suggest_float("weight_decay", 1e-2, 1e-2)
    # gradient_clip_val = trial.suggest_float("gradient_clip_val", 0, .5)

    # 하이퍼 파라미터 설정
    folder_path = '~/level1_semantictextsimilarity-nlp-14/SH'
    args = argparse.Namespace(
        model_name='klue/roberta-small',
        batch_size=batch_size,
        max_epoch=max_epoch,
        shuffle=True,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        train_path=folder_path+'/data/train.csv',
        dev_path=folder_path+'/data/dev.csv',
        test_path=folder_path+'/data/dev.csv',
        predict_path=folder_path+'/data/test.csv'
    )

    # dataloader와 model을 생성합니다.
    dataloader = Dataloader(args.model_name, 
                            args.batch_size, 
                            args.shuffle,
                            args.weight_decay,
                            args.train_path, 
                            args.dev_path, 
                            args.test_path, 
                            args.predict_path)
    model = Model(args.model_name, args.learning_rate)

    # gpu가 없으면 accelerator='cpu', 있으면 accelerator='gpu'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    accelerator = 'gpu' if device == torch.device('cuda') else 'cpu'
    checkpoints_filename = model_name = f"./checkpoints/{args.model_name}_bs{args.batch_size}_me{args.max_epoch}_lr{args.learning_rate}_wd{args.weight_decay}"
    checkpoints_path = folder_path + '/checkpoitns'
    checkpoint_save = ModelCheckpoint(dirpath=checkpoints_path, filename=checkpoints_filename)
    trainer = pl.Trainer(accelerator=accelerator, 
                         max_epochs=args.max_epoch, 
                         log_every_n_steps=1, 
                         # gradient_clip_val=gradient_clip_val,
                         callbacks=[PyTorchLightningPruningCallback(trial, monitor=monitor), checkpoint_save])

    # Train part
    trainer.fit(model=model, datamodule=dataloader)
    trainer.test(model=model, datamodule=dataloader)
    
    metric = trainer.callback_metrics['val_loss'].item()
    
    return metric

In [40]:
if __name__ == '__main__':
    pl.seed_everything(970514, workers=True)
    monitor = 'val_loss'
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=3)

    # 하이퍼파라미터 최적화 결과 출력
    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    trial = study.best_trial
    print('  Value: {}'.format(trial.value))
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))
    plot_optimization_history(study).show()
    plot_intermediate_values(study).show()
    try:
        plot_param_importances(study).show()
    except ValueError:
        pass

    # 최적의 하이퍼파라미터로 모델 생성
    folder_path = '~/level1_semantictextsimilarity-nlp-14/SH'
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='klue/roberta-small', type=str)
    parser.add_argument('--shuffle', default=True)
    parser.add_argument('--train_path', default=folder_path+'/data/train.csv')
    parser.add_argument('--dev_path', default=folder_path+'/data/dev.csv')
    parser.add_argument('--test_path', default=folder_path+'/data/dev.csv')
    parser.add_argument('--predict_path', default=folder_path+'/data/test.csv')
    args_parser = parser.parse_args()
    
    args = argparse.Namespace(
        model_name=args_parser.model_name,
        batch_size=trial.params['batch_size'],
        max_epoch=trial.params['max_epoch'],
        shuffle=args_parser.shuffle,
        learning_rate=trial.params['learning_rate'],
        weight_decay=trial.params['weight_decay'],
        train_path=args_parser.train_path,
        dev_path=args_parser.dev_path,
        test_path=args_parser.test_path,
        predict_path=args_parser.predict_path,
    )
    dataloader = Dataloader(args.model_name, args.batch_size, args.shuffle, args.weight_decay, args.train_path, args.dev_path,
                            args.test_path, args.predict_path)
    model = Model(args.model_name, args.learning_rate)
    
    trainer.fit(model=model, datamodule=dataloader)
    
    model_name = f"./models/{args.model_name}_bs{args.batch_size}_me{args.max_epoch}_lr{args.learning_rate}_wd{args.weight_decay}.pt"
    torch.save(model, model_name)

Global seed set to 970514
[32m[I 2023-04-11 16:05:54,085][0m A new study created in memory with name: no-name-ddfb373b-f63d-4411-bf57-415167fe89bd[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-5)
Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClass

                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 0: 100%|██████████| 292/292 [02:17<00:00,  2.13it/s, v_num=24]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/18 [00:00<?, ?it/s][A
Validation DataLoader 0:   6%|▌         | 1/18 [00:00<00:00, 82.96it/s][A
Validation DataLoader 0:  11%|█         | 2/18 [00:00<00:00, 79.59it/s][A
Validation DataLoader 0:  17%|█▋        | 3/18 [00:00<00:00, 78.76it/s][A
Validation DataLoader 0:  22%|██▏       | 4/18 [00:00<00:00, 24.61it/s][A
Validation DataLoader 0:  28%|██▊       | 5/18 [00:00<00:00, 16.03it/s][A
Validation DataLoader 0:  33%|███▎      | 6/18 [00:00<00:00, 13.00it/s][A
Validation DataLoader 0:  39%|███▉      | 7/18 [00:00<00:00, 11.46it/s][A
Validation DataLoader 0:  44%|████▍     | 8/18 [00:00<00:00, 10.52it/s][A
Validation DataLoader 0:  50%|█████     | 9/18 [00:00<00:00,  9.88it/s][A
Validation DataLoader 0:  56%|█████▌    | 10/18 [00:01<00:00,  9.42it/s][A
Validation DataLoader 0: 

[33m[W 2023-04-11 16:08:18,891][0m Trial 0 failed with parameters: {'batch_size': 32, 'max_epoch': 1, 'learning_rate': 1e-05, 'weight_decay': 0.01} because of the following error: AttributeError("'Trainer' object has no attribute 'training_type_plugin'").[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_2622/2666820931.py", line 48, in objective
    trainer.fit(model=model, datamodule=dataloader)
  File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 520, in fit
    call._call_and_handle_interrupt(
  File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 559, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)


AttributeError: 'Trainer' object has no attribute 'training_type_plugin'