In [1]:
import argparse

import pandas as pd

from tqdm.auto import tqdm

import transformers
import torch
import torchmetrics
import pytorch_lightning as pl

from sklearn.model_selection import KFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets=[]):
        self.inputs = inputs
        self.targets = targets

    # 학습 및 추론 과정에서 데이터를 1개씩 꺼내오는 곳
    def __getitem__(self, idx):
        # 정답이 있다면 else문을, 없다면 if문을 수행합니다
        if len(self.targets) == 0:
            return torch.tensor(self.inputs[idx])
        else:
            return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

    # 입력하는 개수만큼 데이터를 사용합니다
    def __len__(self):
        return len(self.inputs)

In [3]:
class KfoldDataloader(pl.LightningDataModule):
    def __init__(self, model_name, batch_size, shuffle, k, split_seed, num_splits, train_path, dev_path, test_path, predict_path):
        super().__init__()
        self.model_name = model_name
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.k = k
        self.split_seed = split_seed
        self.num_splits = num_splits

        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.predict_path = predict_path
        
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        self.predict_dataset = None

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, max_length=160)
        self.target_columns = ['label']
        self.delete_columns = ['id']
        self.text_columns = ['sentence_1', 'sentence_2']

    def tokenizing(self, dataframe):
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='tokenizing', total=len(dataframe)):
            # 두 입력 문장을 [SEP] 토큰으로 이어붙여서 전처리합니다.
            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
            outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True)
            data.append(outputs['input_ids'])

        return data

    def preprocessing(self, data):
        # 안쓰는 컬럼을 삭제합니다.
        data = data.drop(columns=self.delete_columns)

        # 타겟 데이터가 없으면 빈 배열을 리턴합니다.
        try:
            targets = data[self.target_columns].values.tolist()
        except:
            targets = []
        # 텍스트 데이터를 전처리합니다.
        inputs = self.tokenizing(data)

        return inputs, targets

    def setup(self, stage='fit'):
        if stage == 'fit':
            # 데이터 준비
            total_data = pd.read_csv(self.train_path)
            total_inputs, total_targets = self.preprocessing(total_data)
            total_dataset = Dataset(total_inputs, total_targets)

            # 데이터셋 num_splits 번 fold
            kf = KFold(n_splits=self.num_splits, shuffle=self.shuffle, random_state=self.split_seed)
            all_splits = [k for k in kf.split(total_data)]

            # k번째 fold 된 데이터셋의 index 선택
            train_indexes, val_indexes = all_splits[self.k]
            train_indexes, val_indexes = train_indexes.tolist(), val_indexes.tolist()

            # fold한 index에 따라 데이터셋 분할
            self.train_dataset = [total_dataset[x] for x in train_indexes]
            self.val_dataset = [total_dataset[x] for x in val_indexes]

        else:
            # 평가데이터 준비
            test_data = pd.read_csv(self.test_path)
            test_inputs, test_targets = self.preprocessing(test_data)
            self.test_dataset = Dataset(test_inputs, test_targets)
            
            predict_data = pd.read_csv(self.predict_path)
            predict_inputs, predict_targets = self.preprocessing(predict_data)
            self.predict_dataset = Dataset(predict_inputs, [])

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=self.shuffle)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset, batch_size=self.batch_size)

In [4]:
class Model(pl.LightningModule):
    def __init__(self, model_name, lr):
        super().__init__()
        self.save_hyperparameters()

        self.model_name = model_name
        self.lr = lr

        # 사용할 모델을 호출합니다.
        self.plm = transformers.AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name, num_labels=1)
        # Loss 계산을 위해 사용될 L1Loss를 호출합니다.
        self.loss_func = torch.nn.HuberLoss()

    def forward(self, x):
        x = self.plm(x)['logits']

        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_func(logits, y.float())
        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_func(logits, y.float())
        self.log("val_loss", loss)

        self.log("val_pearson", torchmetrics.functional.pearson_corrcoef(logits.squeeze(), y.squeeze()))

        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)

        self.log("test_pearson", torchmetrics.functional.pearson_corrcoef(logits.squeeze(), y.squeeze()))

    def predict_step(self, batch, batch_idx):
        x = batch
        logits = self(x)

        return logits.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer

In [5]:
if __name__ == '__main__':
    folder_path = '/opt/ml/level1_semantictextsimilarity-nlp-14/SH'
    # 하이퍼 파라미터 등 각종 설정값을 입력받습니다
    # 터미널 실행 예시 : python3 run.py --batch_size=64 ...
    # 실행 시 '--batch_size=64' 같은 인자를 입력하지 않으면 default 값이 기본으로 실행됩니다
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='klue/roberta-small', type=str)
    parser.add_argument('--batch_size', default=16, type=int)
    parser.add_argument('--max_epoch', default=1, type=int)
    parser.add_argument('--shuffle', default=True)
    parser.add_argument('--k', default=1, type=int)
    parser.add_argument('--split_seed', default=970514, type=int)
    parser.add_argument('--num_splits', default=5, type=int)
    parser.add_argument('--learning_rate', default=1e-5, type=float)
    parser.add_argument('--weight_decay', default=0.0, type=float)
    parser.add_argument('--num_split', default=5, type=int)
    parser.add_argument('--train_path', default=folder_path+'/data/train.csv')
    parser.add_argument('--dev_path', default=folder_path+'/data/dev.csv')
    parser.add_argument('--test_path', default=folder_path+'/data/dev.csv')
    parser.add_argument('--predict_path', default=folder_path+'/data/test.csv')
    args = parser.parse_args(args=[])

    model= Model(args.model_name, args.learning_rate)
    
    # gpu가 없으면 accelerator='cpu', 있으면 accelerator='gpu'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    accelerator = 'gpu' if device == torch.device('cuda') else 'cpu'

    results = []
    # nums_folds는 fold의 개수, k는 k번째 fold datamodule
    for k in range(args.num_split):
        dataloader = KfoldDataloader(args.model_name, 
                                 args.batch_size, 
                                 args.shuffle, 
                                 k,
                                 args.split_seed,
                                 args.num_splits,
                                 args.train_path, 
                                 args.dev_path, 
                                 args.test_path, 
                                 args.predict_path)
        dataloader.prepare_data()
        dataloader.setup()

        trainer = pl.Trainer(accelerator=accelerator, max_epochs=args.max_epoch, log_every_n_steps=1)
        trainer.fit(model=model, datamodule=dataloader)
        score = trainer.test(model=model, datamodule=dataloader)

        results.extend(score)

    # 학습이 완료된 모델을 저장합니다.
    model_name = f"./base_{args.num_splits}fold.pt"
    torch.save(model, model_name)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'class

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 467/467 [01:54<00:00,  4.08it/s, v_num=23]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/117 [00:00<00:02, 54.70it/s][A
Validation DataLoader 0:   2%|▏         | 2/117 [00:00<00:01, 69.02it/s][A
Validation DataLoader 0:   3%|▎         | 3/117 [00:00<00:01, 75.98it/s][A
Validation DataLoader 0:   3%|▎         | 4/117 [00:00<00:01, 80.11it/s][A
Validation DataLoader 0:   4%|▍         | 5/117 [00:00<00:01, 82.98it/s][A
Validation DataLoader 0:   5%|▌         | 6/117 [00:00<00:01, 66.37it/s][A
Validation DataLoader 0:   6%|▌         | 7/117 [00:00<00:02, 41.80it/s][A
Validation DataLoader 0:   7%|▋         | 8/117 [00:00<00:03, 32.70it/s][A
Validation DataLoader 0:   8%|▊         | 9/117 [00:00<00:03, 27.99it/s][A
Validation DataLoader 0:   9%|▊         | 10/117 [00:00<00:04, 25.09it/s][A
Validation Da

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 467/467 [02:05<00:00,  3.73it/s, v_num=23]


tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2825.57it/s]
tokenizing: 100%|██████████| 1100/1100 [00:00<00:00, 2821.88it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 35/35 [00:02<00:00, 13.14it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.5472367405891418
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2693.57it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2533.24it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | plm       | RobertaForSequenceClassification | 68.1 M
1 | loss_func | HuberLoss                        | 0     
---------------------------------------------------------------
68.1 M    Trainable params
0         Non-trainable params
68.1 M    Total params
272.367   Total estimated model params size (MB)


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 0: 100%|██████████| 467/467 [01:54<00:00,  4.08it/s, v_num=24]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/117 [00:00<00:01, 80.62it/s][A
Validation DataLoader 0:   2%|▏         | 2/117 [00:00<00:01, 88.08it/s][A
Validation DataLoader 0:   3%|▎         | 3/117 [00:00<00:01, 91.59it/s][A
Validation DataLoader 0:   3%|▎         | 4/117 [00:00<00:01, 93.58it/s][A
Validation DataLoader 0:   4%|▍         | 5/117 [00:00<00:01, 94.33it/s][A
Validation DataLoader 0:   5%|▌         | 6/117 [00:00<00:01, 71.60it/s][A
Validation DataLoader 0:   6%|▌         | 7/117 [00:00<00:02, 43.47it/s][A
Validation DataLoader 0:   7%|▋         | 8/117 [00:00<00:03, 33.60it/s][A
Validation DataLoader 0:   8%|▊         | 9/117 [00:00<00:03, 28.58it/s][A
Validation DataLoader 0:   9%|▊         | 10/117 [00:00<00:04, 25.49it/s][A
Validation Da

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 467/467 [02:05<00:00,  3.73it/s, v_num=24]


tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2640.59it/s]
tokenizing: 100%|██████████| 1100/1100 [00:00<00:00, 2667.01it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 35/35 [00:02<00:00, 13.14it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.7771030068397522
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2610.77it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2528.49it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | plm       | RobertaForSequenceClassification | 68.1 M
1 | loss_func | HuberLoss                        | 0     
---------------------------------------------------------------
68.1 M    Trainable params
0         Non-trainable params
68.1 M    Total params
272.367   Total estimated model params size (MB)


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 0: 100%|██████████| 467/467 [01:54<00:00,  4.07it/s, v_num=25]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/117 [00:00<00:01, 76.87it/s][A
Validation DataLoader 0:   2%|▏         | 2/117 [00:00<00:01, 84.76it/s][A
Validation DataLoader 0:   3%|▎         | 3/117 [00:00<00:01, 86.92it/s][A
Validation DataLoader 0:   3%|▎         | 4/117 [00:00<00:01, 88.22it/s][A
Validation DataLoader 0:   4%|▍         | 5/117 [00:00<00:01, 88.81it/s][A
Validation DataLoader 0:   5%|▌         | 6/117 [00:00<00:01, 70.60it/s][A
Validation DataLoader 0:   6%|▌         | 7/117 [00:00<00:02, 43.03it/s][A
Validation DataLoader 0:   7%|▋         | 8/117 [00:00<00:03, 33.35it/s][A
Validation DataLoader 0:   8%|▊         | 9/117 [00:00<00:03, 28.42it/s][A
Validation DataLoader 0:   9%|▊         | 10/117 [00:00<00:04, 25.26it/s][A
Validation Da

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 467/467 [02:05<00:00,  3.73it/s, v_num=25]


tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2480.59it/s]
tokenizing: 100%|██████████| 1100/1100 [00:00<00:00, 2485.07it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 35/35 [00:02<00:00, 13.14it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.8335020542144775
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2804.62it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2498.75it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | plm       | RobertaForSequenceClassification | 68.1 M
1 | loss_func | HuberLoss                        | 0     
---------------------------------------------------------------
68.1 M    Trainable params
0         Non-trainable params
68.1 M    Total params
272.367   Total estimated model params size (MB)


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 0: 100%|██████████| 467/467 [01:54<00:00,  4.07it/s, v_num=26]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/117 [00:00<00:01, 66.68it/s][A
Validation DataLoader 0:   2%|▏         | 2/117 [00:00<00:01, 72.78it/s][A
Validation DataLoader 0:   3%|▎         | 3/117 [00:00<00:01, 75.40it/s][A
Validation DataLoader 0:   3%|▎         | 4/117 [00:00<00:01, 76.62it/s][A
Validation DataLoader 0:   4%|▍         | 5/117 [00:00<00:01, 77.65it/s][A
Validation DataLoader 0:   5%|▌         | 6/117 [00:00<00:01, 70.01it/s][A
Validation DataLoader 0:   6%|▌         | 7/117 [00:00<00:02, 42.99it/s][A
Validation DataLoader 0:   7%|▋         | 8/117 [00:00<00:03, 33.31it/s][A
Validation DataLoader 0:   8%|▊         | 9/117 [00:00<00:03, 28.38it/s][A
Validation DataLoader 0:   9%|▊         | 10/117 [00:00<00:04, 25.41it/s][A
Validation Da

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 467/467 [02:05<00:00,  3.73it/s, v_num=26]


tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2477.05it/s]
tokenizing: 100%|██████████| 1100/1100 [00:00<00:00, 2512.37it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 35/35 [00:02<00:00, 13.14it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.8438644409179688
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2749.97it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
tokenizing: 100%|██████████| 9324/9324 [00:03<00:00, 2522.97it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | plm       | RobertaForSequenceClassification | 68.1 M
1 | loss_func | HuberLoss                        | 0     
---------------------------------------------------------------
68.1 M    Trainable params
0         Non-trainable params
68.1 M    Total params
272.367   Total estimated model params size (MB)


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 0: 100%|██████████| 467/467 [01:54<00:00,  4.07it/s, v_num=27]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/117 [00:00<00:01, 77.49it/s][A
Validation DataLoader 0:   2%|▏         | 2/117 [00:00<00:01, 84.81it/s][A
Validation DataLoader 0:   3%|▎         | 3/117 [00:00<00:01, 88.29it/s][A
Validation DataLoader 0:   3%|▎         | 4/117 [00:00<00:01, 90.07it/s][A
Validation DataLoader 0:   4%|▍         | 5/117 [00:00<00:01, 91.50it/s][A
Validation DataLoader 0:   5%|▌         | 6/117 [00:00<00:01, 70.54it/s][A
Validation DataLoader 0:   6%|▌         | 7/117 [00:00<00:02, 43.14it/s][A
Validation DataLoader 0:   7%|▋         | 8/117 [00:00<00:03, 33.49it/s][A
Validation DataLoader 0:   8%|▊         | 9/117 [00:00<00:03, 28.49it/s][A
Validation DataLoader 0:   9%|▊         | 10/117 [00:00<00:04, 25.45it/s][A
Validation Da

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 467/467 [02:05<00:00,  3.73it/s, v_num=27]


tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2508.09it/s]
tokenizing: 100%|██████████| 1100/1100 [00:00<00:00, 2480.89it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 35/35 [00:02<00:00, 13.13it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.8591814041137695
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
