In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to local .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:

import os
import pandas as pd

from pprint import pprint
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR
from pytorch_lightning import LightningModule, Trainer, seed_everything
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BartForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import emoji
from soynlp.normalizer import repeat_normalize
from tokenization_hanbert import HanBertTokenizer
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer


In [None]:
# Experiment configuration, assembled one group at a time. Key insertion order
# is the same as a single dict literal listing the keys top to bottom.
# NOTE(review): `args` is not referenced by the other cells visible here.
args = {}

# Reproducibility.
args['random_seed'] = 42  # random seed

# Pretrained weights / tokenizer.
args.update({
    'pretrained_model': 'beomi/kcbert-base',  # Transformers PLM name
    'pretrained_tokenizer': '',  # optional Transformers tokenizer name; overrides `pretrained_model`
})

# Training hyper-parameters.
args.update({
    'batch_size': 32,
    'lr': 5e-6,  # starting learning rate
    'epochs': 20,  # max epochs
    'max_length': 150,  # max input length
})

# Dataset files.
args.update({
    'train_data_path': "../nsmc/ratings_train.txt",  # train dataset file
    'val_data_path': "../nsmc/ratings_test.txt",  # validation dataset file
})

# Run mode, optimizer and scheduler selection.
args.update({
    'test_mode': False,  # test mode enables `fast_dev_run`
    'optimizer': 'AdamW',  # AdamW vs AdamP
    'lr_scheduler': 'exp',  # ExponentialLR vs CosineAnnealingWarmRestarts
})

# Hardware / precision.
args.update({
    'fp16': False,  # enable training on FP16
    'tpu_cores': 0,  # enable TPU with 1 core or 8 cores
    'cpu_workers': 4,
})

In [None]:
# Train and test rating files, given as relative paths.
# NOTE(review): `args` lists the same file names under "../nsmc/" — confirm which prefix is correct.
train_path, test_path = (
    "nsmc/ratings_train.txt",
    "nsmc/ratings_test.txt",
)

In [None]:

# NOTE(review): HanBertTokenizer is already imported in the notebook's import
# cell; this repeated import is redundant but harmless.
from tokenization_hanbert import HanBertTokenizer
# Load the tokenizer identified by 'HanBart-54kN'.
tokenizer = HanBertTokenizer.from_pretrained('HanBart-54kN')
# Load a BART sequence-classification model from a saved checkpoint path.
# NOTE(review): args['pretrained_model'] names a different model
# ('beomi/kcbert-base') — confirm which one is intended.
model = BartForSequenceClassification.from_pretrained('../model_checkpoint/HanBart_202110220849/saved_checkpoint_350')

In [None]:
from nsmcDatamodule import NsmcDataModule, nsmcDataset

In [None]:
# Build the data module from the train/test file paths and the tokenizer,
# run its setup step, then obtain the training dataloader.
# NOTE(review): how the second path (the test file) is used is defined in
# nsmcDatamodule — presumably as the validation split; confirm there.
datamodule = NsmcDataModule(train_path, test_path, tokenizer)
datamodule.setup()
dataloader = datamodule.train_dataloader()

In [None]:
# Take the first batch from the training dataloader for inspection.
batch = next(iter(dataloader))

In [None]:
# Bare expression: the notebook shows the model's repr as this cell's output.
model

In [17]:
# Build the dataset directly from the training file and the tokenizer.
# NOTE(review): 512 is presumably a maximum sequence length (args['max_length']
# is 150) — confirm against nsmcDataset's signature.
dataset = nsmcDataset(train_path, tokenizer, 512)

In [18]:
# Inspect the first example; the recorded output is a dict that includes an
# 'input_ids' tensor.
dataset[0]

{'input_ids': tensor([    3,   466, 19472, 53147, 53147,  1671, 24226, 37614,  1144,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   