In [1]:
# Load IPython's autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: re-import modules before each cell execution.
%autoreload 2


In [13]:

import os
import pandas as pd

from pprint import pprint
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR
from pytorch_lightning import LightningModule, Trainer, seed_everything
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BartForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import emoji
from soynlp.normalizer import repeat_normalize
from tokenization_hanbert import HanBertTokenizer
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer
from nsmcDatamodule import NsmcDataModule


In [8]:
# Experiment configuration. Later cells look values up in this dict by key.
# NOTE(review): `max_length` (150) and `max_len` (200) both describe an input
# length; only `max_len` is read by the visible cells — confirm which is meant.
args = {
    # --- model / tokenizer -------------------------------------------------
    "model_name": "HanBart-54kN",
    "random_seed": 42,  # seed value for random number generators
    "pretrained_model": "beomi/kcbert-base",  # Transformers PLM name
    "pretrained_tokenizer": "",  # optional tokenizer name; overrides `pretrained_model`
    # --- optimisation ------------------------------------------------------
    "batch_size": 32,
    "lr": 5e-6,  # starting learning rate
    "epochs": 20,  # maximum number of epochs
    "max_length": 150,  # maximum input length
    # --- data --------------------------------------------------------------
    "train_data_path": "../nsmc/ratings_train.txt",  # training dataset file
    "val_data_path": "../nsmc/ratings_test.txt",  # validation dataset file
    # --- run options -------------------------------------------------------
    "test_mode": True,  # test mode enables `fast_dev_run`
    "optimizer": "AdamW",  # AdamW vs AdamP
    "lr_scheduler": "exp",  # ExponentialLR vs CosineAnnealingWarmRestarts
    "fp16": False,  # train in FP16 when enabled
    "tpu_cores": 0,  # TPU with 1 core or 8 cores; 0 leaves it off
    "cpu_workers": 4,
    "max_len": 200,
    "data_len": 150_000,
}

In [9]:
# Relative paths to the train / test ratings files.
# NOTE(review): these differ from args['train_data_path'] and
# args['val_data_path'] ("../nsmc/...") and no visible cell reads them —
# confirm which location is intended.
train_path, test_path = (
    "nsmc/ratings_train.txt",
    "nsmc/ratings_test.txt",
)

In [14]:
# Apply the configured seed before any weights are created or any batch is
# drawn, so repeated runs of the notebook see the same random state.
# (Presumably the classification head is freshly initialised here, since the
# checkpoint's LM-head weights are reported as unused — confirm.)
seed_everything(args['random_seed'])

# Take the tokenizer name from the config instead of repeating the literal,
# so the config stays the single source of truth.
tokenizer = HanBertTokenizer.from_pretrained(args['model_name'])

# Checkpoint directory holding the pretrained weights loaded into a
# two-label sequence classifier.
CHECKPOINT_DIR = '../model_checkpoint/HanBart_202110220849/saved_checkpoint_350'
model = BartForSequenceClassification.from_pretrained(CHECKPOINT_DIR, num_labels=2)



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'HanBertTokenizer'.
Some weights of the model checkpoint at ../model_checkpoint/HanBart_202110220849/saved_checkpoint_350 were not used when initializing BartForSequenceClassification: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights

In [24]:
# Build the data module from the configured file paths, the tokenizer loaded
# above in the notebook, and the batching options, then run its setup step.
datamodule = NsmcDataModule(
    args['train_data_path'],
    args['val_data_path'],
    tokenizer,
    batch_size=args['batch_size'],
    max_len=args['max_len'],
)
datamodule.setup()

In [25]:
# Loader returned by the data module's train_dataloader().
dataloader = datamodule.train_dataloader()

In [26]:
# Take only the first batch from a fresh iterator over the loader; the
# temporary iterator is not kept around afterwards.
batch = next(iter(dataloader))

In [28]:
# Pull the three entries the forward pass needs out of the batch dict.
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
labels = batch['labels']

# This cell only inspects the loss and logits, so run the forward pass
# without recording an autograd graph (saves memory; nothing is back-propagated).
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

In [29]:
# Transformers 4.0.0+: results are read from the output object by attribute.
loss = output.loss
logits = output.logits

# Predicted class id = position of the largest logit along the last dimension.
preds = logits.argmax(dim=-1)

# Tensor.tolist() returns plain Python numbers (no NumPy scalar wrappers), so
# the lists print cleanly and can be serialised or passed to metric functions.
y_true = labels.tolist()
y_pred = preds.tolist()

In [31]:
# Display the predicted class ids for the sampled batch.
# NOTE(review): the saved output below is all zeros for the 32 examples —
# presumably the classification head has not been trained yet; confirm.
y_pred

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]