In [None]:
!pip install -U datasets
!pip install -U torch
!pip install -U transformers
!pip install -U ipywidgets

In [2]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, ElectraForSequenceClassification, AutoTokenizer
from datasets import load_dataset, load_metric
import pandas as pd
import numpy as np

In [None]:
# tokenizer from koelectra-base-v3-discriminator pre-trained model 
BERT_MODEL = 'monologg/koelectra-base-v3-discriminator'

bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer)

In [None]:
# load dataset from CSV
review_dataset_dict = load_dataset('csv', data_files='nsmc_merged.csv', sep=',', names=['document','label'])

# dict 에서 train 데이터를 기준으로 dataset 객체 반환
review_dataset = review_dataset_dict['train']

In [15]:
def preprocess(data):
    return bert_tokenizer(data['document'], padding='max_length', truncation=True, max_length=128,pad_to_max_length=True,add_special_tokens=True)

In [None]:
review_dataset = review_dataset.map(preprocess, batched=True, batch_size=len(review_dataset))

In [17]:
# Dataset 클래스 내장 train_test_split 함수를 사용해서 test 데이터 세트를 준비합니다.
review_dataset = review_dataset.train_test_split(test_size=0.2)

In [None]:
sequence_classification_model = ElectraForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2
)

sequence_classification_model.config.id2label = {0: 'Negative', 1: 'Positive'}

In [19]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:
batch_size = 16
epochs = 3

In [None]:
training_args = TrainingArguments(
    output_dir='./clf/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_dir='./clf/logs',
    logging_strategy='epoch',
    save_strategy='epoch',
    evaluation_strategy='epoch',
    load_best_model_at_end=True
)

# Define the trainer: 

trainer = Trainer(
    model=sequence_classification_model,
    args=training_args,
    train_dataset=review_dataset['train'],
    eval_dataset=review_dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [22]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: document. If document are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 39068
  Batch size = 16


{'eval_loss': 0.692876398563385,
 'eval_accuracy': 0.5126702160335824,
 'eval_runtime': 291.4413,
 'eval_samples_per_second': 134.051,
 'eval_steps_per_second': 8.379}

In [23]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: document. If document are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 156271
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 29301


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2965,0.262501,0.902119
2,0.2054,0.277246,0.907034
3,0.1363,0.335169,0.911385


The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: document. If document are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 39068
  Batch size = 16
Saving model checkpoint to ./clf/results/checkpoint-9767
Configuration saved in ./clf/results/checkpoint-9767/config.json
Model weights saved in ./clf/results/checkpoint-9767/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: document. If document are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 39068
  Batch size = 16
Saving model checkpoint to ./clf/results/checkpoint-19534
Configuration saved in ./clf/results/checkpoint-19534/config.

TrainOutput(global_step=29301, training_loss=0.2127430207826738, metrics={'train_runtime': 11327.7542, 'train_samples_per_second': 41.386, 'train_steps_per_second': 2.587, 'total_flos': 3.083747079912192e+16, 'train_loss': 0.2127430207826738, 'epoch': 3.0})

In [24]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: document. If document are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 39068
  Batch size = 16


{'eval_loss': 0.2625013589859009,
 'eval_accuracy': 0.902119381591072,
 'eval_runtime': 294.7628,
 'eval_samples_per_second': 132.54,
 'eval_steps_per_second': 8.285,
 'epoch': 3.0}

In [14]:
from transformers import TextClassificationPipeline

In [None]:
model = trainer.model
tokenizer = bert_tokenizer
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=0)

In [16]:
pipe('이런 영화 정말 오랜만 입니다. 추천해요!')

[{'label': 'Negative', 'score': 0.5037461519241333},
 {'label': 'Positive', 'score': 0.4962537884712219}]

In [17]:
pipe('처음에는 좀 망설여지기는 했는데, 그래도 오기를 잘 했다는 생각이 드네여.')

[{'label': 'Negative', 'score': 0.5037662386894226},
 {'label': 'Positive', 'score': 0.4962337017059326}]

In [18]:
pipe('환율이 이렇게 오르면 우리의 수입 물가도 뛸 거라는 거, 또 미국이 금리를 계속 올리고 있는 만큼 환율이 앞으로 더 오를 거라는 겁니다.') 

[{'label': 'Negative', 'score': 0.5037504434585571},
 {'label': 'Positive', 'score': 0.4962495267391205}]

In [19]:
pipe('하지만 환자가 점차 나이가 들어가면, 이를 돌보는 가족들은 점점 무기력해진다.')

[{'label': 'Negative', 'score': 0.5037599205970764},
 {'label': 'Positive', 'score': 0.4962400794029236}]