In [None]:
!pip install datasets



In [None]:
!pip install accelerate -U

Collecting accelerate
  Using cached accelerate-0.28.0-py3-none-any.whl (290 kB)


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
import random

In [None]:
texts = ["This product has been an amazing addition to my kitchen."]*50 + \
        ["I'm very disappointed with the purchase. Would not recommend."]*50
labels = [1]*50 + [0]*50  # 1: 긍정, 0: 부정

# 데이터 섞기
data = list(zip(texts, labels))
random.shuffle(data)
texts, labels = zip(*data)

# 데이터프레임 생성
df = pd.DataFrame({'text': texts, 'label': labels})

# 데이터셋 분할
train_df, test_df = train_test_split(df, test_size=0.2)

In [None]:
# Hugging Face의 datasets 라이브러리 형식으로 변환
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
train_df

Unnamed: 0,text,label
55,This product has been an amazing addition to m...,1
88,This product has been an amazing addition to m...,1
26,I'm very disappointed with the purchase. Would...,0
42,This product has been an amazing addition to m...,1
69,This product has been an amazing addition to m...,1
...,...,...
60,This product has been an amazing addition to m...,1
71,I'm very disappointed with the purchase. Would...,0
14,I'm very disappointed with the purchase. Would...,0
92,This product has been an amazing addition to m...,1


In [None]:
# 토크나이저와 모델 로딩
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# 데이터셋에 토큰화 적용
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# 훈련 설정
training_args = TrainingArguments(
    output_dir='./results',          # 모델과 체크포인트 저장 디렉토리
    num_train_epochs=3,              # 총 훈련 에포크
    per_device_train_batch_size=8,   # 훈련 배치 크기
    per_device_eval_batch_size=8,    # 평가 배치 크기
    warmup_steps=500,                # 러닝레이트 웜업에 사용할 스텝 수
    weight_decay=0.01,               # 가중치 감소율
    logging_dir='./logs',            # 로그 저장 디렉토리
    evaluation_strategy="epoch",     # 에폭마다 평가
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# 모델 훈련
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641869
2,No log,0.483439
3,No log,0.41381


TrainOutput(global_step=30, training_loss=0.6149038950602214, metrics={'train_runtime': 23.822, 'train_samples_per_second': 10.075, 'train_steps_per_second': 1.259, 'total_flos': 63146653286400.0, 'train_loss': 0.6149038950602214, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.4138096272945404,
 'eval_runtime': 0.6576,
 'eval_samples_per_second': 30.412,
 'eval_steps_per_second': 4.562,
 'epoch': 3.0}