In [None]:
# HuggingFace의 모델 사용하기

In [1]:
## Run inference directly with a pipeline

from transformers import pipeline

# Build a sentiment-analysis pipeline using the
# `distilbert-base-uncased-finetuned-sst-2-english` checkpoint.
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Sample movie reviews to classify
reviews = [
    "This movie was absolutely fantastic! The acting, the plot, everything was perfect.",
    "I did not enjoy this movie at all. The story was predictable and the acting was terrible.",
    "A solid performance by the lead actor, but the plot could have been better.",
    "What a waste of time! I can't believe I sat through the whole movie.",
]

# Classify every review in a single call; one result dict comes back per review
results = sentiment_analysis(reviews)

# Pair each review with its prediction and report the label and the score
# (the score is rounded to two decimals in the printed text only).
review_predictions = zip(reviews, results)
for review, result in review_predictions:
    print(f"Review: {review}\nSentiment: {result['label']} with score {result['score']:.2f}\n")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


Review: This movie was absolutely fantastic! The acting, the plot, everything was perfect.
Sentiment: POSITIVE with score 1.00

Review: I did not enjoy this movie at all. The story was predictable and the acting was terrible.
Sentiment: NEGATIVE with score 1.00

Review: A solid performance by the lead actor, but the plot could have been better.
Sentiment: POSITIVE with score 0.68

Review: What a waste of time! I can't believe I sat through the whole movie.
Sentiment: NEGATIVE with score 1.00



In [2]:
## Fine-tune the model

from datasets import load_dataset

# Load the IMDb dataset by name
dataset = load_dataset(path="imdb")

# Inspect the result: the splits, their columns and their row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
from transformers import AutoTokenizer

# Tokenizer that matches the `distilbert-base-uncased` checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every text is truncated and padded to exactly 512 tokens, so all
    encoded examples end up with the same length.

    Args:
        examples: A batch from the dataset; only ``examples["text"]`` is read.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return encoded


# Apply the tokenization to every split, processing examples in batches
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:10<00:00, 2327.72 examples/s]


In [4]:
from transformers import AutoModelForSequenceClassification, set_seed

# Seed the random number generators *before* the model is built. The
# `distilbert-base-uncased` checkpoint has no classification head, so the
# `pre_classifier`/`classifier` weights are newly (randomly) initialized when
# the model is loaded. Without a seed, every fresh kernel starts fine-tuning
# from different weights and the results are not reproducible.
SEED = 42  # matches the documented default of `TrainingArguments.seed`
set_seed(SEED)

# Load DistilBERT with a sequence-classification head (2 output classes)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import numpy as np
from transformers import TrainingArguments, Trainer


def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Without a ``compute_metrics`` callback, ``trainer.evaluate()`` reports the
    loss but no task metric. This adds accuracy to the returned dictionary.

    Args:
        eval_pred: The ``EvalPrediction`` that ``Trainer`` passes in. Its
            ``predictions`` attribute holds the model outputs (per-class
            logits) and ``label_ids`` holds the reference labels.

    Returns:
        dict: ``{"accuracy": fraction of examples predicted correctly}``.
    """
    logits = eval_pred.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_labels = np.argmax(logits, axis=-1)
    accuracy = (predicted_labels == eval_pred.label_ids).mean()
    return {"accuracy": float(accuracy)}


# Training hyperparameters
training_args = TrainingArguments(
    output_dir="./results",             # directory for training outputs
    learning_rate=2e-5,                 # learning rate
    per_device_train_batch_size=16,     # batch size per device during training
    per_device_eval_batch_size=16,      # batch size per device during evaluation
    num_train_epochs=3,                 # number of training epochs
    weight_decay=0.01,                  # weight decay (regularization)
)

# Build the Trainer
trainer = Trainer(
    model=model,                        # model to fine-tune
    args=training_args,                 # training hyperparameters
    train_dataset=tokenized_datasets["train"],  # training split
    eval_dataset=tokenized_datasets["test"],    # evaluation split
    compute_metrics=compute_metrics,    # report accuracy in addition to the loss
)

In [None]:
# Train the model: run the training loop of the `trainer` built earlier
# (uses the model, training arguments and training split it was given).
trainer.train()

In [None]:
# Evaluate the model on the evaluation dataset given to `trainer`
# and print the returned metrics.
results = trainer.evaluate()
print(results)

In [None]:
## Save the fine-tuned model

# Directory that receives both the model and the tokenizer files
model_path = "./fine_tuned_distilbert_imdb"

# Write the model first, then the tokenizer, into the same directory so the
# folder can be loaded later as a single checkpoint.
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)