# 패키지 설치

In [1]:
!pip install transformers datasets accelerate evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# 1. 문장 생성(이어쓰기)
- https://huggingface.co/docs/transformers/en/model_doc/gpt2

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

prompt = "The cake was so delicious that"
inputs = tokenizer(prompt, return_tensors="pt")

gen_tokens = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
print(gen_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The cake was so delicious that when I told my mom that I was getting ready to take it out, she said, "Oh, you've got to go, but you must also go to the bathroom in the middle of this one."

She gave me one that tasted better than my other. She put it in my dresser and said to see the picture. We did, and eventually we settled down and enjoyed our time together. It's also a good reminder to be grateful for the


# 2. 리뷰 감성 분석
## 데이터셋 불러오기

In [5]:
from datasets import load_dataset

dataset = load_dataset("jhan21/amazon-food-reviews-dataset")

dataset["train"] = dataset["train"].select(range(1000)) # 일부만 선택
dataset = dataset["train"].train_test_split(test_size=0.2)

Generating train split: 100%|██████████| 568454/568454 [00:02<00:00, 278529.32 examples/s]


In [6]:
# 어떤 컬럼이 label인지 확인
print(dataset["train"].features)

{'Id': Value('int64'), 'ProductId': Value('string'), 'UserId': Value('string'), 'ProfileName': Value('string'), 'HelpfulnessNumerator': Value('int64'), 'HelpfulnessDenominator': Value('int64'), 'Score': Value('int64'), 'Time': Value('int64'), 'Summary': Value('string'), 'Text': Value('string')}


## 토크나이저 준비 및 데이터 전처리

In [7]:
from transformers import AutoTokenizer, DataCollatorWithPadding

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(example["Text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 800/800 [00:00<00:00, 16545.99 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 13543.56 examples/s]


## 포맷 지정 및 label 컬럼 명시

In [8]:
# label 컬럼 이름을 labels로 지정
tokenized_datasets = tokenized_datasets.rename_column("Score", "labels")

# 모델 학습을 위해 torch 텐서 포맷으로 지정
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]


In [9]:
print(max(set(train_dataset["labels"])))        # 중복 제거를 위해 set 사용
print(min(set(train_dataset["labels"])))

tensor(5)
tensor(1)


In [10]:
# 레이블을 0~4로 변경
def shift_labels(example):
    example["labels"] -= 1
    return example

train_dataset = train_dataset.map(shift_labels)
eval_dataset = eval_dataset.map(shift_labels)

Map: 100%|██████████| 800/800 [00:00<00:00, 2813.82 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 2859.66 examples/s]


In [11]:
print(max(set(train_dataset["labels"])))
print(min(set(train_dataset["labels"])))

tensor(4)
tensor(0)


## 모델 로드 (문장 분류용)

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 평가 지표 정의 (정확도)

In [14]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred              # 둘 다 np.ndarray(댜차원 배열)
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)


Downloading builder script: 4.20kB [00:00, 3.65MB/s]


## Trainer 설정 및 학습

In [19]:
import torch
torch.cuda.empty_cache()

In [20]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir="test-trainer",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=300, training_loss=0.7923973083496094, metrics={'train_runtime': 111.2565, 'train_samples_per_second': 21.572, 'train_steps_per_second': 2.696, 'total_flos': 308362129304880.0, 'train_loss': 0.7923973083496094, 'epoch': 3.0})

## 평가 및 예측

In [21]:
import numpy as np

eval_result = trainer.evaluate()
print(f"Test Accuracy: {eval_result['eval_accuracy']:.4f}")

# 예측 테스트
text = "This product is awesome!"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model(**inputs)
pred = outputs.logits.argmax(dim=1).item()
print("예측 레이블:", pred)

Test Accuracy: 0.6900
예측 레이블: 4


## 3. 리뷰 요약

### 데이터셋불러오기

In [23]:
import kagglehub
import pandas as pd
from datasets import Dataset

# 데이터셋 다운로드 및 불러오기
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")
df = pd.read_csv(f"{path}/Reviews.csv")
df = df[['Text', 'Summary']].iloc[:50000]

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)
print(dataset['train'][0])

Downloading from https://www.kaggle.com/api/v1/datasets/download/snap/amazon-fine-food-reviews?dataset_version_number=2...


100%|██████████| 242M/242M [00:07<00:00, 33.1MB/s] 

Extracting files...





{'Text': 'The product is advertised.  I was a little leery about buying it due to all the negative reviews about the dents during shipping, etc.  I received the 2 cases with no shipping delays.  A few of the peripheral cans had very minor dents but nothing worse than what I would purchase in a store.  No issues for me.', 'Summary': 'shipped just fine'}


In [24]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

# Text 컬럼
text_lengths = [
    len(tokenizer(text=t, add_special_tokens=False)["input_ids"])  # ← text=
    for t in dataset["train"]["Text"]
    if t                                                          # None/빈 문자열 방어
]

# Summary 컬럼
summary_lengths = [
    len(tokenizer(text=s, add_special_tokens=False)["input_ids"])  # ← text=
    for s in dataset["train"]["Summary"]
    if s
]

def print_stats(name, lengths):
    print(f"[{name}]")
    print(f"  평균 길이   : {np.mean(lengths):.1f}")
    print(f"  90% 이하 길이: {np.percentile(lengths, 90):.0f}")
    print(f"  최대 길이   : {np.max(lengths)}\n")

print_stats("Text", text_lengths)
print_stats("Summary", summary_lengths)

[Text]
  평균 길이   : 103.0
  90% 이하 길이: 207
  최대 길이   : 2363

[Summary]
  평균 길이   : 5.8
  90% 이하 길이: 10
  최대 길이   : 63



## 토크나이저 준비 및 데이터 전처리

In [25]:
# 텍스트 입력 길이 (인코더)
max_input_length  = 256   # 90% 길이 207을 256(2의 거듭제곱)으로 여유 있게 상향

# 요약 타깃 길이 (디코더)
max_target_length = 16    # 90% 길이 10을 근사 16(8의 배수)으로 상향

In [26]:
def preprocess_function(examples):
    # None · 숫자 · NaN 등을 깨끗한 문자열로 변환
    texts     = [str(t) if t is not None else "" for t in examples["Text"]]
    summaries = [str(s) if s is not None else "" for s in examples["Summary"]]

    inputs = tokenizer(
        text=texts,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    targets = tokenizer(
        text=summaries,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,   # 필요 없으면 지워도 됨
)

tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

train_dataset = tokenized_datasets["train"]
eval_dataset  = tokenized_datasets["test"]

Map: 100%|██████████| 40000/40000 [00:03<00:00, 12771.64 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 11590.90 examples/s]


## 모델 로드 (BART 요약용)

In [27]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

## 평가 지표 정의 (ROUGE)

In [28]:
!pip install -q rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script: 6.14kB [00:00, 4.94MB/s]


## Trainer 설정 및 학습

In [30]:
from transformers import TrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

training_args = TrainingArguments(
    output_dir="test-trainer-2",              # 결과 저장 폴더
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,     # 정확도 측정 함수
)

trainer.train()

Step,Training Loss
500,2.0318
1000,1.5695
1500,1.4741
2000,1.442
2500,1.4284
3000,1.4445
3500,1.4242
4000,1.4201
4500,1.3607
5000,1.357




TrainOutput(global_step=15000, training_loss=1.222372499593099, metrics={'train_runtime': 4271.4264, 'train_samples_per_second': 28.094, 'train_steps_per_second': 3.512, 'total_flos': 1.82920937472e+16, 'train_loss': 1.222372499593099, 'epoch': 3.0})

In [31]:
test_text = "I absolutely loved this product. The flavor was amazing and I will definitely buy it again."

inputs = tokenizer(test_text, return_tensors="pt", max_length=256, truncation=True).to(device)
summary_ids = model.generate(**inputs, max_length=48, num_beams=4, early_stopping=True)

print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

Delicious!


In [32]:
import torch

# 1. GPU 장치 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 2. test 셋 선택
test_texts = dataset["test"]["Text"][:10]  # 예시로 10개만 추론

# 3. 요약 결과 저장용 리스트
summaries = []

# 4. 문장 하나씩 요약
for text in test_texts:
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding="max_length"
    ).to(device)  # 입력도 GPU로 이동

    summary_ids = model.generate(
        **inputs,
        max_length=48,
        num_beams=4,
        early_stopping=True
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)

# 5. 출력 확인
for i in range(len(test_texts)):
    print(f"[원문] {test_texts[i]}")
    print(f"[요약] {summaries[i]}\n")

[원문] My husband (who, being Mexican, is very picky about his tortilla chips) and I absolutely love these!  The texture is light and crispy, rather than thick and crunchy. He actually usually prefers a very hearty, cruncy chip (Like El Ranchero), but the flavor of these is so fantastic that we're both thilled with them. The bean, rice and corn base makes them incredibly flavorful, and they have a touch of onion and garlic in addition to that. We go through an embarrassing amount of them.  I never, ever like plain chips, but these I can eat without anything else, although they're particularly amazing with a fresh salsa.  I highly recommend these!
[요약] Best tortilla chips ever!

[원문] Timothy's World Hazelnut Decaf Coffee is great tasting for decaf coffee.  Excellent service. Product arrived within 4 days after ordering.
[요약] Great tasting decaf coffee

[원문] Yes, these do contain an occasional stem or seed.  I just pull it off before eating and life goes on.  These are a great snack for we