🤗 라이브러리 설치 후 런타임 재시작 🤗

In [None]:
# Install transformers, sentencepiece, datasets and accelerate via pip;
# -qqq is pip's quietest output level.
!pip install transformers sentencepiece datasets accelerate -qqq

### 3.1.3. 🤗 허브에서 불러오기

In [None]:
from datasets import load_dataset

# dataset = load_dataset('wikitext', 'wikitext-103-v1')


## 3.2. 데이터 전처리 : Dataset.map()

### 3.2.1. Dataset.map() 기본 용법

In [None]:
from datasets import load_dataset

# Load only the first 1,000 rows of the IMDB training split.
dataset = load_dataset('imdb', split="train[:1000]")


def add_length(example):
    """Return a new 'length' column value: the character count of 'text'.

    Dataset.map() merges the returned dict into the row, so the input
    example does not have to be mutated.
    """
    return {'length': len(example['text'])}


# Apply add_length to every row. The following cells read train_dataset and
# its 'length' column, which were otherwise never defined at this point.
train_dataset = dataset.map(add_length)




In [None]:
# Display the 'length' field of the first row of train_dataset.
train_dataset[0]['length']

In [None]:
# Character count of the first row's 'text', computed directly
# (presumably to compare against the 'length' field shown in the previous cell).
len(train_dataset[0]['text'])

### 3.2.2. 토크나이저 적용

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the name "bert-base-uncased".
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



### 3.2.3. 병렬 처리

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the name "bert-base-uncased".
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(examples):
    """Tokenize the "text" entries of a batch, capped at 50 tokens each.

    Args:
        examples: A batch from Dataset.map(batched=True); examples["text"]
            is the list of raw strings to encode.

    Returns:
        The tokenizer's encoding for the batch, padded to the longest
        sequence in the batch.
    """
    # truncation=True is required for max_length to take effect: with
    # padding=True alone the tokenizer ignores max_length (and warns),
    # leaving long reviews at their full length.
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=50)

# Run tokenize over the dataset; batched=True passes it batches of rows
# instead of one row per call.
tokenized_dataset = dataset.map(tokenize, batched=True)

# 4. 훈련 API (Trainer)

## 4.2. Trainer API 사용 예제

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset


# 1. Task definition: text generation

# 2. Load the training data (first 1,000 rows of the wikitext-103-raw-v1 train split)
train_dataset = load_dataset('wikitext', 'wikitext-103-raw-v1', split='train[:1000]')


# 3. Load the tokenizer and the model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token; the padded tokenization later in
# this cell needs the tokenizer to have a pad token.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')


# 4. Preprocess the training data
def tokenize_function(examples):
    """Tokenize examples["text"], truncating and padding every entry to 512 tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Tokenize the dataset with map(), dropping the raw "text" column afterwards
tokenized_datasets = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Data collator for language modeling; mlm=False turns off masked-LM masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)



In [None]:
# 5. Train the model
# Training configuration
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=1000,
    save_total_limit=2,
)

# Build the Trainer from the model, the settings, the tokenized data and the collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Start training
trainer.train()

## 4.3. DataCollator

### DataCollatorForLanguageModeling

In [None]:
from transformers import DataCollatorForLanguageModeling

# Language-modeling collator built on the current tokenizer, with masked-LM masking off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### DataCollatorForTokenClassification

In [None]:
from transformers import DataCollatorForTokenClassification

# Token-classification collator built on the current tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

### DataCollatorWithPadding

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator built on the current tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### DataCollatorForSeq2Seq

In [None]:
from transformers import DataCollatorForSeq2Seq

# Sequence-to-sequence collator built on the current tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)