In [22]:
import torch
from transformers import BertTokenizer, BertModel # bert model은 주로 임베딩 추출, 특성 추출

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# One checkpoint name drives both the tokenizer and the encoder so their
# vocabulary and weights always match.
model_ckpt = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_ckpt)

# Plain BertModel (used here for embedding / feature extraction), placed on
# the selected device.
model = BertModel.from_pretrained(model_ckpt)
model = model.to(device)



cuda


In [34]:
texts = ["Hello, how are you?", "Transformers are awesome!"]

# padding=True pads the shorter sentence so both rows share one length;
# return_tensors='pt' yields PyTorch tensors.
tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
print(f"tokenizing 한 직후: {tokenized}\n\n")

# Copy every tensor in the encoding onto the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in tokenized.items()}

print(inputs)

tokenizing 한 직후: {'input_ids': tensor([[  101,  7592,  1010,  2129,  2024,  2017,  1029,   102],
        [  101, 19081,  2024, 12476,   999,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])}


{'input_ids': tensor([[  101,  7592,  1010,  2129,  2024,  2017,  1029,   102],
        [  101, 19081,  2024, 12476,   999,   102,     0,     0]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]], device='cuda:0')}


In [24]:
# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state: one hidden vector per token -> (batch_size, seq_len, hidden_size)
# pooler_output: one vector per sentence -> (batch_size, hidden_size). Per the
#   Transformers docs this is the [CLS] hidden state after BERT's pooling layer
#   (dense + tanh), not the raw [CLS] row of last_hidden_state.
last_hidden_state, pooler_output = outputs.last_hidden_state, outputs.pooler_output

print("last_hidden_state.shape:", last_hidden_state.shape)
print("pooler_output.shape:", pooler_output.shape)

last_hidden_state.shape: torch.Size([2, 8, 768])
pooler_output.shape: torch.Size([2, 768])


In [36]:
# Map the first sentence's token ids back to token strings (special tokens included).
tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0])

['[CLS]', 'hello', ',', 'how', 'are', 'you', '?', '[SEP]']

In [37]:
# Decode the first sentence's token ids into one string; special tokens are kept.
tokenizer.decode(tokenized['input_ids'][0])

'[CLS] hello, how are you? [SEP]'

In [38]:
# Decode the first sentence's token ids to text, dropping special tokens such as [CLS] and [SEP].
tokenizer.decode(tokenized['input_ids'][0], skip_special_tokens = True)

'hello, how are you?'

### Sentiment Classifier

In [27]:
from transformers import BertForSequenceClassification

# Load pretrained BERT with a 2-label sequence-classification head.
# NOTE: only the encoder weights come from the checkpoint; the classification
# head is newly initialised, so predictions below are not meaningful until the
# model has been fine-tuned.
classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
classifier = classifier.to(device)
classifier.eval()  # evaluation mode for inference

texts = ["I love this movie!", "This movie is terrible..."]

# Tokenize the batch and move each resulting tensor to the model's device.
inputs = {
    k: v.to(device)
    for k, v in tokenizer(texts, padding=True, truncation=True, return_tensors="pt").items()
}

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = classifier(**inputs)

# Pick the highest-scoring class for every sentence in the batch.
logits = outputs.logits
predicted_class = logits.argmax(dim=1)

print(f"\nPredicted class: {predicted_class.cpu().numpy()}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Predicted class: [0 0]


### Base BERT Fine-tuning

In [46]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# Custom dataset
class SimpleDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of class labels aligned index-by-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``. It
            must return a mapping of tensors with a leading batch dimension.
        max_len: Length passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors plus ``'labels'``."""
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',  # pad up to max_length
            truncation=True,
            return_tensors='pt'
        )
        # Drop the leading batch dimension of each tensor; the DataLoader
        # adds its own batch dimension when collating items.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        # Class-index targets must be int64 for CrossEntropyLoss, whatever
        # Python/NumPy type the label was stored as.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [47]:
# Seed PyTorch's RNG so the randomly initialised classification head and the
# DataLoader shuffle order are repeatable across "Restart & Run All"
# (GPU kernels may still introduce small nondeterminism).
SEED = 42
torch.manual_seed(SEED)

texts = ["I love this!", "This is bad.", "Amazing work.", "I hate it."]
labels = [1, 0, 1, 0]  # 1: positive, 0: negative

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = SimpleDataset(texts, labels, tokenizer)
# drop_last=True discards a trailing partial batch, so every batch has exactly
# batch_size examples.
loader = DataLoader(dataset, batch_size=2, shuffle=True, drop_last = True)

# BERT encoder plus a freshly initialised 2-label classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# AdamW updates the model weights during training.
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Training function
def train(model, dataloader):
    """Run one training epoch over ``dataloader`` and print mean loss and accuracy.

    Relies on the notebook-level globals ``optimizer``, ``criterion`` and
    ``device`` being defined before the call.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``.logits`` tensor of per-class scores.
        dataloader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        None. The epoch's mean per-batch loss and accuracy are printed.
    """
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0
    for batch in dataloader:
        optimizer.zero_grad()

        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        preds = torch.argmax(logits, dim=1)
        total_correct += (preds == labels).sum().item()
        # Count the examples actually processed: with drop_last=True (or any
        # loader not covering the full dataset) this differs from the dataset
        # size, so dividing by the dataset length would understate accuracy.
        total_samples += labels.size(0)

    # An empty loader yields NaN metrics rather than a ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / total_samples if total_samples else float('nan')
    print(f"\nLoss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

# Fine-tune for a small, fixed number of epochs.
num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}")  # epochs are reported 1-based
    train(model, loader)

Epoch 1

Loss: 0.8452, Accuracy: 0.5000
Epoch 2

Loss: 0.4620, Accuracy: 1.0000
Epoch 3

Loss: 0.2794, Accuracy: 1.0000
Epoch 4

Loss: 0.2255, Accuracy: 1.0000
Epoch 5

Loss: 0.2172, Accuracy: 1.0000
