In [416]:
!pip install datasets sacremoses

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [417]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [418]:
train_ds = load_dataset("stanfordnlp/imdb", split="train[:5%]")
test_ds = load_dataset("stanfordnlp/imdb", split="test[:5%]")

In [419]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [420]:
def collate_fn(batch):
  max_len = 400
  texts, labels = [], []
  for row in batch:
    labels.append(row['label'])
    texts.append(row['text'])

  texts = torch.LongTensor(tokenizer(texts, padding=True, truncation=True, max_length=max_len).input_ids)
  labels = torch.LongTensor(labels)

  return texts, labels

train_loader = DataLoader(
    train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_ds, batch_size=16, shuffle=False, collate_fn=collate_fn
)
# Out of memory 오류로 batch size 줄임

In [421]:
# text, label 확인하기
for texts, labels in train_loader:
    print("texts shape:", texts.shape)
    print("labels shape:", labels.shape)
    print("texts (token IDs):", texts[:2])
    print("labels:", labels[:10])
    break

texts shape: torch.Size([16, 400])
labels shape: torch.Size([16])
texts (token IDs): tensor([[  101,  2004,  1037,  2309,  2450,  2058,  2871,  1010,  1045,  2179,
          2023,  2143,  5186, 23979,  1998, 17183, 11219,  2075,  2000,  2309,
          2308,  2058,  2871,  1010,  2025,  2000,  5254,  2296,  2060,  2450,
          1010,  1997,  2151,  2287,  1012,  2009,  2001,  1037,  6517,  1010,
         17203,  3535,  2011,  1037,  2158,  2000,  4339,  1998,  3622,  1037,
          1000, 14556, 17312,  1000,  1010,  1998,  2009,  3478, 28616,  6906,
          6321,  1012,  5557, 25005,  3475,  1005,  1056,  2172,  1997,  2019,
          3883,  2000,  4088,  2007,  1010,  2021,  2445,  1996,  2512,  1011,
         25953,  1000,  5436,  1000,  1006,  1045,  5223,  2000,  2130,  6523,
          2000,  2009,  2004,  1037,  5436,  1007,  1999,  2023,  1010,  2016,
          2134,  1005,  1056,  2031,  1037,  3382,  1012,  2045,  2001,  2053,
          2839,  2458,  1010,  2053,  3114,  2

In [422]:
from torch import nn
from math import sqrt

In [423]:
# class SelfAttention(nn.Module):
#   def __init__(self, input_dim, d_model):
#     super().__init__()

#     self.input_dim = input_dim
#     self.d_model = d_model

#     self.wq = nn.Linear(input_dim, d_model)
#     self.wk = nn.Linear(input_dim, d_model)
#     self.wv = nn.Linear(input_dim, d_model)
#     self.dense = nn.Linear(d_model, d_model)

#     self.softmax = nn.Softmax(dim=-1)

#   def forward(self, x, mask):
#     q, k, v = self.wq(x), self.wk(x), self.wv(x)
#     score = torch.matmul(q, k.transpose(-1, -2))
#     score = score / sqrt(self.d_model)

#     if mask is not None:
#       score = score + (mask * -1e9)

#     score = self.softmax(score)
#     result = torch.matmul(score, v)
#     result = self.dense(result)

#     return result

# SelfAttention Module을 Multi-head attention으로 확장
class MultiHeadAttention(nn.Module):
    def __init__(self, input_dim, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0 # model의 차원은 head의 수로 나누어 떨어져야함

        self.input_dim = input_dim
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads

        self.wq = nn.Linear(input_dim, d_model)
        self.wk = nn.Linear(input_dim, d_model)
        self.wv = nn.Linear(input_dim, d_model)
        self.dense = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        print(x.shape)
        B, S, _ = x.shape
        
        q, k, v = self.wq(x), self.wk(x), self.wv(x) # (B, S, D)
        # print("q shape", q.shape)
        # print("k shape", k.shape)
        # print("v shape", v.shape)

        # Q, K, V (B, S, D)를 (B, S, H, D')로 reshape 
        # D = H X D' => D' = D / H = d_head

        # [step 1] (B, S, D) -> (B, H, S, D')
        q = q.view(B, S, self.n_heads, self.d_head).transpose(1, 2)
        k = k.view(B, S, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(B, S, self.n_heads, self.d_head).transpose(1, 2)

        # print("q reshaped:", q.shape)
        # print("k reshaped:", k.shape)
        # print("v reshaped:", v.shape)

        # [step 2] Attention score : (B, H, S, D') X (B, H, D', S) = (B, H, S, S)
        score = torch.matmul(q, k.transpose(-1, -2)) / sqrt(self.d_head)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)
            score = score + (mask * -1e9)

        score = self.softmax(score)
        # print("score", score.shape)
        
        result = torch.matmul(score, v) # (B, H, S, S) X (B, H, S, D') = (B, H, S, D')
        # print("result 1 shape", result.shape)

        # transpose(1, 2)하고 나면 (B, S, H, D')
        # 다시 (S, D)로 reshape
        # contiguous()는 transpose하고 나서 tensor의 연속성을 보장해주기위해 사용함 (안 하면 오류 발생)
        result = result.transpose(1, 2).contiguous().view(B, S, self.d_model) 
        # print("result 2 shape", result.shape)
        result = self.dense(result)

        return result

In [424]:
# 테스트 코드
B, S, input_dim = 2, 5, 32
x = torch.randn(B, S, input_dim)
mha = MultiHeadAttention(input_dim=32, d_model=32, n_heads=2)
_ = mha(x)

torch.Size([2, 5, 32])


* contiguous를 안 넣어줬을 때  
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [426]:
class TransformerLayer(nn.Module):
  def __init__(self, input_dim, d_model, dff, n_heads, dropout=0.1):
    super().__init__()

    self.input_dim = input_dim
    self.d_model = d_model
    self.dff = dff
    self.n_heads = n_heads

    self.mha = MultiHeadAttention(input_dim, d_model, n_heads)
    
    self.ffn = nn.Sequential(
      nn.Linear(d_model, dff),
      nn.ReLU(),
      nn.Linear(dff, d_model)
    )

    self.dropout1 = nn.Dropout(dropout)
    self.norm1 = nn.LayerNorm(d_model)

    self.dropout2 = nn.Dropout(dropout)
    self.norm2 = nn.LayerNorm(d_model)

  def forward(self, x, mask):
    x1 = self.mha(x, mask)
    x1 = self.dropout1(x1)
    x1 = self.norm1(x1 + x)

    x2 = self.ffn(x1)
    x2 = self.dropout2(x2)
    x2 = self.norm2(x2 + x1)

    return x2

In [427]:
import numpy as np

def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, None], np.arange(d_model)[None, :], d_model)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[None, ...]

    return torch.FloatTensor(pos_encoding)

max_len = 400
print(positional_encoding(max_len, 256).shape)

torch.Size([1, 400, 256])


In [428]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, dff, n_heads):
        super().__init__()

        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.dff = dff
        self.n_heads = n_heads

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.parameter.Parameter(positional_encoding(max_len, d_model), requires_grad=False)
        self.layers = nn.ModuleList([
            TransformerLayer(d_model, d_model, dff, n_heads) for _ in range(n_layers)
        ])
        self.classification = nn.Linear(d_model, 2) # 출력 차원을 2 (0: 부정, 1: 긍정)

    def forward(self, x):
        mask = (x == tokenizer.pad_token_id)
        mask = mask[:, None, :]
        # print(mask.shape)
        seq_len = x.shape[1]

        x = self.embedding(x)
        x = x * sqrt(self.d_model)
        x = x + self.pos_encoding[:, :seq_len]

        for layer in self.layers:
            x = layer(x, mask)

        x1 = x[:, -1]
        # x2 = x[:, 0]
        # print(x1, x2)
        x1 = self.classification(x1)
        # x2 = self.classification(x2)
        return x1


In [429]:
# 5-layer 4-head Transformer 모델
model = TextClassifier(len(tokenizer), d_model=64, n_layers=5, dff=128, n_heads=4)

In [430]:
print(torch.__version__)
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

2.6.0
True
True


In [431]:
from torch.optim import Adam

lr = 0.001
model = model.to(device)
loss_fn = nn.CrossEntropyLoss() # loss_fn 변경 : 다중 분류에 적합한 CrossEntropyLoss

optimizer = Adam(model.parameters(), lr=lr)

In [432]:
def accuracy(model, dataloader):
  correct = 0
  total = 0

  for data in dataloader:
    inputs, labels = data
    inputs, labels = inputs.to(device), labels.to(device)
    
    outputs = model(inputs) # 출력의 차원은 (B, vocab_size)
    print(outputs.shape)
    preds = torch.argmax(outputs, dim=-1) # 마지막 차원에서 가장 큰 값을 고름
      
    correct += (labels == preds).sum().item()
    total += labels.size(0)

  return correct / total

In [433]:
train_accs = []
test_accs = []

n_epochs = 50

for epoch in range(n_epochs):
    total_loss = 0.
    model.train()

    for data in train_loader:
        model.zero_grad()
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        print("labels", labels[:0])

        preds = model(inputs)
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch + 1} | Train Loss: {total_loss}')

    with torch.no_grad():
        model.eval()
        train_acc = accuracy(model, train_loader)
        test_acc = accuracy(model, test_loader)
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        print(f'=====> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}')

labels tensor([], device='mps:0', dtype=torch.int64)
torch.Size([16, 400, 64])


RuntimeError: shape '[16, 400, 64]' is invalid for input of size 6553600