In [1]:
import torch
import torch.nn as nn

class EncoderOnlyTransformer(nn.Module):
    """Stack of ``L`` encoder layers applied one after another.

    Args:
        T: Sequence length. Passed through to every ``EncoderLayer``.
        D: Embedding dimension.
        H: Number of attention heads.
        L: Number of layers in the stack.
    """

    def __init__(self, T, D, H, L):
        super().__init__()
        stack = nn.ModuleList()
        for _ in range(L):
            stack.append(EncoderLayer(T, D, H))
        self.layers = stack

    def forward(self, x):
        """Feed ``x`` through each layer in order and return the last output."""
        hidden = x
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        return hidden

class EncoderLayer(nn.Module):
    """Encoder block: self-attention followed by a position-wise feed-forward net.

    Each sub-layer's output is added to its input and the sum is passed through
    a ``LayerNorm``.

    Args:
        T: Sequence length (accepted but not used inside this layer).
        D: Embedding dimension.
        H: Number of attention heads.
    """

    def __init__(self, T, D, H):
        super().__init__()
        ff_width = D * 4  # hidden width of the feed-forward sub-layer
        # Creation order matters: it fixes both the parameter registration order
        # and the order in which random initial weights are drawn.
        self.self_attention = MultiHeadSelfAttention(D, H)
        expand = nn.Linear(D, ff_width)
        activation = nn.ReLU()
        contract = nn.Linear(ff_width, D)
        self.positionwise_ff = nn.Sequential(expand, activation, contract)
        self.norm1 = nn.LayerNorm(D)
        self.norm2 = nn.LayerNorm(D)

    def forward(self, x):
        """Return the layer output for ``x``; the output has the same shape as ``x``."""
        # Residual connection around attention, then normalise.
        attended = self.norm1(x + self.self_attention(x))
        # Residual connection around the feed-forward net, then normalise.
        return self.norm2(attended + self.positionwise_ff(attended))

class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention split across ``H`` heads, without masking.

    Args:
        D: Embedding dimension; input and output size of every linear projection.
        H: Number of heads. ``forward`` reshapes the features to ``(H, D // H)``,
            which only succeeds when ``D`` is divisible by ``H``.
    """

    def __init__(self, D, H):
        super().__init__()
        self.H, self.D = H, D
        self.query = nn.Linear(D, D)
        self.key = nn.Linear(D, D)
        self.value = nn.Linear(D, D)
        self.proj = nn.Linear(D, D)

    def _split_heads(self, projected, B, T, head_dim):
        """Reshape ``(B, T, D)`` to ``(B, H, T, head_dim)``."""
        return projected.view(B, T, self.H, head_dim).transpose(1, 2)

    def forward(self, x):
        """Attend over the sequence axis of ``x`` (shape ``(B, T, D)``).

        Returns:
            Tensor of shape ``(B, T, D)``.
        """
        B, T, D = x.size()
        head_dim = D // self.H
        q = self._split_heads(self.query(x), B, T, head_dim)
        k = self._split_heads(self.key(x), B, T, head_dim)
        v = self._split_heads(self.value(x), B, T, head_dim)
        # Dot-product similarities scaled by sqrt(head_dim).
        logits = torch.matmul(q, k.transpose(-2, -1)) / head_dim ** 0.5
        weights = logits.softmax(dim=-1)
        # Merge the heads back into a single feature axis.
        merged = torch.matmul(weights, v).transpose(1, 2).contiguous().view(B, T, D)
        return self.proj(merged)


In [2]:
class DecoderOnlyTransformer(nn.Module):
    """Stack of ``L`` decoder layers applied sequentially.

    Args:
        T: Sequence length. Passed through to every ``DecoderLayer``.
        D: Embedding dimension.
        H: Number of attention heads.
        L: Number of layers in the stack.
    """

    def __init__(self, T, D, H, L):
        super().__init__()
        decoder_blocks = [DecoderLayer(T, D, H) for _ in range(L)]
        self.layers = nn.ModuleList(decoder_blocks)

    def forward(self, x):
        """Run ``x`` through every decoder layer in order and return the result."""
        for index in range(len(self.layers)):
            x = self.layers[index](x)
        return x

class DecoderLayer(nn.Module):
    """Decoder block: masked self-attention followed by a feed-forward net.

    Each sub-layer's output is added to its input and the sum is passed through
    a ``LayerNorm``. Masking is requested from the attention module by calling
    it with ``mask=True``.

    Args:
        T: Sequence length (accepted but not used inside this layer).
        D: Embedding dimension.
        H: Number of attention heads.
    """

    def __init__(self, T, D, H):
        super().__init__()
        hidden_width = D * 4  # hidden width of the feed-forward sub-layer
        # Creation order matters: it fixes both the parameter registration order
        # and the order in which random initial weights are drawn.
        self.self_attention = MultiHeadSelfAttention(D, H)
        ff_stages = [
            nn.Linear(D, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, D),
        ]
        self.positionwise_ff = nn.Sequential(*ff_stages)
        self.norm1 = nn.LayerNorm(D)
        self.norm2 = nn.LayerNorm(D)

    def forward(self, x):
        """Return the layer output for ``x``; the output has the same shape as ``x``."""
        masked_attention = self.self_attention(x, mask=True)
        normed = self.norm1(x + masked_attention)
        return self.norm2(normed + self.positionwise_ff(normed))

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an optional causal mask.

    Args:
        D: Embedding dimension; input and output size of every linear projection.
        H: Number of heads. ``forward`` reshapes the features to ``(H, D // H)``,
            which only succeeds when ``D`` is divisible by ``H``.
    """

    def __init__(self, D, H):
        super().__init__()
        self.H, self.D = H, D
        self.query = nn.Linear(D, D)
        self.key = nn.Linear(D, D)
        self.value = nn.Linear(D, D)
        self.proj = nn.Linear(D, D)

    def forward(self, x, mask=False):
        """Attend over the sequence axis of ``x`` (shape ``(B, T, D)``).

        Args:
            x: Input tensor with three dimensions ``(B, T, D)``.
            mask: When truthy, position ``i`` may not attend to positions ``> i``.

        Returns:
            Tensor of shape ``(B, T, D)``.
        """
        B, T, D = x.size()
        d_head = D // self.H
        per_head = []
        for projection in (self.query, self.key, self.value):
            # (B, T, D) -> (B, H, T, d_head)
            per_head.append(projection(x).view(B, T, self.H, d_head).transpose(1, 2))
        q, k, v = per_head
        logits = q @ k.transpose(-2, -1) / d_head ** 0.5
        if mask:
            # True strictly above the diagonal marks the future positions to hide.
            future = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)
            logits = logits.masked_fill(future, float('-inf'))
        weights = torch.softmax(logits, dim=-1)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, D)
        return self.proj(merged)


In [4]:
from torchinfo import summary

def profile_model(model, input_shape):
    """Print a torchinfo summary of ``model`` for an input of ``input_shape``.

    Args:
        model: Module to summarise.
        input_shape: Input size handed to ``summary`` as ``input_size``.

    Returns:
        None. The summary is written to stdout.
    """
    # torchinfo prints the table itself when ``verbose`` is 1, which is its
    # default outside notebooks. Pinning ``verbose=0`` leaves the ``print`` below
    # as the only writer, so the table appears exactly once in any environment.
    print(summary(model, input_size=input_shape, verbose=0))

In [3]:
# Model hyper-parameters
T = 128  # sequence length
D = 512  # embedding dimension
H = 8    # number of attention heads
L = 6    # number of layers

# Batch size of the input tensor
batch_size = 16

# Encoder-only Transformer (built first, so its weights are initialised first)
encoder_model = EncoderOnlyTransformer(T, D, H, L)

# Decoder-only Transformer
decoder_model = DecoderOnlyTransformer(T, D, H, L)

In [6]:
# Print a per-layer summary (output shapes and parameter counts) of the encoder
# stack for an input of shape (batch_size, T, D).
print("Encoder-only Transformer Profiling:")
profile_model(encoder_model, (batch_size, T, D))

Encoder-only Transformer Profiling:
Layer (type:depth-idx)                        Output Shape              Param #
EncoderOnlyTransformer                        [16, 128, 512]            --
├─ModuleList: 1-1                             --                        --
│    └─EncoderLayer: 2-1                      [16, 128, 512]            --
│    │    └─MultiHeadSelfAttention: 3-1       [16, 128, 512]            1,050,624
│    │    └─LayerNorm: 3-2                    [16, 128, 512]            1,024
│    │    └─Sequential: 3-3                   [16, 128, 512]            2,099,712
│    │    └─LayerNorm: 3-4                    [16, 128, 512]            1,024
│    └─EncoderLayer: 2-2                      [16, 128, 512]            --
│    │    └─MultiHeadSelfAttention: 3-5       [16, 128, 512]            1,050,624
│    │    └─LayerNorm: 3-6                    [16, 128, 512]            1,024
│    │    └─Sequential: 3-7                   [16, 128, 512]            2,099,712
│    │    └─LayerNorm:

In [7]:
# Print a per-layer summary (output shapes and parameter counts) of the decoder
# stack for an input of shape (batch_size, T, D).
print("Decoder-only Transformer Profiling:")
profile_model(decoder_model, (batch_size, T, D))

Decoder-only Transformer Profiling:
Layer (type:depth-idx)                        Output Shape              Param #
DecoderOnlyTransformer                        [16, 128, 512]            --
├─ModuleList: 1-1                             --                        --
│    └─DecoderLayer: 2-1                      [16, 128, 512]            --
│    │    └─MultiHeadSelfAttention: 3-1       [16, 128, 512]            1,050,624
│    │    └─LayerNorm: 3-2                    [16, 128, 512]            1,024
│    │    └─Sequential: 3-3                   [16, 128, 512]            2,099,712
│    │    └─LayerNorm: 3-4                    [16, 128, 512]            1,024
│    └─DecoderLayer: 2-2                      [16, 128, 512]            --
│    │    └─MultiHeadSelfAttention: 3-5       [16, 128, 512]            1,050,624
│    │    └─LayerNorm: 3-6                    [16, 128, 512]            1,024
│    │    └─Sequential: 3-7                   [16, 128, 512]            2,099,712
│    │    └─LayerNorm:

In [8]:
class BERT(nn.Module):
    """Encoder-only stack whose layers all receive the same optional mask.

    Args:
        T: Sequence length. Passed through to every ``EncoderLayer``.
        D: Embedding dimension.
        H: Number of attention heads.
        L: Number of layers in the stack.
    """

    def __init__(self, T, D, H, L):
        super().__init__()
        encoder_stack = nn.ModuleList()
        for _ in range(L):
            encoder_stack.append(EncoderLayer(T, D, H))
        self.layers = encoder_stack

    def forward(self, x, mask=None):
        """Run ``x`` through every layer, handing ``mask`` to each one unchanged."""
        hidden = x
        for encoder_layer in self.layers:
            # The mask is an extra input forwarded to the layer.
            hidden = encoder_layer(hidden, mask=mask)
        return hidden

class EncoderLayer(nn.Module):
    """Encoder block that forwards an optional mask to its self-attention.

    Self-attention and a position-wise feed-forward net are applied in turn;
    each sub-layer's output is added to its input and the sum is passed through
    a ``LayerNorm``.

    Args:
        T: Sequence length (accepted but not used inside this layer).
        D: Embedding dimension.
        H: Number of attention heads.
    """

    def __init__(self, T, D, H):
        super().__init__()
        inner_dim = D * 4  # hidden width of the feed-forward sub-layer
        # Creation order matters: it fixes both the parameter registration order
        # and the order in which random initial weights are drawn.
        self.self_attention = MultiHeadSelfAttention(D, H)
        self.positionwise_ff = nn.Sequential(
            nn.Linear(D, inner_dim), nn.ReLU(), nn.Linear(inner_dim, D)
        )
        self.norm1 = nn.LayerNorm(D)
        self.norm2 = nn.LayerNorm(D)

    def forward(self, x, mask=None):
        """Return the layer output for ``x``; ``mask`` goes to self-attention as-is."""
        after_attention = self.norm1(x + self.self_attention(x, mask=mask))
        feed_forward = self.positionwise_ff(after_attention)
        return self.norm2(after_attention + feed_forward)

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an optional mask tensor.

    Args:
        D: Embedding dimension; input and output size of every linear projection.
        H: Number of heads. ``forward`` reshapes the features to ``(H, D // H)``,
            which only succeeds when ``D`` is divisible by ``H``.
    """

    def __init__(self, D, H):
        super().__init__()
        self.H, self.D = H, D
        self.query = nn.Linear(D, D)
        self.key = nn.Linear(D, D)
        self.value = nn.Linear(D, D)
        self.proj = nn.Linear(D, D)

    def forward(self, x, mask=None):
        """Attend over the sequence axis of ``x`` (shape ``(B, T, D)``).

        Args:
            x: Input tensor with three dimensions ``(B, T, D)``.
            mask: Optional tensor; entries equal to 0 mark attention scores that
                are set to ``-inf`` before the softmax. It is applied to scores
                of shape ``(B, H, T, T)``, so it must broadcast against that.

        Returns:
            Tensor of shape ``(B, T, D)``.
        """
        B, T, D = x.size()
        head_dim = D // self.H
        split_shape = (B, T, self.H, head_dim)
        q = self.query(x).view(*split_shape).transpose(1, 2)
        k = self.key(x).view(*split_shape).transpose(1, 2)
        v = self.value(x).view(*split_shape).transpose(1, 2)
        logits = torch.matmul(q, k.transpose(-2, -1)) / head_dim ** 0.5
        if mask is not None:
            # Apply masking: zeros in ``mask`` are the blocked positions.
            blocked = mask == 0
            logits = logits.masked_fill(blocked, float('-inf'))
        weights = logits.softmax(dim=-1)
        merged = torch.matmul(weights, v).transpose(1, 2).contiguous().view(B, T, D)
        return self.proj(merged)


In [9]:
class GPT(nn.Module):
    """Decoder-only stack of ``L`` layers applied sequentially.

    Args:
        T: Sequence length. Passed through to every ``DecoderLayer``.
        D: Embedding dimension.
        H: Number of attention heads.
        L: Number of layers in the stack.
    """

    def __init__(self, T, D, H, L):
        super().__init__()
        blocks = []
        for _ in range(L):
            blocks.append(DecoderLayer(T, D, H))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Run ``x`` through every decoder layer in order and return the result."""
        out = x
        for _, decoder_layer in enumerate(self.layers):
            out = decoder_layer(out)
        return out

class DecoderLayer(nn.Module):
    """Decoder block: causally masked self-attention, then a feed-forward net.

    Each sub-layer's output is added to its input and the sum is passed through
    a ``LayerNorm``.

    Args:
        T: Sequence length (accepted but not used inside this layer).
        D: Embedding dimension.
        H: Number of attention heads.
    """

    def __init__(self, T, D, H):
        super().__init__()
        ff_hidden = D * 4  # hidden width of the feed-forward sub-layer
        # Creation order matters: it fixes both the parameter registration order
        # and the order in which random initial weights are drawn.
        self.self_attention = MultiHeadSelfAttention(D, H)
        widen = nn.Linear(D, ff_hidden)
        relu = nn.ReLU()
        narrow = nn.Linear(ff_hidden, D)
        self.positionwise_ff = nn.Sequential(widen, relu, narrow)
        self.norm1 = nn.LayerNorm(D)
        self.norm2 = nn.LayerNorm(D)

    def forward(self, x):
        """Return the layer output for ``x``; the output has the same shape as ``x``."""
        # Apply the causal mask by asking the attention module for it.
        residual = x + self.self_attention(x, mask=True)
        normed = self.norm1(residual)
        return self.norm2(normed + self.positionwise_ff(normed))

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with optional masking.

    This is the last definition of the class in the notebook, so it is the one
    every layer built afterwards uses: ``DecoderLayer`` calls it with
    ``mask=True`` and ``EncoderLayer`` may forward a mask tensor (or ``None``).
    Both calling conventions are therefore supported here.

    Args:
        D: Embedding dimension; input and output size of every linear projection.
        H: Number of heads; must be positive and divide ``D``.

    Raises:
        ValueError: If ``H`` is not positive or ``D`` is not divisible by ``H``.
    """

    def __init__(self, D, H):
        super().__init__()
        # Fail at construction time instead of with an opaque ``view`` error
        # on the first forward pass.
        if H <= 0 or D % H != 0:
            raise ValueError(f"D ({D}) must be divisible by a positive H ({H})")
        self.H = H
        self.D = D
        self.query = nn.Linear(D, D)
        self.key = nn.Linear(D, D)
        self.value = nn.Linear(D, D)
        self.proj = nn.Linear(D, D)

    def forward(self, x, mask=False):
        """Attend over the sequence axis of ``x`` (shape ``(B, T, D)``).

        Args:
            x: Input tensor with three dimensions ``(B, T, D)``.
            mask: One of
                * ``False`` / ``None`` - no masking;
                * ``True`` - causal mask (position ``i`` cannot see ``> i``);
                * a tensor broadcastable to the ``(B, H, T, T)`` scores, in
                  which entries equal to 0 are blocked.

        Returns:
            Tensor of shape ``(B, T, D)``.

        Note:
            A query row whose scores are all blocked becomes all ``-inf`` and
            the softmax over it yields NaN.
        """
        B, T, D = x.size()
        head_dim = D // self.H
        Q = self.query(x).view(B, T, self.H, head_dim).transpose(1, 2)
        K = self.key(x).view(B, T, self.H, head_dim).transpose(1, 2)
        V = self.value(x).view(B, T, self.H, head_dim).transpose(1, 2)
        scores = Q @ K.transpose(-2, -1) / head_dim ** 0.5
        if isinstance(mask, torch.Tensor):
            # Tensor mask: taking the truth value of a multi-element tensor
            # raises, so it must be handled before the boolean-flag branch.
            scores = scores.masked_fill(mask.to(x.device) == 0, float('-inf'))
        elif mask:
            # Apply the causal mask, built directly on the input's device.
            causal_mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
            scores = scores.masked_fill(causal_mask, float('-inf'))
        attn = torch.softmax(scores, dim=-1)
        context = (attn @ V).transpose(1, 2).contiguous().view(B, T, D)
        return self.proj(context)


In [12]:
from torchinfo import summary

def profile_model(model, input_shape, model_name="Model"):
    """Print a labelled torchinfo summary of ``model``.

    Args:
        model: Module to summarise.
        input_shape: Input size handed to ``summary`` as ``input_size``.
        model_name: Label printed in the heading above the table.

    Returns:
        None. Returning nothing keeps a notebook cell that ends with this call
        from displaying the summary again as its ``Out[]`` value.
    """
    print(f"\n{model_name} Profiling:")
    # ``verbose=1`` makes torchinfo print the table itself, so wrapping that call
    # in ``print`` would emit the table twice. Build it quietly and print it once.
    report = summary(
        model,
        input_size=input_shape,
        depth=3,
        col_names=["output_size", "num_params", "trainable"],
        verbose=0,
    )
    print(report)

In [10]:
# Hyper-parameters
T = 128  # sequence length
D = 512  # embedding dimension
H = 8    # number of attention heads
L = 6    # number of layers
batch_size = 16

# Create the BERT and GPT models (BERT first, so its weights are initialised first)
bert_model = BERT(T, D, H, L)
gpt_model = GPT(T, D, H, L)

In [13]:
# Profile the BERT model for an input of shape (batch_size, T, D)
profile_model(bert_model, (batch_size, T, D), model_name="BERT (Encoder-only)")


BERT (Encoder-only) Profiling:
Layer (type:depth-idx)                        Output Shape              Param #                   Trainable
BERT                                          [16, 128, 512]            --                        True
├─ModuleList: 1-1                             --                        --                        True
│    └─EncoderLayer: 2-1                      [16, 128, 512]            --                        True
│    │    └─MultiHeadSelfAttention: 3-1       [16, 128, 512]            1,050,624                 True
│    │    └─LayerNorm: 3-2                    [16, 128, 512]            1,024                     True
│    │    └─Sequential: 3-3                   [16, 128, 512]            2,099,712                 True
│    │    └─LayerNorm: 3-4                    [16, 128, 512]            1,024                     True
│    └─EncoderLayer: 2-2                      [16, 128, 512]            --                        True
│    │    └─MultiHeadSelfAttention: 

In [14]:
# Profile the GPT model for an input of shape (batch_size, T, D)
profile_model(gpt_model, (batch_size, T, D), model_name="GPT (Decoder-only)")


GPT (Decoder-only) Profiling:
Layer (type:depth-idx)                        Output Shape              Param #                   Trainable
GPT                                           [16, 128, 512]            --                        True
├─ModuleList: 1-1                             --                        --                        True
│    └─DecoderLayer: 2-1                      [16, 128, 512]            --                        True
│    │    └─MultiHeadSelfAttention: 3-1       [16, 128, 512]            1,050,624                 True
│    │    └─LayerNorm: 3-2                    [16, 128, 512]            1,024                     True
│    │    └─Sequential: 3-3                   [16, 128, 512]            2,099,712                 True
│    │    └─LayerNorm: 3-4                    [16, 128, 512]            1,024                     True
│    └─DecoderLayer: 2-2                      [16, 128, 512]            --                        True
│    │    └─MultiHeadSelfAttention: 3