In [1]:
!pip install datasets transformers torch



In [7]:
from datasets import load_dataset
raw_dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")  # train/validation/test
train_dataset = raw_dataset["train"]

In [10]:
raw_dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [8]:
train_dataset[:5]

{'text': ['',
  ' = Valkyria Chronicles III = \n',
  '',
  ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
  " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making th

In [12]:
all_sentences = []

for one_row in train_dataset:
  sentence = one_row['text']
  sentence = sentence.strip()
  if sentence != '':
    all_sentences.append(sentence)

print("전체 문장 개수:", len(all_sentences))
print("앞의 예시 문장 3개:", all_sentences[:3])

전체 문장 개수: 23767
앞의 예시 문장 3개: ['= Valkyria Chronicles III =', 'Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .', "The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the gam

In [14]:
word_frequency = {}

for one_sentence in all_sentences:
  words = one_sentence.split()
  for one_word in words:
    token = " ".join(list(one_word)) + " </w>"

    if token not in word_frequency:
      word_frequency[token] = 1
    else:
      word_frequency[token] += 1

print("사전에 들어간 단어 개수:", len(word_frequency))
print("예시 5개:")
for index, (token, count) in enumerate(word_frequency.items()):
    if index >= 5:
        break
    print(token, ":", count)

사전에 들어간 단어 개수: 76616
예시 5개:
= </w> : 29570
V a l k y r i a </w> : 54
C h r o n i c l e s </w> : 47
I I I </w> : 231
S e n j ō </w> : 5


In [15]:
from collections import defaultdict

def get_pair_statistics(vocab):

  """
  vocab: {'t h e </w>': 1245, 'p l a y </w>': 45, ...}
  return: {('t', 'h'): 등장횟수, ('p', 'l'): 등장횟수, ...}
  """

  pair_freq = defaultdict(int)

  for word, freq in vocab.items():
    symbols = word.split()

    for i in range(len(symbols) - 1):
      left = symbols[i]
      right = symbols[i + 1]
      pair_freq[(left, right)] += freq

  return pair_freq

In [16]:
pair_stats = get_pair_statistics(word_frequency)

print("쌍(pair) 개수:", len(pair_stats))
print("예시 10개:")
for i, (pair, freq) in enumerate(pair_stats.items()):
    if i >= 10:
        break
    print(pair, ":", freq)


쌍(pair) 개수: 5504
예시 10개:
('=', '</w>') : 29570
('V', 'a') : 1021
('a', 'l') : 68991
('l', 'k') : 1329
('k', 'y') : 758
('y', 'r') : 972
('r', 'i') : 53566
('i', 'a') : 20723
('a', '</w>') : 56830
('C', 'h') : 5322


In [19]:
# 쌍(pair) 중에서 가장 자주 등장한 한 쌍 찾기
most_frequent_pair = max(pair_stats, key=pair_stats.get)
print("가장 자주 등장한 쌍:", most_frequent_pair, "→ 등장 횟수:", pair_stats[most_frequent_pair])


가장 자주 등장한 쌍: ('e', '</w>') → 등장 횟수: 315788


In [21]:
import re

def merge_vocab_pair(pair_to_merge, vocab_in):
    """
    pair_to_merge: ('a', 'l') 같은 쌍
    vocab_in: {'a l l </w>': 2, 'a l p h a </w>': 5, ...}
    return: 병합 후 새 vocab
    """

    # 정규식으로 'a l' 패턴을 찾기 (양쪽에 공백이 있는 경우만)
    pattern = re.escape(' '.join(pair_to_merge))
    pattern = r'(?<!\S)' + pattern + r'(?!\S)'

    new_vocab = {}
    for word, freq in vocab_in.items():
        # 'a l' → 'al' 로 교체
        new_word = re.sub(pattern, ''.join(pair_to_merge), word)
        new_vocab[new_word] = freq
    return new_vocab


In [22]:
# 가장 자주 등장한 쌍을 병합
merged_vocab = merge_vocab_pair(most_frequent_pair, word_frequency)

print("병합 전 단어 수:", len(word_frequency))
print("병합 후 단어 수:", len(merged_vocab))

# 병합 결과 예시 보기
for i, (word, freq) in enumerate(merged_vocab.items()):
    if i >= 5:
        break
    print(word, ":", freq)


병합 전 단어 수: 76616
병합 후 단어 수: 76616
= </w> : 29570
V a l k y r i a </w> : 54
C h r o n i c l e s </w> : 47
I I I </w> : 231
S e n j ō </w> : 5


In [23]:
import copy

def train_wordpiece(vocab, num_merges):
    """
    vocab: {'p l a y </w>': 3, 't h e </w>': 1245, ...}
    num_merges: 병합 횟수 (예: 10이면 10번 반복)
    """
    current_vocab = copy.deepcopy(vocab)
    merges = []  # 병합된 쌍들을 기록할 리스트

    for merge_step in range(num_merges):
        print(f"\n[{merge_step+1}번째 병합 단계]")

        # 1. 문자쌍 등장 횟수 계산
        pair_stats = get_pair_statistics(current_vocab)

        # 2. 가장 자주 등장한 쌍 선택
        best_pair = max(pair_stats, key=pair_stats.get)
        print("가장 자주 등장한 쌍:", best_pair, "→", pair_stats[best_pair], "번")

        # 3. 병합 수행
        current_vocab = merge_vocab_pair(best_pair, current_vocab)
        merges.append(best_pair)

        # 4. 예시 단어 몇 개만 출력
        print("병합 후 예시 단어 5개:")
        for i, (word, freq) in enumerate(current_vocab.items()):
            if i >= 5: break
            print(" ", word)

    return current_vocab, merges


In [24]:
final_vocab, merge_history = train_wordpiece(word_frequency, num_merges=10)

print("\n✅ 최종 병합된 쌍 10개:")
for i, pair in enumerate(merge_history):
    print(f"{i+1}. {pair}")



[1번째 병합 단계]
가장 자주 등장한 쌍: ('e', '</w>') → 315788 번
병합 후 예시 단어 5개:
  = </w>
  V a l k y r i a </w>
  C h r o n i c l e s </w>
  I I I </w>
  S e n j ō </w>

[2번째 병합 단계]
가장 자주 등장한 쌍: ('s', '</w>') → 228389 번
병합 후 예시 단어 5개:
  = </w>
  V a l k y r i a </w>
  C h r o n i c l e s</w>
  I I I </w>
  S e n j ō </w>

[3번째 병합 단계]
가장 자주 등장한 쌍: ('t', 'h') → 199390 번
병합 후 예시 단어 5개:
  = </w>
  V a l k y r i a </w>
  C h r o n i c l e s</w>
  I I I </w>
  S e n j ō </w>

[4번째 병합 단계]
가장 자주 등장한 쌍: ('d', '</w>') → 187275 번
병합 후 예시 단어 5개:
  = </w>
  V a l k y r i a </w>
  C h r o n i c l e s</w>
  I I I </w>
  S e n j ō </w>

[5번째 병합 단계]
가장 자주 등장한 쌍: ('n', '</w>') → 171069 번
병합 후 예시 단어 5개:
  = </w>
  V a l k y r i a </w>
  C h r o n i c l e s</w>
  I I I </w>
  S e n j ō </w>

[6번째 병합 단계]
가장 자주 등장한 쌍: ('e', 'r') → 137656 번
병합 후 예시 단어 5개:
  = </w>
  V a l k y r i a </w>
  C h r o n i c l e s</w>
  I I I </w>
  S e n j ō </w>

[7번째 병합 단계]
가장 자주 등장한 쌍: ('t', '</w>') → 128094 번
병합 후 예시 단어 5개:
  = </w>
  V a 

In [25]:
def tokenize_by_wordpiece(text, merge_rules):
    """
    text: 입력 문장 (예: "Valkyria")
    merge_rules: [('a','l'), ('r','i'), ('a','</w>'), ...] 병합 순서
    """
    # 1️⃣ 문자 단위로 쪼개기 + 단어 끝에 </w> 붙이기
    symbols = list(text) + ["</w>"]
    print(f"\n[초기 문자 단위 분해]\n{text} → {' '.join(symbols)}")

    # 2️⃣ 병합 규칙 순서대로 적용
    for step, (left, right) in enumerate(merge_rules):
        i = 0
        new_symbols = []
        while i < len(symbols):
            # 현재 문자와 다음 문자를 병합 대상과 비교
            if i < len(symbols) - 1 and symbols[i] == left and symbols[i + 1] == right:
                merged = left + right
                new_symbols.append(merged)
                i += 2  # 두 글자 건너뜀
            else:
                new_symbols.append(symbols[i])
                i += 1

        symbols = new_symbols  # 갱신
        print(f"[{step+1:02d}단계 병합 후] {' '.join(symbols)}")

    # 3️⃣ 마지막에 </w> 제거하고 결과 반환
    if symbols[-1] == "</w>":
        symbols = symbols[:-1]
    return symbols


In [26]:
test_word = "Valkyria"
result_tokens = tokenize_by_wordpiece(test_word, merge_history[:10])  # 앞의 10개 규칙만 사용

print("\n최종 토큰화 결과:", result_tokens)



[초기 문자 단위 분해]
Valkyria → V a l k y r i a </w>
[01단계 병합 후] V a l k y r i a </w>
[02단계 병합 후] V a l k y r i a </w>
[03단계 병합 후] V a l k y r i a </w>
[04단계 병합 후] V a l k y r i a </w>
[05단계 병합 후] V a l k y r i a </w>
[06단계 병합 후] V a l k y r i a </w>
[07단계 병합 후] V a l k y r i a </w>
[08단계 병합 후] V a l k y r i a </w>
[09단계 병합 후] V a l k y r i a </w>
[10단계 병합 후] V a l k y r i a </w>

최종 토큰화 결과: ['V', 'a', 'l', 'k', 'y', 'r', 'i', 'a']


In [27]:
def tokenize_sentence(sentence, merge_rules):
    words = sentence.split()
    all_tokens = []
    for word in words:
        tokens = tokenize_by_wordpiece(word, merge_rules)
        all_tokens.extend(tokens + ["|"])  # 단어 구분용 | 추가 (보기 편하게)
    return all_tokens


In [28]:
example_sentence = "Valkyria battle series"
final_tokens = tokenize_sentence(example_sentence, merge_history[:10])

print("\n전체 문장 토큰화 결과:\n", final_tokens)



[초기 문자 단위 분해]
Valkyria → V a l k y r i a </w>
[01단계 병합 후] V a l k y r i a </w>
[02단계 병합 후] V a l k y r i a </w>
[03단계 병합 후] V a l k y r i a </w>
[04단계 병합 후] V a l k y r i a </w>
[05단계 병합 후] V a l k y r i a </w>
[06단계 병합 후] V a l k y r i a </w>
[07단계 병합 후] V a l k y r i a </w>
[08단계 병합 후] V a l k y r i a </w>
[09단계 병합 후] V a l k y r i a </w>
[10단계 병합 후] V a l k y r i a </w>

[초기 문자 단위 분해]
battle → b a t t l e </w>
[01단계 병합 후] b a t t l e</w>
[02단계 병합 후] b a t t l e</w>
[03단계 병합 후] b a t t l e</w>
[04단계 병합 후] b a t t l e</w>
[05단계 병합 후] b a t t l e</w>
[06단계 병합 후] b a t t l e</w>
[07단계 병합 후] b a t t l e</w>
[08단계 병합 후] b a t t l e</w>
[09단계 병합 후] b a t t l e</w>
[10단계 병합 후] b a t t l e</w>

[초기 문자 단위 분해]
series → s e r i e s </w>
[01단계 병합 후] s e r i e s </w>
[02단계 병합 후] s e r i e s</w>
[03단계 병합 후] s e r i e s</w>
[04단계 병합 후] s e r i e s</w>
[05단계 병합 후] s e r i e s</w>
[06단계 병합 후] s er i e s</w>
[07단계 병합 후] s er i e s</w>
[08단계 병합 후] s er i e s</w>
[09단계 병합 후] s er i e s</w>
[10단계 병합 후] 

In [29]:
# 특수 토큰 5개를 먼저 정의합니다.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

# 사전 초기화
vocab_dict = {}
for index, token in enumerate(special_tokens):
    vocab_dict[token] = index

print("초기 vocab (특수 토큰 5개):")
print(vocab_dict)


초기 vocab (특수 토큰 5개):
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}


In [30]:
# 병합 규칙 기반으로 새로운 토큰 추가
start_index = len(vocab_dict)

for i, (left, right) in enumerate(merge_history):
    new_token = left + right
    vocab_dict[new_token] = start_index + i

print("\n병합된 토큰 10개 예시:")
for idx, (token, id_) in enumerate(vocab_dict.items()):
    if idx < 15:
        print(token, ":", id_)



병합된 토큰 10개 예시:
[PAD] : 0
[UNK] : 1
[CLS] : 2
[SEP] : 3
[MASK] : 4
e</w> : 5
s</w> : 6
th : 7
d</w> : 8
n</w> : 9
er : 10
t</w> : 11
the</w> : 12
in : 13
an : 14


In [31]:
print("\n최종 Vocab 크기:", len(vocab_dict))



최종 Vocab 크기: 15


In [32]:
def convert_tokens_to_ids(tokens, vocab):
    """
    tokens: ['V', 'al', 'ky', 'ria']
    vocab: 우리가 만든 vocab_dict
    return: [숫자 리스트]
    """
    token_ids = []
    for t in tokens:
        if t in vocab:
            token_ids.append(vocab[t])
        else:
            token_ids.append(vocab["[UNK]"])  # 모르는 토큰은 [UNK]
    return token_ids


In [33]:
test_word = "Valkyria"
tokens = tokenize_by_wordpiece(test_word, merge_history[:50])
print("\n토큰:", tokens)

token_ids = convert_tokens_to_ids(tokens, vocab_dict)
print("ID 변환 결과:", token_ids)



[초기 문자 단위 분해]
Valkyria → V a l k y r i a </w>
[01단계 병합 후] V a l k y r i a </w>
[02단계 병합 후] V a l k y r i a </w>
[03단계 병합 후] V a l k y r i a </w>
[04단계 병합 후] V a l k y r i a </w>
[05단계 병합 후] V a l k y r i a </w>
[06단계 병합 후] V a l k y r i a </w>
[07단계 병합 후] V a l k y r i a </w>
[08단계 병합 후] V a l k y r i a </w>
[09단계 병합 후] V a l k y r i a </w>
[10단계 병합 후] V a l k y r i a </w>

토큰: ['V', 'a', 'l', 'k', 'y', 'r', 'i', 'a']
ID 변환 결과: [1, 1, 1, 1, 1, 1, 1, 1]


In [34]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [35]:
print(train_dataset)
print(train_dataset[0])


Dataset({
    features: ['text'],
    num_rows: 36718
})
{'text': ''}


In [36]:
example_text = train_dataset[3]["text"]
print("원문:", example_text)

encoded = tokenizer(example_text)
print("토큰 ID:", encoded["input_ids"])
print("토큰 리스트:", tokenizer.convert_ids_to_tokens(encoded["input_ids"]))

원문:  Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 

토큰 ID: [101, 12411, 5558, 2053, 11748, 4801, 4360, 1017, 1024, 4895, 2890, 27108, 5732, 11906, 1006, 2887, 1024, 1856, 1806, 1671, 30222, 30218, 30259, 30227, 30255, 30258, 30219, 2509, 1010, 5507, 1012, 11748, 4801, 4360, 1997, 1996, 11686, 1017, 1007, 1010, 4141, 3615, 2000, 2004, 11748

In [37]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",      # 모든 문장을 같은 길이로
        truncation=True,           # 너무 긴 문장은 잘라내기
        max_length=64              # 최대 길이 (원하면 조절)
    )

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [38]:
print(tokenized_dataset[0])


{'text': '', 'input_ids': [101, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [39]:
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "token_type_ids"])


In [40]:
from torch.utils.data import DataLoader

train_loader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True)
batch = next(iter(train_loader))
print(batch["input_ids"].shape)   # (8, 64)


torch.Size([8, 64])


In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [42]:
hidden_size = 256                # 임베딩 차원(D)
number_of_layers = 4             # 인코더 레이어 개수(L)
number_of_heads = 4              # 멀티헤드 개수(h). D % h == 0 이어야 합니다.
intermediate_size = 1024         # FFN 내부 차원(보통 4*D)
max_position_embeddings = 512    # 포지션 임베딩 길이
type_vocab_size = 2              # 세그먼트 A/B
layer_norm_eps = 1e-12           # LayerNorm 안정화용
dropout_prob = 0.1               # 드롭아웃 확률

vocab_size = tokenizer.vocab_size   # 토크나이저의 어휘 크기를 그대로 씁니다.


In [43]:
class BertEmbeddings(nn.Module):
    def __init__(self,
                 vocab_size,
                 hidden_size,
                 max_position_embeddings,
                 type_vocab_size,
                 layer_norm_eps,
                 dropout_prob):
        super().__init__()  # 부모 클래스 초기화

        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)             # 단어 임베딩
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)  # 위치 임베딩
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)  # 세그먼트 임베딩

        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)          # 정규화
        self.dropout = nn.Dropout(dropout_prob)                                  # 드롭아웃

    def forward(self, input_ids, token_type_ids=None):
        batch_size, seq_len = input_ids.size()                                   # 크기 읽기

        if token_type_ids is None:                                               # 세그먼트가 없으면
            token_type_ids = torch.zeros_like(input_ids)                         # 전부 0으로

        position_ids = torch.arange(seq_len, device=input_ids.device)            # [0..S-1]
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)     # (B,S)

        word = self.word_embeddings(input_ids)                                   # (B,S,D)
        pos = self.position_embeddings(position_ids)                             # (B,S,D)
        tok = self.token_type_embeddings(token_type_ids)                         # (B,S,D)

        x = word + pos + tok                                                     # 합치기
        x = self.layer_norm(x)                                                   # 정규화
        x = self.dropout(x)                                                      # 드롭아웃
        return x                                                                 # (B,S,D)

In [44]:
def build_attention_mask(attention_mask):
    """
    attention_mask: (B,S), 1=유효, 0=패딩
    반환: (B,1,1,S) 형태의 가중치. 0 위치는 큰 음수로 만들어 softmax에서 무시하게 함.
    """
    if attention_mask is None:
        return None
    extended = attention_mask[:, None, None, :]             # (B,1,1,S)
    extended = (1.0 - extended) * -10000.0                  # 유효=0, 패딩=-10000
    return extended


In [45]:
class BertSelfAttention(nn.Module):
    def __init__(self, hidden_size, number_of_heads, dropout_prob):
        super().__init__()
        self.number_of_heads = number_of_heads
        self.head_dim = hidden_size // number_of_heads
        assert self.head_dim * number_of_heads == hidden_size  # 나누어떨어지는지 확인

        self.query = nn.Linear(hidden_size, hidden_size)       # Q 투영
        self.key   = nn.Linear(hidden_size, hidden_size)       # K 투영
        self.value = nn.Linear(hidden_size, hidden_size)       # V 투영

        self.dropout = nn.Dropout(dropout_prob)                # 어텐션 확률 드롭아웃

    def _split_heads(self, x):
        # (B,S,D) -> (B,h,S,d)
        B, S, D = x.size()
        x = x.view(B, S, self.number_of_heads, self.head_dim).transpose(1, 2)
        return x

    def _merge_heads(self, x):
        # (B,h,S,d) -> (B,S,D)
        B, h, S, d = x.size()
        x = x.transpose(1, 2).contiguous().view(B, S, h * d)
        return x

    def forward(self, hidden_states, extended_attention_mask=None):
        q = self._split_heads(self.query(hidden_states))                # (B,h,S,d)
        k = self._split_heads(self.key(hidden_states))                  # (B,h,S,d)
        v = self._split_heads(self.value(hidden_states))                # (B,h,S,d)

        scores = torch.matmul(q, k.transpose(-1, -2)) / (self.head_dim ** 0.5)  # (B,h,S,S)

        if extended_attention_mask is not None:                         # 패딩 가리기
            scores = scores + extended_attention_mask                   # (B,h,S,S)

        probs = F.softmax(scores, dim=-1)                               # 어텐션 분포
        probs = self.dropout(probs)                                     # 드롭아웃

        context = torch.matmul(probs, v)                                # (B,h,S,d)
        context = self._merge_heads(context)                            # (B,S,D)
        return context


In [46]:
class BertSelfOutput(nn.Module):
    def __init__(self, hidden_size, dropout_prob, layer_norm_eps):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)               # 출력 투영
        self.dropout = nn.Dropout(dropout_prob)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)

    def forward(self, attention_output, input_tensor):
        x = self.dense(attention_output)                               # 선형
        x = self.dropout(x)                                            # 드롭아웃
        x = self.layer_norm(x + input_tensor)                          # 잔차 연결 + 정규화
        return x


In [47]:
class BertIntermediate(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)         # D -> 4D
        self.act = nn.GELU()                                           # GELU 활성

    def forward(self, x):
        return self.act(self.dense(x))


In [48]:
class BertOutput(nn.Module):
    def __init__(self, intermediate_size, hidden_size, dropout_prob, layer_norm_eps):
        super().__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)         # 4D -> D
        self.dropout = nn.Dropout(dropout_prob)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)

    def forward(self, ff_output, input_tensor):
        x = self.dense(ff_output)                                      # 선형
        x = self.dropout(x)                                            # 드롭아웃
        x = self.layer_norm(x + input_tensor)                          # 잔차 + 정규화
        return x


In [49]:
class BertLayer(nn.Module):
    def __init__(self, hidden_size, number_of_heads, intermediate_size, dropout_prob, layer_norm_eps):
        super().__init__()
        self.attn = BertSelfAttention(hidden_size, number_of_heads, dropout_prob)
        self.attn_output = BertSelfOutput(hidden_size, dropout_prob, layer_norm_eps)
        self.intermediate = BertIntermediate(hidden_size, intermediate_size)
        self.output = BertOutput(intermediate_size, hidden_size, dropout_prob, layer_norm_eps)

    def forward(self, hidden_states, extended_attention_mask):
        attn_context = self.attn(hidden_states, extended_attention_mask)       # (B,S,D)
        attn_out = self.attn_output(attn_context, hidden_states)               # (B,S,D)
        inter = self.intermediate(attn_out)                                    # (B,S,4D)
        layer_out = self.output(inter, attn_out)                               # (B,S,D)
        return layer_out


In [50]:
class BertEncoder(nn.Module):
    def __init__(self, hidden_size, number_of_layers, number_of_heads, intermediate_size, dropout_prob, layer_norm_eps):
        super().__init__()
        self.layers = nn.ModuleList([
            BertLayer(hidden_size, number_of_heads, intermediate_size, dropout_prob, layer_norm_eps)
            for _ in range(number_of_layers)
        ])

    def forward(self, hidden_states, extended_attention_mask):
        for layer in self.layers:
            hidden_states = layer(hidden_states, extended_attention_mask)       # 레이어 반복
        return hidden_states


In [51]:
class BertPooler(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, sequence_output):
        cls = sequence_output[:, 0]                     # 첫 토큰(CLS) 벡터
        pooled = self.activation(self.dense(cls))       # Linear+tanh
        return pooled


In [52]:
class BertModelMini(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = BertEmbeddings(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            layer_norm_eps=layer_norm_eps,
            dropout_prob=dropout_prob
        )
        self.encoder = BertEncoder(
            hidden_size=hidden_size,
            number_of_layers=number_of_layers,
            number_of_heads=number_of_heads,
            intermediate_size=intermediate_size,
            dropout_prob=dropout_prob,
            layer_norm_eps=layer_norm_eps
        )
        self.pooler = BertPooler(hidden_size=hidden_size)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        x = self.emb(input_ids, token_type_ids)                         # (B,S,D)
        ext_mask = build_attention_mask(attention_mask)                 # (B,1,1,S) or None
        seq_out = self.encoder(x, ext_mask)                             # (B,S,D)
        pooled = self.pooler(seq_out)                                   # (B,D)
        return seq_out, pooled


In [53]:
# 간단 문장 두 개를 토크나이저로 인코딩합니다.
encoded = tokenizer(
    ["hello bert model", "this is a tiny check"],
    padding="max_length",
    truncation=True,
    max_length=16,
    return_tensors="pt"
)

# 모델 생성 및 실행
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertModelMini().to(device)
sequence_output, pooled_output = model(
    input_ids=encoded["input_ids"].to(device),
    attention_mask=encoded["attention_mask"].to(device),
    token_type_ids=encoded["token_type_ids"].to(device)
)

print("sequence_output:", sequence_output.shape)  # 기대: (2, 16, 256)
print("pooled_output:", pooled_output.shape)      # 기대: (2, 256)


sequence_output: torch.Size([2, 16, 256])
pooled_output: torch.Size([2, 256])


In [54]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sample_texts = [
    "Hello BERT, how are you?",
    "We are building BERT from scratch!"
]

encoded_inputs = tokenizer(
    sample_texts,
    padding="max_length",   # 길이 맞추기
    truncation=True,        # 너무 긴 문장은 자르기
    max_length=16,
    return_tensors="pt"     # PyTorch 텐서로
)

print(encoded_inputs.keys())
print(encoded_inputs["input_ids"].shape)  # (2, 16)


KeysView({'input_ids': tensor([[  101,  7592, 14324,  1010,  2129,  2024,  2017,  1029,   102,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2057,  2024,  2311, 14324,  2013, 11969,   999,   102,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])})
torch.Size([2, 16])


In [55]:
# 모델 생성 (앞서 만든 BertModelMini)
model = BertModelMini()

sequence_output, pooled_output = model(
    input_ids=encoded_inputs["input_ids"],
    attention_mask=encoded_inputs["attention_mask"],
    token_type_ids=encoded_inputs["token_type_ids"]
)

print("sequence_output:", sequence_output.shape)
print("pooled_output:", pooled_output.shape)


sequence_output: torch.Size([2, 16, 256])
pooled_output: torch.Size([2, 256])


In [56]:
text = "BERT model test"
tokens = tokenizer.tokenize(text)
ids = tokenizer.encode(text, return_tensors="pt")

sequence_output, pooled_output = model(ids)

print("입력 토큰:", tokens)
print("CLS 벡터(문장 표현):", pooled_output[0][:10])  # 앞부분 10차원만 보기
print("첫 단어 벡터 크기:", sequence_output[0, 1].shape)


입력 토큰: ['bert', 'model', 'test']
CLS 벡터(문장 표현): tensor([-0.1715, -0.2545, -0.4214,  0.3750,  0.0172, -0.5693,  0.3434, -0.0537,
        -0.1521, -0.0163], grad_fn=<SliceBackward0>)
첫 단어 벡터 크기: torch.Size([256])


In [57]:
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, hidden_size, layer_norm_eps):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.act = nn.GELU()
        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        return hidden_states


In [58]:
class BertLMPredictionHead(nn.Module):
    def __init__(self, hidden_size, vocab_size, layer_norm_eps):
        super().__init__()
        self.transform = BertPredictionHeadTransform(hidden_size, layer_norm_eps)
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)  # 가중치 타이잉 대상
        self.bias = nn.Parameter(torch.zeros(vocab_size))              # 출력 바이어스

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)     # (B,S,D)
        logits = self.decoder(hidden_states) + self.bias  # (B,S,V)
        return logits


In [59]:
class BertForMaskedLMMini(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.bert = base_model                                     # 우리가 만든 BertModelMini
        self.cls = BertLMPredictionHead(hidden_size, vocab_size, layer_norm_eps)
        # 가중치 타이잉: decoder.weight ↔ word_embeddings.weight
        self.cls.decoder.weight = self.bert.emb.word_embeddings.weight

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        sequence_output, _ = self.bert(input_ids, attention_mask, token_type_ids)  # (B,S,D)
        logits = self.cls(sequence_output)                                         # (B,S,V)

        loss = None
        if labels is not None:
            # CrossEntropyLoss(ignore_index=-100) → -100 위치는 로스 제외
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return logits, loss


In [84]:
def make_mlm_inputs_and_labels(input_ids, tokenizer, mlm_probability=0.15):
    """
    input_ids: (B,S) LongTensor
    반환: masked_input_ids, labels  (둘 다 (B,S))
    규칙: 15% 위치만 정답으로 학습
          그중 80% → [MASK], 10% → 랜덤 토큰, 10% → 원래 토큰 유지
    """
    input_ids = input_ids.clone()
    labels = input_ids.clone()

    special_ids = set([
        tokenizer.cls_token_id,
        tokenizer.sep_token_id,
        tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1
    ])

    # 마스크 후보(스페셜 토큰 제외)
    probability_matrix = torch.full(labels.shape, mlm_probability, device=labels.device)
    special_mask = torch.zeros_like(labels, dtype=torch.bool, device=labels.device) # special_mask도 labels와 동일 장치로 생성
    for sid in special_ids:
        if sid >= 0:
            special_mask |= (labels == sid)

    probability_matrix.masked_fill_(special_mask, 0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # 정답으로 쓰지 않을 위치는 -100

    # 80%: [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8, device=labels.device)).bool() & masked_indices # torch.full 결과도 labels와 동일 장치로
    input_ids[indices_replaced] = tokenizer.mask_token_id

    # 10%: 랜덤 토큰
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5, device=labels.device)).bool() & masked_indices & ~indices_replaced # torch.full 결과도 labels와 동일 장치로
    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long, device=labels.device) # 랜덤 토큰도 동일 장치로
    input_ids[indices_random] = random_words[indices_random]

    # 나머지 10%: 원래 토큰 유지 (입력 그대로)

    return input_ids, labels

In [72]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

sample_sentences = [
    "this is a tiny masked language modeling test",
    "we are building a bert model from scratch"
]

encoded_batch = tokenizer(
    sample_sentences,
    padding="max_length",
    truncation=True,
    max_length=24,
    return_tensors="pt"
)

masked_input_ids, mlm_labels = make_mlm_inputs_and_labels(encoded_batch["input_ids"], tokenizer)
attention_mask = encoded_batch["attention_mask"]
token_type_ids = encoded_batch["token_type_ids"]


In [73]:
# 모델 만들기
base_model = BertModelMini()
mlm_model = BertForMaskedLMMini(base_model)

# 옵티마이저
optimizer = torch.optim.AdamW(mlm_model.parameters(), lr=5e-5)

# 순전파 → 손실
mlm_model.train()
logits, loss = mlm_model(
    input_ids=masked_input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    labels=mlm_labels
)
print("초기 손실:", float(loss))


초기 손실: 62.507568359375


In [74]:
# 역전파 → 가중치 갱신
loss.backward()
optimizer.step()
optimizer.zero_grad()
print("한 스텝 업데이트 완료 ✅")


한 스텝 업데이트 완료 ✅


In [75]:
# 추론 모드로 전환
mlm_model.eval()
with torch.no_grad():
    logits, _ = mlm_model(
        input_ids=masked_input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )
    predictions = logits.argmax(-1)  # (B,S)

# 첫 문장 기준으로, 마스크된 위치만 비교 출력
batch_index = 0
masked_positions = (mlm_labels[batch_index] != -100).nonzero(as_tuple=True)[0].tolist()

print("\n[예측 결과(일부 마스크 위치)]")
for pos in masked_positions[:5]:  # 5개까지만 보기
    pred_id = predictions[batch_index, pos].item()
    gold_id = mlm_labels[batch_index, pos].item()
    print(f"pos={pos:2d}  pred={tokenizer.decode([pred_id])!r}  gold={tokenizer.decode([gold_id])!r}")



[예측 결과(일부 마스크 위치)]
pos= 1  pred='##チ'  gold='this'
pos= 2  pred='mo'  gold='is'


In [76]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [77]:
# 문장을 고정 길이로 인코딩합니다.
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=64   # 필요에 따라 128/256으로 조정 가능
    )

tokenized_training_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


In [78]:
# PyTorch 텐서로 바로 쓰기 위해 포맷 설정
tokenized_training_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids"]
)

print("샘플 확인:", {k: v.shape if hasattr(v, "shape") else type(v) for k, v in tokenized_training_dataset[0].items()})


샘플 확인: {'input_ids': torch.Size([64]), 'token_type_ids': torch.Size([64]), 'attention_mask': torch.Size([64])}


In [79]:
from torch.utils.data import DataLoader

batch_size = 32
training_loader = DataLoader(tokenized_training_dataset, batch_size=batch_size, shuffle=True)

# 배치 모양 확인
first_batch = next(iter(training_loader))
for key, value in first_batch.items():
    print(key, value.shape)


input_ids torch.Size([32, 64])
token_type_ids torch.Size([32, 64])
attention_mask torch.Size([32, 64])


In [80]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = BertModelMini().to(device)
mlm_model = BertForMaskedLMMini(base_model).to(device)

optimizer = torch.optim.AdamW(mlm_model.parameters(), lr=5e-5)


In [85]:
mlm_model.train()

max_train_steps = 300          # 처음엔 가볍게 300스텝만
log_interval = 50              # 50스텝마다 로그 출력

step_counter = 0

# 데이터 로더를 다시 초기화하여 처음부터 학습 시작
training_loader = DataLoader(tokenized_training_dataset, batch_size=batch_size, shuffle=True)


for one_batch in training_loader:
    if step_counter >= max_train_steps:
        break

    input_ids = one_batch["input_ids"].to(device)
    attention_mask = one_batch["attention_mask"].to(device)
    token_type_ids = one_batch["token_type_ids"].to(device)

    # 표준 15% 마스킹을 즉석 적용
    masked_input_ids, mlm_labels = make_mlm_inputs_and_labels(input_ids, tokenizer)

    masked_input_ids = masked_input_ids.to(device)
    mlm_labels = mlm_labels.to(device)

    # 순전파
    logits, loss = mlm_model(
        input_ids=masked_input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        labels=mlm_labels
    )

    # 역전파
    loss.backward()
    torch.nn.utils.clip_grad_norm_(mlm_model.parameters(), max_norm=1.0)  # 안정성용(선택)
    optimizer.step()
    optimizer.zero_grad()

    step_counter += 1
    if step_counter % log_interval == 0:
        print(f"[step {step_counter:4d}] loss = {float(loss):.4f}")

[step   50] loss = 40.1848
[step  100] loss = 36.0620
[step  150] loss = 32.3814
[step  200] loss = 32.8065
[step  250] loss = 29.4431
[step  300] loss = 29.2984


In [87]:
mlm_model.eval()
with torch.no_grad():
    # 새 문장을 준비합니다.
    sample_sentences = ["this is a simple masked language modeling check"]
    batch_for_eval = tokenizer(sample_sentences, padding="max_length", truncation=True, max_length=20, return_tensors="pt")

    # 평가에서도 마스크를 적용해봅니다.
    eval_masked_input_ids, eval_labels = make_mlm_inputs_and_labels(batch_for_eval["input_ids"], tokenizer)

    logits, _ = mlm_model(
        input_ids=eval_masked_input_ids.to(device),
        attention_mask=batch_for_eval["attention_mask"].to(device),
        token_type_ids=batch_for_eval["token_type_ids"].to(device)
    )
    predictions = logits.argmax(-1).cpu()

# 마스크된 위치들만 몇 개 출력
masked_positions = (eval_labels[0] != -100).nonzero(as_tuple=True)[0].tolist()
print("\n[예측 결과 — 마스크된 위치 5개]")
for pos in masked_positions[:5]:
    gold_id = eval_labels[0, pos].item()
    pred_id = predictions[0, pos].item()
    print(f"pos={pos:2d}  pred={tokenizer.decode([pred_id])!r}  gold={tokenizer.decode([gold_id])!r}")



[예측 결과 — 마스크된 위치 5개]
pos= 3  pred='in'  gold='a'
