# 构建BERT(Transformer Encoder)模型

# 1 使用pytorch Dataset格式读取数据

In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math

# JSON-lines file with the training samples; MyDataset parses it one JSON object per line.
input_path="./data/bert_output_data2.json"
class MyDataset(Dataset):
    """In-memory dataset over a JSON-lines file of BERT pre-training samples.

    Every non-blank line of ``file_path`` is parsed as one JSON object and kept
    in ``self.data``. Blank or whitespace-only lines (for example a trailing
    empty line at the end of the file) are skipped instead of crashing the
    JSON parser.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON object per line.
            Each object must provide every key listed in ``FIELDS``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    # Keys copied from each record into the returned sample, in output order.
    FIELDS = (
        "input_ids",
        "input_mask",
        "segment_ids",
        "masked_lm_ids",
        "masked_lm_positions",
        "masked_lm_weights",
        "next_sentence_labels",
    )

    def __init__(self, file_path):
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # json.loads("") raises, so tolerate empty lines explicitly.
                    continue
                self.data.append(json.loads(line))

    def __len__(self):
        """Return the number of parsed records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict mapping each field name to a tensor.

        Raises:
            KeyError: If the record lacks one of the keys in ``FIELDS``.
        """
        item = self.data[idx]
        return {name: torch.tensor(item[name]) for name in self.FIELDS}

In [2]:
train_dataset = MyDataset(input_path)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Grab one shuffled batch for inspection; the demo cells below reuse `data`.
# next(iter(...)) replaces the former for/break loop and raises StopIteration
# right here if the dataset is empty, instead of a NameError further down.
data = next(iter(train_loader))
print(data["input_ids"].shape)
print(data["input_mask"].shape)
data

torch.Size([8, 512])
torch.Size([8, 512])


{'input_ids': tensor([[ 101, 1128, 5425,  ...,  100,  100,  102],
         [ 101,  678, 7481,  ...,    0,    0,    0],
         [ 101, 2226, 5052,  ..., 6405, 7415,  102],
         ...,
         [ 101,  100, 8024,  ...,    0,    0,    0],
         [ 101, 1062, 2398,  ..., 2832, 6598,  102],
         [ 101, 3315,  782,  ...,    0,    0,    0]]),
 'input_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'segment_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'masked_lm_ids': tensor([[5722, 4638, 2466, 3144, 4276, 1372,  128, 2972,  100,  868, 5445, 3749,
          6237,  100,  100,  100, 8024,  100,  100,  100],
         [1963

# 2 BERT 模型

In [3]:
class BertConfig:
    """Plain container for the hyper-parameters used to build the BERT modules.

    Every constructor argument is stored unchanged as an attribute of the same
    name; no validation happens here. ``vocab_size`` is the only required
    argument.
    """

    def __init__(self, vocab_size, hidden_size=768, num_hidden_layers=12, num_attention_heads=12,
                 intermediate_size=3072, hidden_act='gelu', hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1, max_position_embeddings=512,
                 type_vocab_size=2, initializer_range=0.02):
        # Sizes of the three embedding tables.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size

        # Encoder geometry.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act

        # Regularisation and initialisation settings.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
import os

from transformers import BertTokenizer

# Where the bert-base-chinese tokenizer files live. Set BERT_TOKENIZER_PATH to
# run the notebook on another machine; the fallback is the original local path.
tokenizer_path = os.environ.get(
    "BERT_TOKENIZER_PATH",
    '/Users/wangaijun/pythoncode/github/model/bert-base-chinese',
)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
vocab_words = list(tokenizer.vocab.keys())
# Size the model vocabulary from the tokenizer's vocabulary.
config=BertConfig(len(vocab_words))
config.vocab_size

21128

# 2.1 Embedding层

In [4]:
# Take the token ids and the segment ids out of the sampled batch.
input_ids = data["input_ids"]
token_type_ids = data["segment_ids"]
print("input_ids shape:", input_ids.shape, "token_type_ids shape", token_type_ids.shape)

input_ids shape: torch.Size([8, 512]) token_type_ids shape torch.Size([8, 512])


In [5]:
# Stand-alone walk-through of the three embedding lookups.
# The vocabulary size comes from the config rather than a hard-coded 21128,
# so this cell stays in step with the tokenizer and with BertEmbeddings.
word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

token_emb=word_embeddings(input_ids)
# Positions 0..T-1 are the same for every sequence, so this lookup has no batch axis.
position_emb=position_embeddings(torch.arange(input_ids.shape[1], dtype=torch.long, device=input_ids.device))
sentence_emb=token_type_embeddings(token_type_ids)

print(token_emb.shape,position_emb.shape,sentence_emb.shape)

torch.Size([8, 512, 768]) torch.Size([512, 768]) torch.Size([8, 512, 768])


In [6]:
class BertEmbeddings(nn.Module):
    """Input embedding layer: word + position + token-type, then LayerNorm and dropout.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings``, ``type_vocab_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.word_embeddings = nn.Embedding(config.vocab_size, width)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, width)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed ``input_ids`` and return a tensor with one extra trailing hidden axis.

        Args:
            input_ids: Integer tensor of token ids; dimension 1 is the sequence axis.
            token_type_ids: Optional integer tensor shaped like ``input_ids``;
                all zeros are used when it is omitted.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Positions 0..T-1, broadcast to the shape of input_ids.
        positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand_as(input_ids)

        summed = (
            self.word_embeddings(input_ids)
            + self.position_embeddings(positions)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.LayerNorm(summed))



In [7]:
# Embed the sampled batch with the module defined above and check the output shape.
embModel = BertEmbeddings(config)
x_emb = embModel(input_ids, token_type_ids)
x_emb.shape

torch.Size([8, 512, 768])

# 2.2 attention 层

### 2.2.1 q,k,v加工及多头变换

In [8]:
# One fused projection emits query, key and value side by side on the last axis.
c_attn = nn.Linear(config.hidden_size, 3 * config.hidden_size)

# q, k and v all come from x_emb: cut the projected tensor into three equal chunks.
q, k, v = torch.split(c_attn(x_emb), config.hidden_size, dim=2)
print(q.shape,k.shape,v.shape)

B,T,C=x_emb.shape
# Give q, k and v a head axis: (B, T, C) -> (B, T, nh, hs) -> (B, nh, T, hs)
q, k, v = (
    t.view(B, T, config.num_attention_heads, C // config.num_attention_heads).transpose(1, 2)
    for t in (q, k, v)
)
print(q.shape,k.shape,v.shape)

torch.Size([8, 512, 768]) torch.Size([8, 512, 768]) torch.Size([8, 512, 768])
torch.Size([8, 12, 512, 64]) torch.Size([8, 12, 512, 64]) torch.Size([8, 12, 512, 64])


### 2.2.2 attention score 计算及 mask

In [9]:

# Scaled dot product of every query with every key: (B, nh, T, T)
attention_scores = torch.matmul(q, k.transpose(-1, -2))/ math.sqrt(q.shape[-1])
print("attention_scores shape :",attention_scores.shape)

# (B, T) mask -> (B, 1, 1, T) so it broadcasts over heads and query positions.
attention_mask=data["input_mask"]
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
print("attention_mask shape :",attention_mask.shape)

# Fill masked keys with the most negative finite value instead of -inf:
# they still get probability 0 next to any unmasked key, but a sequence whose
# mask is all zeros no longer turns into NaN after the softmax.
attention_scores = attention_scores.masked_fill(attention_mask == 0, torch.finfo(attention_scores.dtype).min)

# Normalize the attention scores to probabilities.
attention_probs = torch.softmax(attention_scores, dim=-1)

# Weighted sum of values, then merge the heads back: (B, nh, T, hs) -> (B, T, C)
context_layer = torch.matmul(attention_probs, v)
context_layer = context_layer.transpose(1, 2).contiguous().view(B, T, C)
context_layer.shape

attention_scores shape : torch.Size([8, 12, 512, 512])
attention_mask shape : torch.Size([8, 1, 1, 512])


torch.Size([8, 512, 768])

In [10]:
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        config: Object exposing ``hidden_size``, ``num_attention_heads`` and
            ``attention_probs_dropout_prob``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_attention_heads``.
    """

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)" % (
                    config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        # Exact integer division; divisibility was checked just above.
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.hidden_size = config.hidden_size

        self.query = nn.Linear(config.hidden_size, self.hidden_size)
        self.key = nn.Linear(config.hidden_size, self.hidden_size)
        self.value = nn.Linear(config.hidden_size, self.hidden_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def _split_heads(self, projected):
        """Reshape (B, T, C) into per-head form (B, nh, T, hs)."""
        B, T, _ = projected.size()
        return projected.view(
            B, T, self.num_attention_heads, self.attention_head_size
        ).transpose(1, 2)

    def forward(self, hidden_states, attention_mask):
        """Apply self-attention to ``hidden_states``.

        Args:
            hidden_states: Tensor of shape (B, T, C) with C equal to ``hidden_size``.
            attention_mask: ``None`` or a (B, T) tensor; key positions whose
                mask value is 0 are excluded from attention.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = hidden_states.size()
        query_layer = self._split_heads(self.query(hidden_states))
        key_layer = self._split_heads(self.key(hidden_states))
        value_layer = self._split_heads(self.value(hidden_states))

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # (B, T) -> (B, 1, 1, T): broadcast over heads and query positions.
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            # Most negative finite value rather than -inf: masked keys still
            # get probability 0 beside any unmasked key, but a fully masked
            # sequence yields finite output instead of NaN.
            attention_scores = attention_scores.masked_fill(
                attention_mask == 0, torch.finfo(attention_scores.dtype).min
            )

        # Normalize the attention scores to probabilities.
        attention_probs = torch.softmax(attention_scores, dim=-1)

        attention_probs = self.dropout(attention_probs)

        # Weighted sum of values, then merge heads: (B, nh, T, hs) -> (B, T, C)
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(1, 2).contiguous().view(B, T, C)
        return context_layer

In [11]:
# Run the attention module on the embedded batch, hiding positions whose mask is 0.
input_mask = data["input_mask"]
print("input_mask shape", input_mask.shape)
attenModel = BertSelfAttention(config)
x_att = attenModel(x_emb, input_mask)
x_att.shape

input_mask shape torch.Size([8, 512])


torch.Size([8, 512, 768])

## 3 Attention + Add & Norm
<div style="text-align: center;">
  <img src="images/encoder_atten.png" alt="Image" style="width:300px;">
</div>

In [12]:
class BertSelfOutput(nn.Module):
    """Post-attention projection with residual connection and LayerNorm.

    Args:
        config: Object exposing ``hidden_size`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalise."""
        projected = self.dropout(self.dense(hidden_states))
        # Residual add happens before normalisation.
        return self.LayerNorm(projected + input_tensor)

class BertAttention(nn.Module):
    """Self-attention followed by its output projection, residual add and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        """Attend over ``input_tensor`` and combine the result with it via ``self.output``."""
        attended = self.self(input_tensor, attention_mask)
        # input_tensor is passed again so the output module can add the residual.
        return self.output(attended, input_tensor)

In [13]:
# Attention sub-layer including the residual/LayerNorm step; overwrites the earlier x_att.
bertAttModel = BertAttention(config)
x_att = bertAttModel(x_emb, input_mask)
x_att.shape

torch.Size([8, 512, 768])

## 4 FFN层
<div style="text-align: center;">
  <img src="images/encoder_mlp.png" alt="Image" style="width:300px;">
</div>

In [14]:
class FFN(nn.Module):
    """Position-wise feed-forward sub-layer with residual connection and LayerNorm.

    Args:
        config: Object exposing ``hidden_size``, ``intermediate_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        self.dense1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dense2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def gelu(self, x):
        """Tanh approximation of GELU, applied element-wise."""
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(math.sqrt(2 / math.pi) * cubic)
        return 0.5 * x * gate

    def forward(self, x):
        """Expand, activate, project back, then add ``x`` and normalise."""
        expanded = self.gelu(self.dense1(x))
        projected = self.dropout(self.dense2(expanded))
        return self.LayerNorm(projected + x)

In [15]:
# Feed the attention output through the feed-forward sub-layer.
mlp = FFN(config)
x_mlp = mlp(x_att)
x_mlp.shape

torch.Size([8, 512, 768])

## 4 Block层
<div style="text-align: center;">
  <img src="images/encoder_block.png" alt="Image" style="width:300px;">
</div>

In [16]:
# A single encoder block
class BertLayer(nn.Module):
    """One encoder block: the attention sub-layer followed by the feed-forward sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.mlp = FFN(config)

    def forward(self, hidden_states, attention_mask):
        """Return the feed-forward output computed from the attention output."""
        return self.mlp(self.attention(hidden_states, attention_mask))

# Stack of encoder blocks
class BertEncoder(nn.Module):
    """Applies ``config.num_hidden_layers`` BertLayer blocks one after another."""

    def __init__(self, config):
        super().__init__()
        blocks = [BertLayer(config) for _ in range(config.num_hidden_layers)]
        self.layer = nn.ModuleList(blocks)

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        """Run every block in order.

        Returns:
            A list of tensors: the output of each block when
            ``output_all_encoded_layers`` is true, otherwise a one-element
            list holding only the final hidden states.
        """
        per_layer = []
        for block in self.layer:
            hidden_states = block(hidden_states, attention_mask)
            per_layer.append(hidden_states)

        if output_all_encoded_layers:
            return per_layer
        return [hidden_states]

In [17]:
# Push the embeddings through the whole encoder stack and look at the last block's output.
bert_model = BertEncoder(config)
x_bloks = bert_model(x_emb, input_mask)
x_bloks[-1].shape

torch.Size([8, 512, 768])

## 5 CLS 输出层

In [18]:
class BertPooler(nn.Module):
    """Summarises a sequence by transforming the hidden state of its first token.

    Args:
        config: Object exposing ``hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``."""
        # "Pooling" here just means picking the hidden state at position 0.
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))

In [19]:
# Pool the last encoder block's output into one vector per sequence.
clsModel = BertPooler(config)
x_cls = clsModel(x_bloks[-1])
x_cls.shape

torch.Size([8, 768])

In [20]:
class BertModel(nn.Module):
    """Full encoder model: embeddings, stacked encoder blocks and a first-token pooler."""

    def __init__(self, config):
        super().__init__()
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Encode ``input_ids``.

        Args:
            input_ids: Token id tensor.
            token_type_ids: Optional segment ids, handed to the embedding layer as-is.
            attention_mask: Optional mask; a tensor of ones shaped like
                ``input_ids`` is used when it is omitted.

        Returns:
            Tuple ``(pooled_output, sequence_output, encoded_layers)`` where
            ``sequence_output`` is the last element of ``encoded_layers``.
        """
        mask = torch.ones_like(input_ids) if attention_mask is None else attention_mask

        embedded = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(embedded, mask)

        sequence_output = encoded_layers[-1]
        return self.pooler(sequence_output), sequence_output, encoded_layers

bert_model = BertModel(config)
# Count every scalar parameter registered on the model.
total_params = sum(param.numel() for param in bert_model.parameters())
print(f"Total parameters: {total_params}")

# Feed the sampled batch through the complete model.
input_mask = data["input_mask"]
input_ids = data["input_ids"]
token_type_ids = data["segment_ids"]
pooled_output, sequence_output, encoded_layers = bert_model(input_ids, token_type_ids, input_mask)
print(pooled_output.shape, sequence_output.shape, len(encoded_layers))
print(bert_model)

torch.Size([8, 768]) torch.Size([8, 512, 768]) 12
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout

Total parameters: 102267648
