In [1]:
import transformers
print(transformers.__version__)

3.4.0


In [None]:
# pip install transformers==3.4.0

In [2]:
from transformers import BertConfig,BertModel,BertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [30]:
class Conv1D(nn.Module):
    def __init__(self,in_channels,out_channels,filter_sizes):
        super(Conv1D,self).__init__()
        self.convs=nn.ModuleList([
            nn.Conv1d(in_channels=in_channels,out_channels=out_channels,kernel_size=fs) for fs in filter_sizes
        ])
        self.init_params()
    def init_params(self):
        for m in self.convs:
            nn.init.xavier_normal_(m.weight.data)
            nn.init.constant_(m.bias.data,0.1)
    def forward(self,x):
        return [F.relu(conv(x)) for conv in self.convs]
class BertCNN(nn.Module):
    def __init__(self,config):
        super(BertCNN,self).__init__()
        self.num_labels=config.num_classes
        model_config = BertConfig.from_pretrained(config.bert_path, num_labels=config.num_classes)
        self.bert = BertModel.from_pretrained(config.bert_path,config=model_config)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.dropout=nn.Dropout(config.dropout)
        self.convs=Conv1D(config.hidden_size,config.num_filters,filter_sizes=config.filter_sizes)
        self.classifier=nn.Linear(len(config.filter_sizes)*config.num_filters,self.num_labels)
    def forward(self,x):
        """
        Args:
            input_ids: token_id
            token_type_ids: 0 means first sentence,1 means second sentence
            attention_mask: 1 means token, 0 means padding
        """
        context = x[0]  # 输入的句子
        mask = x[1]  # 对padding部分进行mask，和句子一个size，padding部分用0表示，如：[1, 1, 1, 1, 0, 0]
        token_type_ids = x[2]
        
        encoded_layer,_=self.bert(context,token_type_ids=token_type_ids,attention_mask=mask)
        # why the tensor should be permuted?
        encoded_layer=encoded_layer.permute(0,2,1)
        # conved[0] shape (batch_size,n_filters,-1)
        conved=self.convs(encoded_layer)
        # conved[0] shape (batch_size,n_filters)
        max_pooled=[F.max_pool1d(conv,conv.shape[2]).squeeze(2) for conv in conved]
        # cat shape (batch_size,n_filters*len(filter_sizes))
        cat = self.dropout(torch.cat(max_pooled,dim=1))
        # logits shape (batch_size,num_labels)
        logits = self.classifier(cat)
        return logits
class Config():
    def __init__(self):
        self.bert_path='./home/aistudio/data/data56340'
        self.num_classes=13
        self.filter_sizes=[2,3,4]
        self.num_filters=256
        self.hidden_size=768
        self.dropout=0.1
config=Config()

In [31]:
model=BertCNN(config)

In [32]:
bert_params=[]
other_params=[]
for n,p in model.named_parameters():
    if 'bert' in n:
        bert_params.append((n,p))
    else:
        other_params.append((n,p))
bert_names,bert_parameters=zip(*bert_params)
other_names,other_parameters=zip(*other_params)
print(bert_names)
print(other_names)

('bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.

In [33]:
for n,p in model.bert.named_parameters():
    print(id(p))

140431244056528
140431244056608
140431244058288
140430805711600
140431242384080
140431244086720
140431251917552
140431239103744
140431239896192
140431250767296
140431251885344
140431251882624
140431251883664
140431260968912
140431250728592
140431250730912
140431250727632
140431250308384
140430805064192
140430805063232
140431242429776
140431251106944
140431244546288
140431244548368
140431240889920
140431240888480
140431240889680
140431250319472
140431250320672
140431250319072
140431251191360
140431251192640
140431251190560
140431242525264
140431242525424
140431242523424
140431242523824
140431242526144
140431242524704
140431242524544
140431264294784
140431264294704
140431264295584
140431251208224
140430805423920
140431239525552
140431239526112
140431244056848
140430805063552
140430806093488
140431242349296
140431242348096
140431238966320
140430805770560
140430805151248
140431241280320
140430806170272
140430806171552
140430806169952
140430806169472
140430806170352
140430806168352
14043080

In [34]:
for n,p in model.named_parameters():
    print(id(p))

140431244056528
140431244056608
140431244058288
140430805711600
140431242384080
140431244086720
140431251917552
140431239103744
140431239896192
140431250767296
140431251885344
140431251882624
140431251883664
140431260968912
140431250728592
140431250730912
140431250727632
140431250308384
140430805064192
140430805063232
140431242429776
140431251106944
140431244546288
140431244548368
140431240889920
140431240888480
140431240889680
140431250319472
140431250320672
140431250319072
140431251191360
140431251192640
140431251190560
140431242525264
140431242525424
140431242523424
140431242523824
140431242526144
140431242524704
140431242524544
140431264294784
140431264294704
140431264295584
140431251208224
140430805423920
140431239525552
140431239526112
140431244056848
140430805063552
140430806093488
140431242349296
140431242348096
140431238966320
140430805770560
140430805151248
140431241280320
140430806170272
140430806171552
140430806169952
140430806169472
140430806170352
140430806168352
14043080

In [18]:
bert_config=BertConfig.from_pretrained('./home/aistudio/data/data56340')

In [19]:
bert_tokenizer=BertTokenizer.from_pretrained('./home/aistudio/data/data56340')

In [20]:
bert_model=BertModel.from_pretrained('./home/aistudio/data/data56340')

In [38]:
result=bert_tokenizer.encode_plus(text='今天天气真好，我应该考虑出去走走',add_special_tokens=True,max_length=128,truncation=True)
print(result)

{'input_ids': [101, 791, 1921, 1921, 3698, 4696, 1962, 8024, 2769, 2418, 6421, 5440, 5991, 1139, 1343, 6624, 6624, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [39]:
model((torch.tensor([result['input_ids']]),torch.tensor([result['token_type_ids']]),torch.tensor([result['attention_mask']])))

tensor([[-1.1157,  0.4755, -0.4979, -0.7560, -0.0239,  0.9446, -0.2323, -1.6038,
          0.5537, -0.5903,  1.0308, -0.6387, -0.7466]],
       grad_fn=<AddmmBackward>)

In [40]:
from transformers import get_linear_schedule_with_warmup

In [41]:
get_linear_schedule_with_warmup?

In [42]:
# the following code is mainly used to unzip the required data to specific folder.
import os,zipfile
src_file='chinese_wobert_L-12_H-768_A-12.zip'
zf=zipfile.ZipFile(src_file)
zf.extractall('./home/aistudio/data/wobert')
zf.close

<bound method ZipFile.close of <zipfile.ZipFile filename='chinese_wobert_L-12_H-768_A-12.zip' mode='r'>>

In [43]:
!ls ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12

bert_config.json		     bert_model.ckpt.index  checkpoint
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta   vocab.txt


In [66]:
# The following code is used to change the pretrained model from tf format to pytorch. 
%run convert_bert_tf_checkpoint_to_pytorch.py --tf_checkpoint_path ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/bert_model.ckpt \
  --bert_config_file ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/bert_config.json \
  --pytorch_dump_path ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/pytorch_model.bin

Building PyTorch model from configuration: BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 33586
}

Save PyTorch model to ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/pytorch_model.bin


In [68]:
wobert_dir="./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12"
wobert_config=BertConfig.from_pretrained(wobert_dir)
wobert_tokenizer=BertTokenizer.from_pretrained(wobert_dir)
wobert_model=BertModel.from_pretrained(wobert_dir,config=wobert_config)

## Transformer basic study
We build a bert and cnn combined fine-tuned model on the base of the transformed pretrained bert model from Google, we know how to transform the pretrained model from the format of tf to pt, we learn how to load pretrained model in new defined network for downstream tasks, we understand the name and weights of layers so as to setup different optimizer values for different parameters of layers, the model we build here can used to do text classification.

### Main Content
- build a text classification model by incorporating pretrained model 
- study name and weights of parameters
- study the way of transforming pretrained weights from the format of tf to pt

### Packages
- torch
- transformers
- zipfile

### Important functions
- nn.Module
- nn.Linear()
- nn.parameter()
- nn.ModuleList()
- nn.Conv1d()
- nn.init.xavier_normal_
- nn.init.constant_
- torch.permute()
- torch.nn.functional.max_pool1d()
- torch.cat()
- nn.Dropout()

### Special code
```python
# layer parameters initialization 
def init_params(self):
        for m in self.convs:
            nn.init.xavier_normal_(m.weight.data)
            nn.init.constant_(m.bias.data,0.1)

# forward calculation
def forward(self,x):
        """
        Args:
            input_ids: token_id
            token_type_ids: 0 means first sentence,1 means second sentence
            attention_mask: 1 means token, 0 means padding
        """
        context = x[0]  # 输入的句子
        mask = x[1]  # 对padding部分进行mask，和句子一个size，padding部分用0表示，如：[1, 1, 1, 1, 0, 0]
        token_type_ids = x[2]
        
        encoded_layer,_=self.bert(context,token_type_ids=token_type_ids,attention_mask=mask)
        # why the tensor should be permuted?
        encoded_layer=encoded_layer.permute(0,2,1)
        # conved[0] shape (batch_size,n_filters,-1)
        conved=self.convs(encoded_layer)
        # conved[0] shape (batch_size,n_filters)
        max_pooled=[F.max_pool1d(conv,conv.shape[2]).squeeze(2) for conv in conved]
        # cat shape (batch_size,n_filters*len(filter_sizes))
        cat = self.dropout(torch.cat(max_pooled,dim=1))
        # logits shape (batch_size,num_labels)
        logits = self.classifier(cat)
        return logits

# split none bert parameters from the model
bert_params=[]
other_params=[]
for n,p in model.named_parameters():
    if 'bert' in n:
        bert_params.append((n,p))
    else:
        other_params.append((n,p))
bert_names,bert_parameters=zip(*bert_params)
other_names,other_parameters=zip(*other_params)
print(bert_names)
print(other_names)

# unzip the required data to specific folder.
import os,zipfile
src_file='chinese_wobert_L-12_H-768_A-12.zip'
zf=zipfile.ZipFile(src_file)
zf.extractall('./home/aistudio/data/wobert')
zf.close

# transform pretrained weights from the format of tf to pt
%run convert_bert_tf_checkpoint_to_pytorch.py --tf_checkpoint_path ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/bert_model.ckpt \
  --bert_config_file ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/bert_config.json \
  --pytorch_dump_path ./home/aistudio/data/wobert/chinese_wobert_L-12_H-768_A-12/pytorch_model.bin
```