## 模型参数 torch 转换 paddle

In [3]:
import paddle
import torch
import numpy as np

# Convert a PyTorch FNet checkpoint into a Paddle state dict and save it.
#
# Steps: load the torch state dict, rename keys to the Paddle naming scheme,
# transpose the weight matrices selected by `_needs_transpose`, and write the
# result with `paddle.save`.

# Source checkpoints (PyTorch).
torch_base_model_path = 'pretrained_model/torch/base/pytorch_model.bin'
torch_large_model_path = 'pretrained_model/torch/large/pytorch_model.bin'

# Destination checkpoints (Paddle).
paddle_base_model_path = "pretrained_model/temp/paddle/base/model_state.pdparams"
paddle_large_model_path = "pretrained_model/temp/paddle/large/model_state.pdparams"

# Selects which checkpoint this run converts. The comparison cells further
# down load BOTH converted files, so this cell has to be run once with
# is_base = True and once with is_base = False.
is_base = False

if is_base:
    torch_model_path = torch_base_model_path
    paddle_model_path = paddle_base_model_path
else:
    torch_model_path = torch_large_model_path
    paddle_model_path = paddle_large_model_path

# map_location="cpu" lets the checkpoint load on machines without a GPU; every
# tensor is moved to CPU before the numpy conversion below anyway.
torch_state_dict = torch.load(torch_model_path, map_location="cpu")

paddle_state_dict = {}

# State_dict's keys mapping: from torch to paddle
keys_dict = {
    # about encoder layer
    'LayerNorm': 'layer_norm',
    'encoder.layer': 'encoder.layers'
}

# Substrings that mark a parameter whose matrix is transposed during conversion
# (presumably because Paddle's Linear stores its weight as [in, out] -- TODO confirm).
# NOTE(review): 'map_fc' and 'glyph_map' look carried over from another model's
# conversion script; they are kept so the selection rule is unchanged.
# NOTE(review): 'cls.predictions.decoder.weight' matches none of these markers
# and is therefore NOT transposed -- confirm this is what the Paddle head expects.
transpose_markers = (
    'map_fc',
    'glyph_map',
    'linear',
    'proj',
    'dense.weight',
    'transform.weight',
    'seq_relationship.weight',
)


def _needs_transpose(key):
    """Return True when the parameter named `key` is selected for transposition."""
    if any(marker in key for marker in transpose_markers):
        return True
    return 'vocab' in key and 'weight' in key


for torch_key, torch_tensor in torch_state_dict.items():
    # Apply every rename rule; str.replace is a no-op when the pattern is absent.
    paddle_key = torch_key
    for old_name, new_name in keys_dict.items():
        paddle_key = paddle_key.replace(old_name, new_name)

    weight = torch_tensor.cpu().numpy()
    # Transposing a 0-D/1-D array does nothing, so such tensors (e.g. the
    # '...projection.bias' entry matched by 'proj') are skipped; this keeps the
    # log line below from claiming a transpose that never happened.
    if weight.ndim >= 2 and _needs_transpose(paddle_key):
        print("transpose(permute) ---------->")
        weight = weight.transpose()
    paddle_state_dict[paddle_key] = paddle.to_tensor(weight)

    print("t: ", torch_key,"\t", torch_tensor.shape)
    print("p: ", paddle_key, "\t", paddle_state_dict[paddle_key].shape, "\n")

paddle.save(paddle_state_dict, paddle_model_path)

t:  embeddings.position_ids 	 torch.Size([1, 512])
p:  embeddings.position_ids 	 [1, 512] 

t:  embeddings.word_embeddings.weight 	 torch.Size([32000, 1024])
p:  embeddings.word_embeddings.weight 	 [32000, 1024] 

t:  embeddings.position_embeddings.weight 	 torch.Size([512, 1024])
p:  embeddings.position_embeddings.weight 	 [512, 1024] 

t:  embeddings.token_type_embeddings.weight 	 torch.Size([4, 1024])
p:  embeddings.token_type_embeddings.weight 	 [4, 1024] 

t:  embeddings.LayerNorm.weight 	 torch.Size([1024])
p:  embeddings.layer_norm.weight 	 [1024] 

t:  embeddings.LayerNorm.bias 	 torch.Size([1024])
p:  embeddings.layer_norm.bias 	 [1024] 

transpose(permute) ---------->
t:  embeddings.projection.weight 	 torch.Size([1024, 1024])
p:  embeddings.projection.weight 	 [1024, 1024] 

transpose(permute) ---------->
t:  embeddings.projection.bias 	 torch.Size([1024])
p:  embeddings.projection.bias 	 [1024] 

t:  encoder.layer.0.fourier.output.LayerNorm.weight 	 torch.Size([1024])
p:  e

## 对比前向精度

### Base

In [4]:
# torch
from transformers import FNetModel, FNetTokenizer

# Reference PyTorch tokenizer and model, both resolved from the same checkpoint name.
torch_checkpoint = "google/fnet-base"
torch_tokenizer = FNetTokenizer.from_pretrained(torch_checkpoint)
torch_model = FNetModel.from_pretrained(torch_checkpoint)

Some weights of the model checkpoint at google/fnet-base were not used when initializing FNetModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing FNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Sample sentence tokenized and fed to both the PyTorch and the Paddle model below.
text = "Replace me by any text you'd like."

In [6]:
# Forward pass of the PyTorch reference model on `text`.
torch_model.eval()  # inference mode for the module (e.g. disables dropout)
torch_inputs = torch_tokenizer(text, return_tensors="pt")

# Only the output values are needed, so skip building the autograd graph.
with torch.no_grad():
    torch_outputs = torch_model(**torch_inputs)

# First element of the model output (presumably the last hidden state -- the
# printed labels call it "prediction_logits"); converted to numpy for the diff.
torch_logits = torch_outputs[0]
torch_array = torch_logits.cpu().detach().numpy()
print("torch_prediction_logits shape:{}".format(torch_array.shape))
print("torch_prediction_logits:{}".format(torch_array))

torch_prediction_logits shape:(1, 12, 768)
torch_prediction_logits:[[[ 4.4527473  -0.10137583 -0.21348645 ...  0.36847726 -0.23560826
   -0.25296995]
  [ 0.18779421 -0.39948907  0.23660113 ...  0.19996837  0.2783861
    0.27940997]
  [ 0.17300639  0.0606944  -0.47870204 ...  0.05200686 -1.295673
    0.578657  ]
  ...
  [ 0.17106001  0.00485597 -0.06762558 ... -0.3548019  -0.82001925
    0.01953557]
  [ 0.06859297  0.23336133 -0.57087415 ... -0.3022466  -0.58877695
   -0.11472143]
  [ 0.24348916  0.1635376   0.36463982 ...  0.5811312  -0.60914195
   -0.32033262]]]


In [7]:
# paddle
import os
import sys

import paddle

# Make the local PaddleNLP checkout importable. The membership test keeps
# sys.path free of duplicate entries when this cell is re-run.
paddlenlp_repo = '/workspace/fnet_paddle/PaddleNLP'
if paddlenlp_repo not in sys.path:
    sys.path.append(paddlenlp_repo)
import paddlenlp as ppnlp

# Directory the tokenizer is loaded from. It has its own name so that
# `torch_large_model_path` (the .bin checkpoint path set by the conversion
# cell) is not silently overwritten with a directory path.
# NOTE(review): the base model is paired with tokenizer files from the *large*
# directory -- presumably both sizes share one vocabulary; confirm.
paddle_tokenizer_dir = 'pretrained_model/torch/large'
paddle_tokenizer = ppnlp.transformers.FNetTokenizer.from_pretrained(paddle_tokenizer_dir)

# Default-configured FNetModel, filled with the converted base weights.
paddle_model = ppnlp.transformers.FNetModel()
param_dict = paddle.load(paddle_base_model_path)
paddle_model.load_dict(param_dict)

W1127 20:28:01.890157   919 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 6.0, Driver API Version: 11.0, Runtime API Version: 11.0
W1127 20:28:01.894001   919 device_context.cc:465] device: 0, cuDNN Version: 8.0.


In [8]:
# Forward pass of the converted Paddle model on the same `text`.
paddle_model.eval()
paddle_encoding = paddle_tokenizer(text)
# Wrap each field in a list to add a leading batch dimension of 1.
paddle_inputs = {k: paddle.to_tensor([v]) for (k, v) in paddle_encoding.items()}

# Only the output values are needed, so skip gradient tracking.
with paddle.no_grad():
    paddle_outputs = paddle_model(**paddle_inputs)

# First element of the model output, converted to numpy for the diff.
paddle_logits = paddle_outputs[0]
paddle_array = paddle_logits.numpy()
print("paddle_prediction_logits shape:{}".format(paddle_array.shape))
print("paddle_prediction_logits:{}".format(paddle_array))

paddle_prediction_logits shape:(1, 12, 768)
paddle_prediction_logits:[[[ 4.4526024e+00 -1.0144000e-01 -2.1346147e-01 ...  3.6845985e-01
   -2.3564060e-01 -2.5293329e-01]
  [ 1.8808359e-01 -3.9938882e-01  2.3694393e-01 ...  2.0020518e-01
    2.7856782e-01  2.7908224e-01]
  [ 1.7336315e-01  6.0770631e-02 -4.7835657e-01 ...  5.1935472e-02
   -1.2956736e+00  5.7832038e-01]
  ...
  [ 1.7099635e-01  4.0750708e-03 -6.7396842e-02 ... -3.5490695e-01
   -8.1969118e-01  1.9487010e-02]
  [ 6.7856006e-02  2.3314802e-01 -5.7079792e-01 ... -3.0198133e-01
   -5.8851326e-01 -1.1473138e-01]
  [ 2.4360198e-01  1.6350693e-01  3.6484715e-01 ...  5.8138633e-01
   -6.0873306e-01 -3.2016295e-01]]]


In [9]:
import numpy as np

# The element-wise comparison only makes sense when both outputs have the same shape.
shapes_match = torch_array.shape == paddle_array.shape
assert shapes_match, (
    "the output logits should have the same shape, but got : {} and {} instead"
    .format(torch_array.shape, paddle_array.shape)
)

# Largest absolute element-wise deviation between the two frameworks.
diff = torch_array - paddle_array
max_abs_diff = np.abs(diff).max()
print(max_abs_diff)

0.0012811422


In [19]:
# save pretrained model
# Write the Paddle model and its tokenizer into the same output directory.
base_model = 'pretrained_model/paddle/base/'
paddle_model.save_pretrained(base_model)
paddle_tokenizer.save_pretrained(base_model)

### Large

In [12]:
# torch
from transformers import FNetModel, FNetTokenizer

# Reference PyTorch tokenizer and model, both resolved from the same checkpoint name.
torch_checkpoint = "google/fnet-large"
torch_tokenizer = FNetTokenizer.from_pretrained(torch_checkpoint)
torch_model = FNetModel.from_pretrained(torch_checkpoint)

Some weights of the model checkpoint at google/fnet-large were not used when initializing FNetModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing FNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Sample sentence tokenized and fed to both the PyTorch and the Paddle model below.
text = "Replace me by any text you'd like."

In [14]:
# Forward pass of the PyTorch reference model on `text`.
torch_model.eval()  # inference mode for the module (e.g. disables dropout)
torch_inputs = torch_tokenizer(text, return_tensors="pt")

# Only the output values are needed, so skip building the autograd graph.
with torch.no_grad():
    torch_outputs = torch_model(**torch_inputs)

# First element of the model output (presumably the last hidden state -- the
# printed labels call it "prediction_logits"); converted to numpy for the diff.
torch_logits = torch_outputs[0]
torch_array = torch_logits.cpu().detach().numpy()
print("torch_prediction_logits shape:{}".format(torch_array.shape))
print("torch_prediction_logits:{}".format(torch_array))

torch_prediction_logits shape:(1, 12, 1024)
torch_prediction_logits:[[[-0.1167793  -0.44756714 -0.51739615 ... -0.19486761 -0.42286307
   -0.68805295]
  [-0.05607523  0.16956906  0.731847   ... -0.23074943 -0.02952154
    0.42456526]
  [-0.09154158  0.23390733  0.12532961 ... -0.39507478  0.2645798
    0.27237514]
  ...
  [ 0.29037276 -0.2935935  -0.72729474 ... -0.10259533  0.09559726
   -0.27696082]
  [ 0.07809267  0.3788107   0.45179468 ...  0.31190634  0.15828757
    0.17926018]
  [ 0.5804361  -0.18722543  0.3022339  ...  0.6312446   0.5981479
    0.42219177]]]


In [15]:
# paddle
import os
import sys

import paddle

# Make the local PaddleNLP checkout importable. The membership test keeps
# sys.path free of duplicate entries when this cell is re-run.
paddlenlp_repo = '/workspace/fnet_paddle/PaddleNLP'
if paddlenlp_repo not in sys.path:
    sys.path.append(paddlenlp_repo)
import paddlenlp as ppnlp

# Directory the tokenizer is loaded from. It has its own name so that
# `torch_large_model_path` (the .bin checkpoint path set by the conversion
# cell) is not silently overwritten with a directory path.
paddle_tokenizer_dir = 'pretrained_model/torch/large'
paddle_tokenizer = ppnlp.transformers.FNetTokenizer.from_pretrained(paddle_tokenizer_dir)

# Large configuration: the sizes are passed explicitly instead of relying on
# the constructor defaults, then the converted large weights are loaded.
paddle_model = ppnlp.transformers.FNetModel(hidden_size=1024, num_hidden_layers=24, intermediate_size=4096)
param_dict = paddle.load(paddle_large_model_path)
paddle_model.load_dict(param_dict)

In [16]:
# Forward pass of the converted Paddle model on the same `text`.
paddle_model.eval()
paddle_encoding = paddle_tokenizer(text)
# Wrap each field in a list to add a leading batch dimension of 1.
paddle_inputs = {k: paddle.to_tensor([v]) for (k, v) in paddle_encoding.items()}

# Only the output values are needed, so skip gradient tracking.
with paddle.no_grad():
    paddle_outputs = paddle_model(**paddle_inputs)

# First element of the model output, converted to numpy for the diff.
paddle_logits = paddle_outputs[0]
paddle_array = paddle_logits.numpy()
print("paddle_prediction_logits shape:{}".format(paddle_array.shape))
print("paddle_prediction_logits:{}".format(paddle_array))

paddle_prediction_logits shape:(1, 12, 1024)
paddle_prediction_logits:[[[-0.11672818 -0.44755128 -0.51742613 ... -0.19497725 -0.4228959
   -0.68803114]
  [-0.05620277  0.16907008  0.73102087 ... -0.23174654 -0.02900139
    0.42496818]
  [-0.09160376  0.23406659  0.12481955 ... -0.39578077  0.26496872
    0.27206653]
  ...
  [ 0.2899852  -0.2933667  -0.72708374 ... -0.10251513  0.09553225
   -0.27669966]
  [ 0.07795223  0.37862352  0.4516726  ...  0.31222463  0.15768367
    0.17931624]
  [ 0.5808299  -0.18676645  0.302684   ...  0.6309175   0.5978787
    0.42150334]]]


In [17]:
import numpy as np

# The element-wise comparison only makes sense when both outputs have the same shape.
shapes_match = torch_array.shape == paddle_array.shape
assert shapes_match, (
    "the output logits should have the same shape, but got : {} and {} instead"
    .format(torch_array.shape, paddle_array.shape)
)

# Largest absolute element-wise deviation between the two frameworks.
diff = torch_array - paddle_array
max_abs_diff = np.abs(diff).max()
print(max_abs_diff)

0.0016028583


In [20]:
# save pretrained model
# Write the Paddle model and its tokenizer into the same output directory.
# NOTE: despite its name, `base_model` holds the *large* output directory here.
base_model = 'pretrained_model/paddle/large/'
paddle_model.save_pretrained(base_model)
paddle_tokenizer.save_pretrained(base_model)