## 模型参数 torch 转换 paddle

In [6]:
import paddle
import torch
import numpy as np

torch_base_model_path = 'pretrained_model/torch/base/pytorch_model.bin'
torch_large_model_path = 'pretrained_model/torch/large/pytorch_model.bin'

paddle_base_model_path = "pretrained_model/paddle/base/model_state.pdparams"
paddle_large_model_path = "pretrained_model/paddle/large/model_state.pdparams"

# Selects which checkpoint is converted: True -> base paths, False -> large paths.
is_base = True

if is_base:
    torch_model_path = torch_base_model_path
    paddle_model_path = paddle_base_model_path
else:
    torch_model_path = torch_large_model_path
    paddle_model_path = paddle_large_model_path

# map_location="cpu" lets the conversion run on a machine without a GPU even
# when the checkpoint was saved from CUDA tensors.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
torch_state_dict = torch.load(torch_model_path, map_location="cpu")

paddle_state_dict = {}

# State_dict's keys mapping: from torch to paddle
keys_dict = {
    # about encoder layer
    'LayerNorm': 'layer_norm',
    'encoder.layer': 'encoder.layers'
}

# A key containing any of these substrings is treated as a linear-layer
# parameter whose weight matrix has to be transposed.
transpose_markers = (
    'map_fc',
    'glyph_map',
    'linear',
    'proj',
    'dense.weight',
    'transform.weight',
    'seq_relationship.weight',
)


def _needs_transpose(key, ndim):
    """Return True when the parameter stored under `key` must be transposed.

    torch.nn.Linear keeps its weight as [out_features, in_features] while
    paddle.nn.Linear expects [in_features, out_features], so linear weights
    are transposed during conversion.

    Args:
        key: parameter name (after the torch -> paddle key renaming).
        ndim: number of dimensions of the parameter.

    Returns:
        bool: True if the name matches a linear-layer pattern and the tensor
        has at least two dimensions. Tensors with fewer dimensions (e.g. the
        bias `embeddings.projection.bias`, which matches 'proj') are skipped:
        transposing them is a no-op, so only the log message changes.
    """
    if ndim < 2:
        return False
    if 'vocab' in key and 'weight' in key:
        return True
    return any(marker in key for marker in transpose_markers)


for torch_key, torch_tensor in torch_state_dict.items():
    # Rename the key following the torch -> paddle naming table.
    paddle_key = torch_key
    for torch_name, paddle_name in keys_dict.items():
        paddle_key = paddle_key.replace(torch_name, paddle_name)

    array = torch_tensor.cpu().numpy()
    if _needs_transpose(paddle_key, array.ndim):
        print("transpose(permute) ---------->")
        array = array.transpose()
    paddle_state_dict[paddle_key] = paddle.to_tensor(array)

    print("t: ", torch_key,"\t", torch_tensor.shape)
    print("p: ", paddle_key, "\t", paddle_state_dict[paddle_key].shape, "\n")

paddle.save(paddle_state_dict, paddle_model_path)

t:  embeddings.position_ids 	 torch.Size([1, 512])
p:  embeddings.position_ids 	 [1, 512] 

t:  embeddings.word_embeddings.weight 	 torch.Size([32000, 768])
p:  embeddings.word_embeddings.weight 	 [32000, 768] 

t:  embeddings.position_embeddings.weight 	 torch.Size([512, 768])
p:  embeddings.position_embeddings.weight 	 [512, 768] 

t:  embeddings.token_type_embeddings.weight 	 torch.Size([4, 768])
p:  embeddings.token_type_embeddings.weight 	 [4, 768] 

t:  embeddings.LayerNorm.weight 	 torch.Size([768])
p:  embeddings.layer_norm.weight 	 [768] 

t:  embeddings.LayerNorm.bias 	 torch.Size([768])
p:  embeddings.layer_norm.bias 	 [768] 

transpose(permute) ---------->
t:  embeddings.projection.weight 	 torch.Size([768, 768])
p:  embeddings.projection.weight 	 [768, 768] 

transpose(permute) ---------->
t:  embeddings.projection.bias 	 torch.Size([768])
p:  embeddings.projection.bias 	 [768] 

t:  encoder.layer.0.fourier.output.LayerNorm.weight 	 torch.Size([768])
p:  encoder.layers.0.f

## 对比前向精度

### Base

In [7]:
# torch
from transformers import FNetTokenizer, FNetModel

# The reference tokenizer and model are both built from the same checkpoint name.
torch_checkpoint_name = "google/fnet-base"
torch_tokenizer = FNetTokenizer.from_pretrained(torch_checkpoint_name)
torch_model = FNetModel.from_pretrained(torch_checkpoint_name)

Some weights of the model checkpoint at google/fnet-base were not used when initializing FNetModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing FNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Sample sentence that is tokenized and fed to both the torch and the paddle model.
text = "Replace me by any text you'd like."

In [9]:
# Re-imported here so this cell also runs when the conversion cell was skipped.
import torch

# Switch the model to evaluation mode before the reference forward pass.
torch_model.eval()
torch_inputs = torch_tokenizer(text, return_tensors="pt")

# Inference only: no_grad avoids recording an autograd graph for the forward pass.
with torch.no_grad():
    torch_outputs = torch_model(**torch_inputs)

# First element of the model output; it is what gets compared against paddle.
torch_logits = torch_outputs[0]
torch_array = torch_logits.cpu().detach().numpy()
print("torch_prediction_logits shape:{}".format(torch_array.shape))
print("torch_prediction_logits:{}".format(torch_array))

torch_prediction_logits shape:(1, 12, 768)
torch_prediction_logits:[[[ 4.4527473  -0.10137583 -0.21348645 ...  0.36847726 -0.23560826
   -0.25296995]
  [ 0.18779421 -0.39948907  0.23660113 ...  0.19996837  0.2783861
    0.27940997]
  [ 0.17300639  0.0606944  -0.47870204 ...  0.05200686 -1.295673
    0.578657  ]
  ...
  [ 0.17106001  0.00485597 -0.06762558 ... -0.3548019  -0.82001925
    0.01953557]
  [ 0.06859297  0.23336133 -0.57087415 ... -0.3022466  -0.58877695
   -0.11472143]
  [ 0.24348916  0.1635376   0.36463982 ...  0.5811312  -0.60914195
   -0.32033262]]]


In [10]:
# paddle
import os, sys
import paddle

# Make the local PaddleNLP checkout importable. Guarded so that re-running the
# cell does not keep appending the same entry to sys.path.
paddlenlp_repo_path = '/workspace/fnet_paddle/PaddleNLP'
if paddlenlp_repo_path not in sys.path:
    sys.path.append(paddlenlp_repo_path)
import paddlenlp as ppnlp

# NOTE(review): this section compares the *base* model, yet the tokenizer is
# loaded from the *large* directory, and this assignment shadows the
# `torch_large_model_path` defined in the conversion cell (there it points to
# the .bin file). Presumably both sizes share one tokenizer - confirm, or
# point this at the base directory.
torch_large_model_path = 'pretrained_model/torch/large'
paddle_tokenizer = ppnlp.transformers.FNetTokenizer.from_pretrained(torch_large_model_path)

# Build the model with its default configuration, then load the converted base weights.
paddle_model = ppnlp.transformers.FNetModel()
params_file_path = "pretrained_model/paddle/base/model_state.pdparams"
param_dict = paddle.load(params_file_path)
paddle_model.load_dict(param_dict)

In [11]:
# Switch the model to evaluation mode before the forward pass.
paddle_model.eval()
paddle_inputs = paddle_tokenizer(text)
# Wrap every tokenizer field in a list to add a batch dimension of size 1.
paddle_inputs = {k:paddle.to_tensor([v]) for (k, v) in paddle_inputs.items()}

# Inference only: no_grad avoids recording gradients for the forward pass,
# mirroring the torch reference run.
with paddle.no_grad():
    paddle_outputs = paddle_model(**paddle_inputs)

# First element of the model output; it is what gets compared against torch.
paddle_logits = paddle_outputs[0]
paddle_array = paddle_logits.numpy()
print("paddle_prediction_logits shape:{}".format(paddle_array.shape))
print("paddle_prediction_logits:{}".format(paddle_array))

paddle_prediction_logits shape:(1, 12, 768)
paddle_prediction_logits:[[[ 4.4526024e+00 -1.0144000e-01 -2.1346147e-01 ...  3.6845985e-01
   -2.3564060e-01 -2.5293329e-01]
  [ 1.8808359e-01 -3.9938882e-01  2.3694393e-01 ...  2.0020518e-01
    2.7856782e-01  2.7908224e-01]
  [ 1.7336315e-01  6.0770631e-02 -4.7835657e-01 ...  5.1935472e-02
   -1.2956736e+00  5.7832038e-01]
  ...
  [ 1.7099635e-01  4.0750708e-03 -6.7396842e-02 ... -3.5490695e-01
   -8.1969118e-01  1.9487010e-02]
  [ 6.7856006e-02  2.3314802e-01 -5.7079792e-01 ... -3.0198133e-01
   -5.8851326e-01 -1.1473138e-01]
  [ 2.4360198e-01  1.6350693e-01  3.6484715e-01 ...  5.8138633e-01
   -6.0873306e-01 -3.2016295e-01]]]


In [12]:
import numpy as np

# The element-wise comparison only makes sense when both outputs have the same shape.
shape_mismatch_msg = "the output logits should have the same shape, but got : {} and {} instead".format(torch_array.shape, paddle_array.shape)
assert torch_array.shape == paddle_array.shape, shape_mismatch_msg

# Report the largest absolute element-wise deviation between the two frameworks.
diff = torch_array - paddle_array
max_abs_diff = np.abs(diff).max()
print(max_abs_diff)

0.0012811422
