In [None]:
import os

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, XLNetTokenizer, TFXLNetModel

### 1. load model

In [None]:
# Troubleshooting: a "Blas GEMM launch failed" error raised while loading the model
# was caused by insufficient GPU memory; free it by killing the process that holds
# the GPU (kill -9 <pid>).
# BERT alternatives, kept for switching models:
# model = TFBertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# model = TFBertModel.from_pretrained('bert-base-uncased')
model = TFXLNetModel.from_pretrained('xlnet-base-cased')

### 2. load tokenizer

In [None]:
# Load the tokenizer for the same 'xlnet-base-cased' checkpoint name used for the model.
# BERT alternative, kept for switching models:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [None]:
# Persist the tokenizer files to a local directory.
# tokenizer_dir = 'save_model/bert_base_uncased_tokenizer/'
tokenizer_dir = 'save_model/xlnet_base_cased_tokenizer/'
# Create the target directory first (no-op when it already exists); some
# transformers releases expect it to be present and do not create it themselves.
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save_pretrained(tokenizer_dir)

### 3. convert text to ids

In [None]:
# Convert one sample sentence into token ids, asking the tokenizer to add its special tokens.
tokens_ids = tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)
tokens_ids

In [None]:
# Preview the ids as a TensorFlow tensor; the batch dimension is added in the next step.
tf.constant(tokens_ids)

In [None]:
# Build the model input: indexing with [None, :] prepends a batch axis (batch size 1).
# One tf.constant call suffices; re-wrapping an existing tensor adds nothing.
input_ids = tf.constant(tokens_ids)[None, :]
input_ids

### 4. run model

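[Model Returns](https://huggingface.co/transformers/model_doc/bert.html#tfbertmodel) — note: this reference describes `TFBertModel`; the `TFXLNetModel` loaded in section 1 does not return `pooler_output`, so check the XLNet model documentation for its exact outputs.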

Returns a `tuple` comprising the following elements:

**last_hidden_state (tf.Tensor of shape (batch_size, sequence_length, hidden_size)):**

Sequence of hidden-states at the output of the last layer of the model.

**pooler_output (tf.Tensor of shape (batch_size, hidden_size)):**

Last layer hidden-state of the first token of the sequence (classification token), further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) objective during BERT pretraining. This output is usually not a good summary of the semantic content of the input; you are often better off averaging or pooling the sequence of hidden-states for the whole input sequence.

**hidden_states (tuple(tf.Tensor), optional, returned when config.output_hidden_states=True):**

tuple of tf.Tensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size).

Hidden-states of the model at the output of each layer plus the initial embedding outputs.

**attentions (tuple(tf.Tensor), optional, returned when config.output_attentions=True):**

tuple of tf.Tensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length):

Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

In [None]:
# Run the model on the batched ids and keep the first element of what it returns.
# outputs[0] is the last hidden state: (batch_size, sequence_length, hidden_size).
# NOTE(review): the earlier note also listed pooling_output = (batch_size, embedding_dim);
# that is the BERT pooler output and is presumably absent for TFXLNetModel — confirm.
outputs = model(input_ids)
print(f'outputs type:{type(outputs)}\noutputs len: {len(outputs)}')
last_hidden_state = outputs[0]
last_hidden_state

In [None]:
# Column 0: hidden feature 0 for every token of the first sequence in the batch.
# Its max and mean are printed for comparison with the pooled values computed later.
col_0 = last_hidden_state[0, :, 0]
print(np.max(col_0), np.mean(col_0))

In [None]:
# Build the two global pooling layers, then apply each to the hidden states so the
# sequence axis is reduced to a single vector per sample (max and mean variants).
max_pool_layer = tf.keras.layers.GlobalMaxPooling1D()
mean_pool_layer = tf.keras.layers.GlobalAveragePooling1D()
max_pool = max_pool_layer(last_hidden_state)
mean_pool = mean_pool_layer(last_hidden_state)
# Show feature 0 of each pooled vector for the first sample.
print(max_pool[0][0], mean_pool[0][0])

In [None]:
# Join the max-pooled and mean-pooled vectors into one feature tensor and display it.
x = tf.keras.layers.Concatenate()([max_pool, mean_pool])
x

In [None]:
# Pass the concatenated features through a Dropout layer with rate 0.2 and display the result.
# NOTE(review): no training=True is passed, so the layer presumably runs in inference
# mode and returns x unchanged — confirm whether that is intended.
tf.keras.layers.Dropout(0.2)(x)

### 5. save model

In [None]:
# Persist the model to a local directory.
# model_dir = 'save_model/bert_base_uncased_tf2_model/'
model_dir = 'save_model/xlnet_base_cased_tf2_model/'
# Create the target directory first (no-op when it already exists); some
# transformers releases expect it to be present and do not create it themselves.
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)