In [1]:
import pickle
from transformers import BertJapaneseTokenizer, BertForMaskedLM
import pandas as pd
import json
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import collections
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import DistilBertConfig

In [2]:
max_seq_length = 110
output_file = "data.tfrecord"

In [3]:
with open(".\orignal_data.pkl", "rb") as myprofile:  
    dt = pickle.load(myprofile)

In [4]:
tokenizer = DistilBertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
model = TFDistilBertForSequenceClassification.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', num_labels=2)
#.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', num_labels=2)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some layers from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing TFDistilBertForSequenceClassification: ['nsp___cls', 'mlm___cls', 'bert']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the

In [6]:
train_output_file = "train_data.tfrecord"
test_output_file = "test_data.tfrecord"

In [7]:
def select_data_and_label_from_record(record):
    x = {
        "input_ids": record["input_ids"],
        "input_mask": record["input_mask"],
        # 'segment_ids': record['segment_ids']
    }
    y = record["label_ids"]
    return (x, y)
def _decode_record(record, name_to_features):
    """Decodes a record to a TensorFlow example."""
    return tf.io.parse_single_example(record, name_to_features)

In [8]:
def create_train_test_data(file_name,isTrain = False):
    dataset = tf.data.TFRecordDataset(file_name)
    if isTrain :
        dataset = dataset.repeat(500)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    name_to_features = {
            "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            # "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.io.FixedLenFeature([], tf.int64),
        }
    drop_remainder=False
    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=100,
            drop_remainder=drop_remainder,
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )
    )
    dataset.cache()
    re_dataset = dataset.map(select_data_and_label_from_record)
    return re_dataset

In [9]:
train_dataset = create_train_test_data(train_output_file,True)
test_dataset = create_train_test_data(test_output_file)

Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.


In [10]:
test_dataset

<MapDataset element_spec=({'input_ids': TensorSpec(shape=(None, 110), dtype=tf.int64, name=None), 'input_mask': TensorSpec(shape=(None, 110), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [11]:
learning_rate = 3e-3
epsilon = 1e-05

In [39]:
class MyModel(tf.keras.Model):
    def __init__(self,model):
        super().__init__()
        self.backup = {}
        #self.model = model.distilbert(input_ids,input_mask)[0]
        self.model = model
        self.model.trainable = False
        self.layersG = tf.keras.layers.GlobalMaxPool1D()
        self.layers1 = tf.keras.layers.Dense(50, activation="relu")
        self.layersD2 = tf.keras.layers.Dropout(0.2)
        self.layers3 = tf.keras.layers.Dense(10, activation="relu")
        self.layersD4 = tf.keras.layers.Dropout(0.2)
        self.layers5 = tf.keras.layers.Dense(2, activation="softmax")
        self.Bidirectional=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(60, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
    def call(self, inputs):
        input_ids = inputs["input_ids"]
        input_mask = inputs["input_mask"]
        embedding_layer  = self.model.distilbert(input_ids,input_mask)[0]
        X =self.Bidirectional(embedding_layer)
        X = self.layersG(X)
        X = self.layers1(X)
        X = self.layersD2(X)
        X = self.layers3(X)
        X = self.layersD4(X)
        X = self.layers5(X)
        return X
    def train_step(self, data):
        if len(data) == 3:
            x, y, sample_weight = data
        else:
            sample_weight = None
            x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value.
            # The loss function is configured in `compile()`.
            loss = self.compiled_loss(
                y,
                y_pred,
                sample_weight=sample_weight,
                regularization_losses=self.losses,
            )
        #Tensorflow2.0 自定义模型训练实现NLP中的FGM对抗训练 代码实现
        # Compute embedding gradients
        embedding = self.trainable_variables[0]
        embedding_gradients = tape.gradient(loss, embedding)[0]
        embedding_gradients = tf.zeros_like(embedding) + embedding_gradients
        delta = 0.01*embedding_gradients/(tf.math.sqrt(tf.reduce_sum(embedding_gradients**2))+1e-8)
        self.trainable_variables[0].assign_add(delta)
        
        with tf.GradientTape() as tape2:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value.
            # The loss function is configured in `compile()`.
            loss2 = self.compiled_loss(
                y,
                y_pred,
                sample_weight=sample_weight,
                regularization_losses=self.losses,
            )
            
        gradients = tape2.gradient(loss2, self.trainable_variables)
        self.trainable_variables[0].assign_sub(delta)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        # Update the metrics.
        # Metrics are configured in `compile()`.
        self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {m.name: m.result() for m in self.metrics}
        
mode2 = MyModel(model)
loss = tf.keras.losses.SparseCategoricalCrossentropy()
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)
mode2.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
history = mode2.fit(
    train_dataset,
    steps_per_epoch = 20,
    validation_data=test_dataset,
    validation_steps=5,
    #shuffle=True,
    epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50